diff options
Diffstat (limited to 'arch/x86/kernel')
188 files changed, 11904 insertions, 18484 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 74077694da7d..0d2a6d953be9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the linux kernel. # -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) @@ -17,7 +17,6 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg CFLAGS_REMOVE_head32.o = -pg -CFLAGS_REMOVE_sev.o = -pg CFLAGS_REMOVE_rethook.o = -pg endif @@ -26,21 +25,28 @@ KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n KASAN_SANITIZE_stacktrace.o := n KASAN_SANITIZE_paravirt.o := n -KASAN_SANITIZE_sev.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. KCSAN_SANITIZE := n KMSAN_SANITIZE_head$(BITS).o := n KMSAN_SANITIZE_nmi.o := n -KMSAN_SANITIZE_sev.o := n # If instrumentation of the following files is enabled, boot hangs during # first second. KCOV_INSTRUMENT_head$(BITS).o := n -KCOV_INSTRUMENT_sev.o := n - -CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace +# These are called from save_stack_trace() on debug paths, +# and produce large amounts of uninteresting coverage. +KCOV_INSTRUMENT_stacktrace.o := n +KCOV_INSTRUMENT_dumpstack.o := n +KCOV_INSTRUMENT_dumpstack_$(BITS).o := n +KCOV_INSTRUMENT_unwind_orc.o := n +KCOV_INSTRUMENT_unwind_frame.o := n +KCOV_INSTRUMENT_unwind_guess.o := n + +CFLAGS_head32.o := -fno-stack-protector +CFLAGS_head64.o := -fno-stack-protector +CFLAGS_irq.o := -I $(src)/../include/asm/trace obj-y += head_$(BITS).o obj-y += head$(BITS).o @@ -62,7 +68,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o -obj-y += pci-dma.o quirks.o topology.o kdebugfs.o +obj-y += pci-dma.o quirks.o kdebugfs.o obj-y += alternative.o i8253.o hw_breakpoint.o obj-y += tsc.o tsc_msr.o io_delay.o rtc.o obj-y += resource.o @@ -115,6 +121,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_AMD_NB) += amd_nb.o +obj-$(CONFIG_AMD_NODE) += amd_node.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o @@ -134,7 +141,6 @@ obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o -obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o obj-$(CONFIG_X86_UMIP) += umip.o @@ -142,8 +148,6 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o -obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o - obj-$(CONFIG_CFI_CLANG) += cfi.o obj-$(CONFIG_CALL_THUNKS) += callthunks.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index fc17b3f136fe..842a5f449404 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o obj-$(CONFIG_ACPI_APEI) += apei.o obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o +obj-$(CONFIG_ACPI_MADT_WAKEUP) += madt_wakeup.o madt_playdead.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4bf82dbd2a6b..9fa321a95eb3 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -23,6 +23,8 @@ #include <linux/serial_core.h> #include <linux/pgtable.h> +#include <xen/xen.h> + #include <asm/e820/api.h> #include <asm/irqdomain.h> #include <asm/pci_x86.h> @@ -67,13 +69,6 @@ static bool has_lapic_cpus __initdata; static bool acpi_support_online_capable; #endif -#ifdef CONFIG_X86_64 -/* Physical address of the Multiprocessor Wakeup Structure mailbox */ -static u64 acpi_mp_wake_mailbox_paddr; -/* Virtual address of the Multiprocessor Wakeup Structure mailbox */ -static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; -#endif - #ifdef CONFIG_X86_IO_APIC /* * Locks related to IOAPIC hotplug @@ -234,6 +229,28 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) } static int __init +acpi_check_lapic(union acpi_subtable_headers *header, const unsigned long end) +{ + struct acpi_madt_local_apic *processor = NULL; + + processor = (struct acpi_madt_local_apic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + /* Ignore invalid ID */ + if (processor->id == 0xff) + return 0; + + /* Ignore processors that can not be onlined */ + if (!acpi_is_processor_usable(processor->lapic_flags)) + return 0; + + has_lapic_cpus = true; + return 0; +} + +static int __init acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) { struct acpi_madt_local_apic *processor = NULL; @@ -264,7 +281,6 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) processor->processor_id, /* ACPI ID */ processor->lapic_flags & ACPI_MADT_ENABLED); - has_lapic_cpus = true; return 0; } @@ -341,60 +357,6 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e return 0; } - -#ifdef CONFIG_X86_64 -static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip) -{ - /* - * Remap mailbox memory only for the first call to acpi_wakeup_cpu(). - * - * Wakeup of secondary CPUs is fully serialized in the core code. - * No need to protect acpi_mp_wake_mailbox from concurrent accesses. - */ - if (!acpi_mp_wake_mailbox) { - acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr, - sizeof(*acpi_mp_wake_mailbox), - MEMREMAP_WB); - } - - /* - * Mailbox memory is shared between the firmware and OS. Firmware will - * listen on mailbox command address, and once it receives the wakeup - * command, the CPU associated with the given apicid will be booted. - * - * The value of 'apic_id' and 'wakeup_vector' must be visible to the - * firmware before the wakeup command is visible. smp_store_release() - * ensures ordering and visibility. - */ - acpi_mp_wake_mailbox->apic_id = apicid; - acpi_mp_wake_mailbox->wakeup_vector = start_ip; - smp_store_release(&acpi_mp_wake_mailbox->command, - ACPI_MP_WAKE_COMMAND_WAKEUP); - - /* - * Wait for the CPU to wake up. - * - * The CPU being woken up is essentially in a spin loop waiting to be - * woken up. It should not take long for it wake up and acknowledge by - * zeroing out ->command. - * - * ACPI specification doesn't provide any guidance on how long kernel - * has to wait for a wake up acknowledgement. It also doesn't provide - * a way to cancel a wake up request if it takes too long. - * - * In TDX environment, the VMM has control over how long it takes to - * wake up secondary. It can postpone scheduling secondary vCPU - * indefinitely. Giving up on wake up request and reporting error opens - * possible attack vector for VMM: it can wake up a secondary CPU when - * kernel doesn't expect it. Wait until positive result of the wake up - * request. - */ - while (READ_ONCE(acpi_mp_wake_mailbox->command)) - cpu_relax(); - - return 0; -} -#endif /* CONFIG_X86_64 */ #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC @@ -972,11 +934,8 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) * the resource tree during the lateinit timeframe. */ #define HPET_RESOURCE_NAME_SIZE 9 - hpet_res = memblock_alloc(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, + hpet_res = memblock_alloc_or_panic(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, SMP_CACHE_BYTES); - if (!hpet_res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); hpet_res->name = (void *)&hpet_res[1]; hpet_res->flags = IORESOURCE_MEM; @@ -1090,6 +1049,8 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) static int __init acpi_parse_madt_lapic_entries(void) { int count, x2count = 0; + struct acpi_subtable_proc madt_proc[2]; + int ret; if (!boot_cpu_has(X86_FEATURE_APIC)) return -ENODEV; @@ -1098,10 +1059,27 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { - count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_LOCAL_APIC); - x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_LOCAL_APIC); + /* Check if there are valid LAPIC entries */ + acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_check_lapic, MAX_LOCAL_APIC); + + /* + * Enumerate the APIC IDs in the order that they appear in the + * MADT, no matter LAPIC entry or x2APIC entry is used. + */ + memset(madt_proc, 0, sizeof(madt_proc)); + madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC; + madt_proc[0].handler = acpi_parse_lapic; + madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC; + madt_proc[1].handler = acpi_parse_x2apic; + ret = acpi_table_parse_entries_array(ACPI_SIG_MADT, + sizeof(struct acpi_table_madt), + madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC); + if (ret < 0) { + pr_err("Error parsing LAPIC/X2APIC entries\n"); + return ret; + } + count = madt_proc[0].count; + x2count = madt_proc[1].count; } if (!count && !x2count) { pr_err("No LAPIC entries present\n"); @@ -1124,29 +1102,6 @@ static int __init acpi_parse_madt_lapic_entries(void) } return 0; } - -#ifdef CONFIG_X86_64 -static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, - const unsigned long end) -{ - struct acpi_madt_multiproc_wakeup *mp_wake; - - if (!IS_ENABLED(CONFIG_SMP)) - return -ENODEV; - - mp_wake = (struct acpi_madt_multiproc_wakeup *)header; - if (BAD_MADT_ENTRY(mp_wake, end)) - return -EINVAL; - - acpi_table_print_madt_entry(&header->common); - - acpi_mp_wake_mailbox_paddr = mp_wake->base_address; - - apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu); - - return 0; -} -#endif /* CONFIG_X86_64 */ #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC @@ -1255,7 +1210,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) } count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, - acpi_parse_int_src_ovr, nr_irqs); + acpi_parse_int_src_ovr, + irq_get_nr_irqs()); if (count < 0) { pr_err("Error parsing interrupt source overrides entry\n"); /* TBD: Cleanup to allow fallback to MPS */ @@ -1275,7 +1231,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) mp_config_acpi_legacy_irqs(); count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, - acpi_parse_nmi_src, nr_irqs); + acpi_parse_nmi_src, + irq_get_nr_irqs()); if (count < 0) { pr_err("Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ @@ -1343,7 +1300,7 @@ static void __init acpi_process_madt(void) smp_found_config = 1; } -#ifdef CONFIG_X86_64 +#ifdef CONFIG_ACPI_MADT_WAKEUP /* * Parse MADT MP Wake entry. */ @@ -1774,6 +1731,15 @@ int __init acpi_mps_check(void) { #if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) /* mptable code is not built-in*/ + + /* + * Xen disables ACPI in PV DomU guests but it still emulates APIC and + * supports SMP. Returning early here ensures that APIC is not disabled + * unnecessarily and the guest is not limited to a single vCPU. + */ + if (xen_pv_domain() && !xen_initial_domain()) + return 0; + if (acpi_disabled || acpi_noirq) { pr_warn("MPS support code is not built-in, using acpi=off or acpi=noirq or pci=noacpi may have problem\n"); return 1; @@ -1862,3 +1828,14 @@ u64 x86_default_get_root_pointer(void) { return boot_params.acpi_rsdp_addr; } + +#ifdef CONFIG_XEN_PV +void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size) +{ + return ioremap_cache(phys, size); +} + +void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, acpi_size size) = + x86_acpi_os_ioremap; +EXPORT_SYMBOL_GPL(acpi_os_ioremap); +#endif diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index ff8f25faca3d..7047124490f6 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -4,11 +4,24 @@ * Copyright (c) 2016, Intel Corporation. */ +#include <linux/bitfield.h> + #include <acpi/cppc_acpi.h> #include <asm/msr.h> #include <asm/processor.h> #include <asm/topology.h> +#define CPPC_HIGHEST_PERF_PERFORMANCE 196 +#define CPPC_HIGHEST_PERF_PREFCORE 166 + +enum amd_pref_core { + AMD_PREF_CORE_UNKNOWN = 0, + AMD_PREF_CORE_SUPPORTED, + AMD_PREF_CORE_UNSUPPORTED, +}; +static enum amd_pref_core amd_pref_core_detected; +static u64 boost_numerator; + /* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ bool cpc_supported_by_cpu(void) @@ -36,7 +49,7 @@ int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) { int err; - err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); + err = rdmsrq_safe_on_cpu(cpunum, reg->address, val); if (!err) { u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, reg->bit_offset); @@ -52,7 +65,7 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) u64 rd_val; int err; - err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); + err = rdmsrq_safe_on_cpu(cpunum, reg->address, &rd_val); if (!err) { u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, reg->bit_offset); @@ -61,7 +74,7 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) val &= mask; rd_val &= ~mask; rd_val |= val; - err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); + err = wrmsrq_safe_on_cpu(cpunum, reg->address, rd_val); } return err; } @@ -69,38 +82,37 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) static void amd_set_max_freq_ratio(void) { struct cppc_perf_caps perf_caps; - u64 highest_perf, nominal_perf; + u64 numerator, nominal_perf; u64 perf_ratio; int rc; rc = cppc_get_perf_caps(0, &perf_caps); if (rc) { - pr_debug("Could not retrieve perf counters (%d)\n", rc); + pr_warn("Could not retrieve perf counters (%d)\n", rc); return; } - highest_perf = amd_get_highest_perf(); + rc = amd_get_boost_ratio_numerator(0, &numerator); + if (rc) { + pr_warn("Could not retrieve highest performance (%d)\n", rc); + return; + } nominal_perf = perf_caps.nominal_perf; - if (!highest_perf || !nominal_perf) { - pr_debug("Could not retrieve highest or nominal performance\n"); + if (!nominal_perf) { + pr_warn("Could not retrieve nominal performance\n"); return; } - perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); /* midpoint between max_boost and max_P */ - perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; - if (!perf_ratio) { - pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); - return; - } + perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1; freq_invariance_set_perf_ratio(perf_ratio, false); } static DEFINE_MUTEX(freq_invariance_lock); -void init_freq_invariance_cppc(void) +static inline void init_freq_invariance_cppc(void) { static bool init_done; @@ -116,3 +128,171 @@ void init_freq_invariance_cppc(void) init_done = true; mutex_unlock(&freq_invariance_lock); } + +void acpi_processor_init_invariance_cppc(void) +{ + init_freq_invariance_cppc(); +} + +/* + * Get the highest performance register value. + * @cpu: CPU from which to get highest performance. + * @highest_perf: Return address for highest performance value. + * + * Return: 0 for success, negative error code otherwise. + */ +int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) +{ + u64 val; + int ret; + + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + ret = rdmsrq_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val); + if (ret) + goto out; + + val = FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, val); + } else { + ret = cppc_get_highest_perf(cpu, &val); + if (ret) + goto out; + } + + WRITE_ONCE(*highest_perf, (u32)val); +out: + return ret; +} +EXPORT_SYMBOL_GPL(amd_get_highest_perf); + +/** + * amd_detect_prefcore: Detect if CPUs in the system support preferred cores + * @detected: Output variable for the result of the detection. + * + * Determine whether CPUs in the system support preferred cores. On systems + * that support preferred cores, different highest perf values will be found + * on different cores. On other systems, the highest perf value will be the + * same on all cores. + * + * The result of the detection will be stored in the 'detected' parameter. + * + * Return: 0 for success, negative error code otherwise + */ +int amd_detect_prefcore(bool *detected) +{ + int cpu, count = 0; + u64 highest_perf[2] = {0}; + + if (WARN_ON(!detected)) + return -EINVAL; + + switch (amd_pref_core_detected) { + case AMD_PREF_CORE_SUPPORTED: + *detected = true; + return 0; + case AMD_PREF_CORE_UNSUPPORTED: + *detected = false; + return 0; + default: + break; + } + + for_each_present_cpu(cpu) { + u32 tmp; + int ret; + + ret = amd_get_highest_perf(cpu, &tmp); + if (ret) + return ret; + + if (!count || (count == 1 && tmp != highest_perf[0])) + highest_perf[count++] = tmp; + + if (count == 2) + break; + } + + *detected = (count == 2); + boost_numerator = highest_perf[0]; + + amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED : + AMD_PREF_CORE_UNSUPPORTED; + + pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n", + *detected ? "" : "un", highest_perf[0]); + + return 0; +} +EXPORT_SYMBOL_GPL(amd_detect_prefcore); + +/** + * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation + * @cpu: CPU to get numerator for. + * @numerator: Output variable for numerator. + * + * Determine the numerator to use for calculating the boost ratio on + * a CPU. On systems that support preferred cores, this will be a hardcoded + * value. On other systems this will the highest performance register value. + * + * If booting the system with amd-pstate enabled but preferred cores disabled then + * the correct boost numerator will be returned to match hardware capabilities + * even if the preferred cores scheduling hints are not enabled. + * + * Return: 0 for success, negative error code otherwise. + */ +int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) +{ + enum x86_topology_cpu_type core_type = get_topology_cpu_type(&cpu_data(cpu)); + bool prefcore; + int ret; + u32 tmp; + + ret = amd_detect_prefcore(&prefcore); + if (ret) + return ret; + + /* without preferred cores, return the highest perf register value */ + if (!prefcore) { + *numerator = boost_numerator; + return 0; + } + + /* + * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f, + * the highest performance level is set to 196. + * https://bugzilla.kernel.org/show_bug.cgi?id=218759 + */ + if (cpu_feature_enabled(X86_FEATURE_ZEN4)) { + switch (boot_cpu_data.x86_model) { + case 0x70 ... 0x7f: + *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; + return 0; + default: + break; + } + } + + /* detect if running on heterogeneous design */ + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) { + switch (core_type) { + case TOPO_CPU_TYPE_UNKNOWN: + pr_warn("Undefined core type found for cpu %d\n", cpu); + break; + case TOPO_CPU_TYPE_PERFORMANCE: + /* use the max scale for performance cores */ + *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; + return 0; + case TOPO_CPU_TYPE_EFFICIENCY: + /* use the highest perf value for efficiency cores */ + ret = amd_get_highest_perf(cpu, &tmp); + if (ret) + return ret; + *numerator = tmp; + return 0; + } + } + + *numerator = CPPC_HIGHEST_PERF_PREFCORE; + + return 0; +} +EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index f3ffd0a3a012..8698d66563ed 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -13,8 +13,11 @@ #include <linux/sched.h> #include <acpi/processor.h> +#include <asm/cpu_device_id.h> +#include <asm/cpuid/api.h> #include <asm/mwait.h> #include <asm/special_insns.h> +#include <asm/smp.h> /* * Initialize bm_flags based on the CPU cache properties @@ -46,12 +49,11 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, /* * On all recent Intel platforms, ARB_DISABLE is a nop. * So, set bm_control to zero to indicate that ARB_DISABLE - * is not required while entering C3 type state on - * P4, Core and beyond CPUs + * is not required while entering C3 type state. */ if (c->x86_vendor == X86_VENDOR_INTEL && - (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) - flags->bm_control = 0; + (c->x86 > 15 || (c->x86_vfm >= INTEL_CORE2_MEROM && c->x86_vfm <= INTEL_FAM6_LAST))) + flags->bm_control = 0; if (c->x86_vendor == X86_VENDOR_CENTAUR) { if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f && @@ -128,7 +130,7 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) unsigned int cstate_type; /* C-state type and not ACPI C-state type */ unsigned int num_cstate_subtype; - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx); /* Check whether this particular cx_type (in CST) is supported or not */ cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) & @@ -172,7 +174,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct cpuinfo_x86 *c = &cpu_data(cpu); long retval; - if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF) + if (!cpu_cstate_entry || c->cpuid_level < CPUID_LEAF_MWAIT) return -1; if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT) @@ -204,6 +206,16 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); +void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) +{ + unsigned int cpu = smp_processor_id(); + struct cstate_entry *percpu_entry; + + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); + mwait_play_dead(percpu_entry->states[cx->index].eax); +} +EXPORT_SYMBOL_GPL(acpi_processor_ffh_play_dead); + void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S new file mode 100644 index 000000000000..aefb9cb583ad --- /dev/null +++ b/arch/x86/kernel/acpi/madt_playdead.S @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/linkage.h> +#include <asm/nospec-branch.h> +#include <asm/page_types.h> +#include <asm/processor-flags.h> + + .text + .align PAGE_SIZE + +/* + * asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS + * + * rdi: Address of the ACPI MADT MPWK ResetVector + * rsi: PGD of the identity mapping + */ +SYM_FUNC_START(asm_acpi_mp_play_dead) + ANNOTATE_NOENDBR + /* Turn off global entries. Following CR3 write will flush them. */ + movq %cr4, %rdx + andq $~(X86_CR4_PGE), %rdx + movq %rdx, %cr4 + + /* Switch to identity mapping */ + movq %rsi, %cr3 + + /* Jump to reset vector */ + ANNOTATE_RETPOLINE_SAFE + jmp *%rdi +SYM_FUNC_END(asm_acpi_mp_play_dead) diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c new file mode 100644 index 000000000000..6d7603511f52 --- /dev/null +++ b/arch/x86/kernel/acpi/madt_wakeup.c @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/acpi.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/io.h> +#include <linux/kexec.h> +#include <linux/memblock.h> +#include <linux/pgtable.h> +#include <linux/sched/hotplug.h> +#include <asm/apic.h> +#include <asm/barrier.h> +#include <asm/init.h> +#include <asm/intel_pt.h> +#include <asm/nmi.h> +#include <asm/processor.h> +#include <asm/reboot.h> + +/* Physical address of the Multiprocessor Wakeup Structure mailbox */ +static u64 acpi_mp_wake_mailbox_paddr __ro_after_init; + +/* Virtual address of the Multiprocessor Wakeup Structure mailbox */ +static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox; + +static u64 acpi_mp_pgd __ro_after_init; +static u64 acpi_mp_reset_vector_paddr __ro_after_init; + +static void acpi_mp_stop_this_cpu(void) +{ + asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd); +} + +static void acpi_mp_play_dead(void) +{ + play_dead_common(); + asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd); +} + +static void acpi_mp_cpu_die(unsigned int cpu) +{ + u32 apicid = per_cpu(x86_cpu_to_apicid, cpu); + unsigned long timeout; + + /* + * Use TEST mailbox command to prove that BIOS got control over + * the CPU before declaring it dead. + * + * BIOS has to clear 'command' field of the mailbox. + */ + acpi_mp_wake_mailbox->apic_id = apicid; + smp_store_release(&acpi_mp_wake_mailbox->command, + ACPI_MP_WAKE_COMMAND_TEST); + + /* Don't wait longer than a second. */ + timeout = USEC_PER_SEC; + while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout) + udelay(1); + + if (!timeout) + pr_err("Failed to hand over CPU %d to BIOS\n", cpu); +} + +/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */ +static void __init *alloc_pgt_page(void *dummy) +{ + return memblock_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static void __init free_pgt_page(void *pgt, void *dummy) +{ + return memblock_free(pgt, PAGE_SIZE); +} + +static int __init acpi_mp_setup_reset(u64 reset_vector) +{ + struct x86_mapping_info info = { + .alloc_pgt_page = alloc_pgt_page, + .free_pgt_page = free_pgt_page, + .page_flag = __PAGE_KERNEL_LARGE_EXEC, + .kernpg_flag = _KERNPG_TABLE_NOENC, + }; + unsigned long mstart, mend; + pgd_t *pgd; + + pgd = alloc_pgt_page(NULL); + if (!pgd) + return -ENOMEM; + + for (int i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) { + kernel_ident_mapping_free(&info, pgd); + return -ENOMEM; + } + } + + mstart = PAGE_ALIGN_DOWN(reset_vector); + mend = mstart + PAGE_SIZE; + if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) { + kernel_ident_mapping_free(&info, pgd); + return -ENOMEM; + } + + /* + * Make sure asm_acpi_mp_play_dead() is present in the identity mapping + * at the same place as in the kernel page tables. + * asm_acpi_mp_play_dead() switches to the identity mapping and the + * function must be present at the same spot in the virtual address space + * before and after switching page tables. + */ + info.offset = __START_KERNEL_map - phys_base; + mstart = PAGE_ALIGN_DOWN(__pa(asm_acpi_mp_play_dead)); + mend = mstart + PAGE_SIZE; + if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) { + kernel_ident_mapping_free(&info, pgd); + return -ENOMEM; + } + + smp_ops.play_dead = acpi_mp_play_dead; + smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu; + smp_ops.cpu_die = acpi_mp_cpu_die; + + acpi_mp_reset_vector_paddr = reset_vector; + acpi_mp_pgd = __pa(pgd); + + return 0; +} + +static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip, unsigned int cpu) +{ + if (!acpi_mp_wake_mailbox_paddr) { + pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n"); + return -EOPNOTSUPP; + } + + /* + * Remap mailbox memory only for the first call to acpi_wakeup_cpu(). + * + * Wakeup of secondary CPUs is fully serialized in the core code. + * No need to protect acpi_mp_wake_mailbox from concurrent accesses. + */ + if (!acpi_mp_wake_mailbox) { + acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr, + sizeof(*acpi_mp_wake_mailbox), + MEMREMAP_WB); + } + + /* + * Mailbox memory is shared between the firmware and OS. Firmware will + * listen on mailbox command address, and once it receives the wakeup + * command, the CPU associated with the given apicid will be booted. + * + * The value of 'apic_id' and 'wakeup_vector' must be visible to the + * firmware before the wakeup command is visible. smp_store_release() + * ensures ordering and visibility. + */ + acpi_mp_wake_mailbox->apic_id = apicid; + acpi_mp_wake_mailbox->wakeup_vector = start_ip; + smp_store_release(&acpi_mp_wake_mailbox->command, + ACPI_MP_WAKE_COMMAND_WAKEUP); + + /* + * Wait for the CPU to wake up. + * + * The CPU being woken up is essentially in a spin loop waiting to be + * woken up. It should not take long for it wake up and acknowledge by + * zeroing out ->command. + * + * ACPI specification doesn't provide any guidance on how long kernel + * has to wait for a wake up acknowledgment. It also doesn't provide + * a way to cancel a wake up request if it takes too long. + * + * In TDX environment, the VMM has control over how long it takes to + * wake up secondary. It can postpone scheduling secondary vCPU + * indefinitely. Giving up on wake up request and reporting error opens + * possible attack vector for VMM: it can wake up a secondary CPU when + * kernel doesn't expect it. Wait until positive result of the wake up + * request. + */ + while (READ_ONCE(acpi_mp_wake_mailbox->command)) + cpu_relax(); + + return 0; +} + +static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake) +{ + cpu_hotplug_disable_offlining(); + + /* + * ACPI MADT doesn't allow to offline a CPU after it was onlined. This + * limits kexec: the second kernel won't be able to use more than one CPU. + * + * To prevent a kexec kernel from onlining secondary CPUs invalidate the + * mailbox address in the ACPI MADT wakeup structure which prevents a + * kexec kernel to use it. + * + * This is safe as the booting kernel has the mailbox address cached + * already and acpi_wakeup_cpu() uses the cached value to bring up the + * secondary CPUs. + * + * Note: This is a Linux specific convention and not covered by the + * ACPI specification. + */ + mp_wake->mailbox_address = 0; +} + +int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_multiproc_wakeup *mp_wake; + + mp_wake = (struct acpi_madt_multiproc_wakeup *)header; + + /* + * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake + * entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger + * than the actual size of the MP wakeup entry in ACPI table because the + * 'reset_vector' is only available in the V1 MP wakeup structure. + */ + if (!mp_wake) + return -EINVAL; + if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0) + return -EINVAL; + if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address; + + if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 && + mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) { + if (acpi_mp_setup_reset(mp_wake->reset_vector)) { + pr_warn("Failed to setup MADT reset vector\n"); + acpi_mp_disable_offlining(mp_wake); + } + } else { + /* + * CPU offlining requires version 1 of the ACPI MADT wakeup + * structure. + */ + acpi_mp_disable_offlining(mp_wake); + } + + apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu); + + return 0; +} diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 6dfecb27b846..91fa262f0e30 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -16,6 +16,7 @@ #include <asm/cacheflush.h> #include <asm/realmode.h> #include <asm/hypervisor.h> +#include <asm/msr.h> #include <asm/smp.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 94ff83f3d3fe..04f561f75e99 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -17,6 +17,7 @@ * Hooray, we are in Long 64-bit mode (but still running in low memory) */ SYM_FUNC_START(wakeup_long64) + ANNOTATE_NOENDBR movq saved_magic(%rip), %rax movq $0x123456789abcdef0, %rdx cmpq %rdx, %rax @@ -87,6 +88,7 @@ SYM_FUNC_START(do_suspend_lowlevel) .align 4 .Lresume_point: + ANNOTATE_NOENDBR /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax movq saved_context_cr4(%rax), %rbx diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 45a280f2161c..ea1d984166cd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,36 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "SMP alternatives: " fmt -#include <linux/module.h> -#include <linux/sched.h> +#include <linux/mmu_context.h> #include <linux/perf_event.h> -#include <linux/mutex.h> -#include <linux/list.h> -#include <linux/stringify.h> -#include <linux/highmem.h> -#include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> -#include <linux/stop_machine.h> -#include <linux/slab.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> -#include <linux/mmu_context.h> -#include <linux/bsearch.h> -#include <linux/sync_core.h> +#include <linux/execmem.h> + #include <asm/text-patching.h> -#include <asm/alternative.h> -#include <asm/sections.h> -#include <asm/mce.h> -#include <asm/nmi.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> #include <asm/insn.h> -#include <asm/io.h> -#include <asm/fixmap.h> -#include <asm/paravirt.h> -#include <asm/asm-prototypes.h> -#include <asm/cfi.h> +#include <asm/ibt.h> +#include <asm/set_memory.h> +#include <asm/nmi.h> int __read_mostly alternatives_patched; @@ -124,6 +105,214 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = #endif }; +#ifdef CONFIG_FINEIBT +static bool cfi_paranoid __ro_after_init; +#endif + +#ifdef CONFIG_MITIGATION_ITS + +#ifdef CONFIG_MODULES +static struct module *its_mod; +#endif +static void *its_page; +static unsigned int its_offset; +struct its_array its_pages; + +static void *__its_alloc(struct its_array *pages) +{ + void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + if (!page) + return NULL; + + void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *), + GFP_KERNEL); + if (!tmp) + return NULL; + + pages->pages = tmp; + pages->pages[pages->num++] = page; + + return no_free_ptr(page); +} + +/* Initialize a thunk with the "jmp *reg; int3" instructions. */ +static void *its_init_thunk(void *thunk, int reg) +{ + u8 *bytes = thunk; + int offset = 0; + int i = 0; + +#ifdef CONFIG_FINEIBT + if (cfi_paranoid) { + /* + * When ITS uses indirect branch thunk the fineibt_paranoid + * caller sequence doesn't fit in the caller site. So put the + * remaining part of the sequence (<ea> + JNE) into the ITS + * thunk. + */ + bytes[i++] = 0xea; /* invalid instruction */ + bytes[i++] = 0x75; /* JNE */ + bytes[i++] = 0xfd; + + offset = 1; + } +#endif + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + bytes[i++] = 0xff; + bytes[i++] = 0xe0 + reg; /* jmp *reg */ + bytes[i++] = 0xcc; + + return thunk + offset; +} + +static void its_pages_protect(struct its_array *pages) +{ + for (int i = 0; i < pages->num; i++) { + void *page = pages->pages[i]; + execmem_restore_rox(page, PAGE_SIZE); + } +} + +static void its_fini_core(void) +{ + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + its_pages_protect(&its_pages); + kfree(its_pages.pages); +} + +#ifdef CONFIG_MODULES +void its_init_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + mutex_lock(&text_mutex); + its_mod = mod; + its_page = NULL; +} + +void its_fini_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + WARN_ON_ONCE(its_mod != mod); + + its_mod = NULL; + its_page = NULL; + mutex_unlock(&text_mutex); + + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + its_pages_protect(&mod->arch.its_pages); +} + +void its_free_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + for (int i = 0; i < mod->arch.its_pages.num; i++) { + void *page = mod->arch.its_pages.pages[i]; + execmem_free(page); + } + kfree(mod->arch.its_pages.pages); +} +#endif /* CONFIG_MODULES */ + +static void *its_alloc(void) +{ + struct its_array *pages = &its_pages; + void *page; + +#ifdef CONFIG_MODULES + if (its_mod) + pages = &its_mod->arch.its_pages; +#endif + + page = __its_alloc(pages); + if (!page) + return NULL; + + execmem_make_temp_rw(page, PAGE_SIZE); + if (pages == &its_pages) + set_memory_x((unsigned long)page, 1); + + return page; +} + +static void *its_allocate_thunk(int reg) +{ + int size = 3 + (reg / 8); + void *thunk; + +#ifdef CONFIG_FINEIBT + /* + * The ITS thunk contains an indirect jump and an int3 instruction so + * its size is 3 or 4 bytes depending on the register used. If CFI + * paranoid is used then 3 extra bytes are added in the ITS thunk to + * complete the fineibt_paranoid caller sequence. + */ + if (cfi_paranoid) + size += 3; +#endif + + if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) { + its_page = its_alloc(); + if (!its_page) { + pr_err("ITS page allocation failed\n"); + return NULL; + } + memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE); + its_offset = 32; + } + + /* + * If the indirect branch instruction will be in the lower half + * of a cacheline, then update the offset to reach the upper half. + */ + if ((its_offset + size - 1) % 64 < 32) + its_offset = ((its_offset - 1) | 0x3F) + 33; + + thunk = its_page + its_offset; + its_offset += size; + + return its_init_thunk(thunk, reg); +} + +u8 *its_static_thunk(int reg) +{ + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + +#ifdef CONFIG_FINEIBT + /* Paranoid thunk starts 2 bytes before */ + if (cfi_paranoid) + return thunk - 2; +#endif + return thunk; +} + +#else +static inline void its_fini_core(void) {} +#endif /* CONFIG_MITIGATION_ITS */ + +/* + * Nomenclature for variable names to simplify and clarify this code and ease + * any potential staring at it: + * + * @instr: source address of the original instructions in the kernel text as + * generated by the compiler. + * + * @buf: temporary buffer on which the patching operates. This buffer is + * eventually text-poked into the kernel image. + * + * @replacement/@repl: pointer to the opcodes which are replacing @instr, located + * in the .altinstr_replacement section. + */ + /* * Fill the buffer with a single effective instruction of size @len. * @@ -133,37 +322,30 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and * *jump* over instead of executing long and daft NOPs. */ -static void add_nop(u8 *instr, unsigned int len) +static void add_nop(u8 *buf, unsigned int len) { - u8 *target = instr + len; + u8 *target = buf + len; if (!len) return; if (len <= ASM_NOP_MAX) { - memcpy(instr, x86_nops[len], len); + memcpy(buf, x86_nops[len], len); return; } if (len < 128) { - __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE); - instr += JMP8_INSN_SIZE; + __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE); + buf += JMP8_INSN_SIZE; } else { - __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE); - instr += JMP32_INSN_SIZE; + __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE); + buf += JMP32_INSN_SIZE; } - for (;instr < target; instr++) - *instr = INT3_INSN_OPCODE; + for (;buf < target; buf++) + *buf = INT3_INSN_OPCODE; } -extern s32 __retpoline_sites[], __retpoline_sites_end[]; -extern s32 __return_sites[], __return_sites_end[]; -extern s32 __cfi_sites[], __cfi_sites_end[]; -extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; -extern s32 __smp_locks[], __smp_locks_end[]; -void text_poke_early(void *addr, const void *opcode, size_t len); - /* * Matches NOP and NOPL, not any of the other possible NOPs. */ @@ -187,12 +369,12 @@ static bool insn_is_nop(struct insn *insn) * Find the offset of the first non-NOP instruction starting at @offset * but no further than @len. */ -static int skip_nops(u8 *instr, int offset, int len) +static int skip_nops(u8 *buf, int offset, int len) { struct insn insn; for (; offset < len; offset += insn.length) { - if (insn_decode_kernel(&insn, &instr[offset])) + if (insn_decode_kernel(&insn, &buf[offset])) break; if (!insn_is_nop(&insn)) @@ -203,66 +385,32 @@ static int skip_nops(u8 *instr, int offset, int len) } /* - * Optimize a sequence of NOPs, possibly preceded by an unconditional jump - * to the end of the NOP sequence into a single NOP. - */ -static bool -__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target) -{ - int i = *next - insn->length; - - switch (insn->opcode.bytes[0]) { - case JMP8_INSN_OPCODE: - case JMP32_INSN_OPCODE: - *prev = i; - *target = *next + insn->immediate.value; - return false; - } - - if (insn_is_nop(insn)) { - int nop = i; - - *next = skip_nops(instr, *next, len); - if (*target && *next == *target) - nop = *prev; - - add_nop(instr + nop, *next - nop); - DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next); - return true; - } - - *target = 0; - return false; -} - -/* * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ -static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) +static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len) { - int prev, target = 0; - for (int next, i = 0; i < len; i = next) { struct insn insn; - if (insn_decode_kernel(&insn, &instr[i])) + if (insn_decode_kernel(&insn, &buf[i])) return; next = i + insn.length; - __optimize_nops(instr, len, &insn, &next, &prev, &target); - } -} + if (insn_is_nop(&insn)) { + int nop = i; -static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len) -{ - unsigned long flags; + /* Has the NOP already been optimized? */ + if (i + insn.length == len) + return; - local_irq_save(flags); - optimize_nops(instr, len); - sync_core(); - local_irq_restore(flags); + next = skip_nops(buf, next, len); + + add_nop(buf + nop, next - nop); + DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next); + } + } } /* @@ -335,11 +483,9 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len) return (target < src || target > src + src_len); } -void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) +static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { - int prev, target = 0; - - for (int next, i = 0; i < len; i = next) { + for (int next, i = 0; i < instrlen; i = next) { struct insn insn; if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) @@ -347,9 +493,6 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) next = i + insn.length; - if (__optimize_nops(buf, len, &insn, &next, &prev, &target)) - continue; - switch (insn.opcode.bytes[0]) { case 0x0f: if (insn.opcode.bytes[1] < 0x80 || @@ -361,10 +504,10 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) case JMP8_INSN_OPCODE: case JMP32_INSN_OPCODE: case CALL_INSN_OPCODE: - if (need_reloc(next + insn.immediate.value, src, src_len)) { + if (need_reloc(next + insn.immediate.value, repl, repl_len)) { apply_reloc(insn.immediate.nbytes, buf + i + insn_offset_immediate(&insn), - src - dest); + repl - instr); } /* @@ -372,7 +515,7 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) */ if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { s32 imm = insn.immediate.value; - imm += src - dest; + imm += repl - instr; imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; if ((imm >> 31) == (imm >> 7)) { buf[i+0] = JMP8_INSN_OPCODE; @@ -385,15 +528,21 @@ void apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len) } if (insn_rip_relative(&insn)) { - if (need_reloc(next + insn.displacement.value, src, src_len)) { + if (need_reloc(next + insn.displacement.value, repl, repl_len)) { apply_reloc(insn.displacement.nbytes, buf + i + insn_offset_displacement(&insn), - src - dest); + repl - instr); } } } } +void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) +{ + __apply_relocation(buf, instr, instrlen, repl, repl_len); + optimize_nops(instr, buf, instrlen); +} + /* Low-level backend functions usable from alternative code replacements. */ DEFINE_ASM_FUNC(nop_func, "", .entry.text); EXPORT_SYMBOL_GPL(nop_func); @@ -451,6 +600,11 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) return 5; } +static inline u8 * instr_va(struct alt_instr *i) +{ + return (u8 *)&i->instr_offset + i->instr_offset; +} + /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. @@ -464,14 +618,14 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) void __init_or_module noinline apply_alternatives(struct alt_instr *start, struct alt_instr *end) { - struct alt_instr *a; - u8 *instr, *replacement; u8 insn_buff[MAX_PATCH_LEN]; + u8 *instr, *replacement; + struct alt_instr *a, *b; DPRINTK(ALT, "alt table %px, -> %px", start, end); /* - * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using + * KASAN_SHADOW_START is defined using * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. * During the process, KASAN becomes confused seeing partial LA57 * conversion and triggers a false-positive out-of-bound report. @@ -492,7 +646,18 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, for (a = start; a < end; a++) { int insn_buff_sz = 0; - instr = (u8 *)&a->instr_offset + a->instr_offset; + /* + * In case of nested ALTERNATIVE()s the outer alternative might + * add more padding. To ensure consistent patching find the max + * padding for all alt_instr entries for this site (nested + * alternatives result in consecutive entries). + */ + for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) { + u8 len = max(a->instrlen, b->instrlen); + a->instrlen = b->instrlen = len; + } + + instr = instr_va(a); replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); @@ -504,7 +669,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - optimize_nops_inplace(instr, a->instrlen); + memcpy(insn_buff, instr, a->instrlen); + optimize_nops(instr, insn_buff, a->instrlen); + text_poke_early(instr, insn_buff, a->instrlen); continue; } @@ -526,7 +693,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; - apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen); + text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); @@ -582,7 +749,8 @@ static int emit_indirect(int op, int reg, u8 *bytes) return i; } -static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes, + void *call_dest, void *jmp_dest) { u8 op = insn->opcode.bytes[0]; int i = 0; @@ -603,7 +771,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 switch (op) { case CALL_INSN_OPCODE: __text_gen_insn(bytes+i, op, addr+i, - __x86_indirect_call_thunk_array[reg], + call_dest, CALL_INSN_SIZE); i += CALL_INSN_SIZE; break; @@ -611,7 +779,7 @@ static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 case JMP32_INSN_OPCODE: clang_jcc: __text_gen_insn(bytes+i, op, addr+i, - __x86_indirect_jump_thunk_array[reg], + jmp_dest, JMP32_INSN_SIZE); i += JMP32_INSN_SIZE; break; @@ -626,6 +794,48 @@ clang_jcc: return i; } +static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + return __emit_trampoline(addr, insn, bytes, + __x86_indirect_call_thunk_array[reg], + __x86_indirect_jump_thunk_array[reg]); +} + +#ifdef CONFIG_MITIGATION_ITS +static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + u8 *tmp = its_allocate_thunk(reg); + + if (tmp) + thunk = tmp; + + return __emit_trampoline(addr, insn, bytes, thunk, thunk); +} + +/* Check if an indirect branch is at ITS-unsafe address */ +static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return false; + + /* Indirect branch opcode is 2 or 3 bytes depending on reg */ + addr += 1 + reg / 8; + + /* Lower-half of the cacheline? */ + return !(addr & 0x20); +} +#else /* CONFIG_MITIGATION_ITS */ + +#ifdef CONFIG_FINEIBT +static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) +{ + return false; +} +#endif + +#endif /* CONFIG_MITIGATION_ITS */ + /* * Rewrite the compiler generated retpoline thunk calls. * @@ -700,6 +910,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) bytes[i++] = 0xe8; /* LFENCE */ } +#ifdef CONFIG_MITIGATION_ITS + /* + * Check if the address of last byte of emitted-indirect is in + * lower-half of the cacheline. Such branches need ITS mitigation. + */ + if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg)) + return emit_its_trampoline(addr, insn, reg, bytes); +#endif + ret = emit_indirect(op, reg, bytes + i); if (ret < 0) return ret; @@ -733,6 +952,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) int len, ret; u8 bytes[16]; u8 op1, op2; + u8 *dest; ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) @@ -742,8 +962,19 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) op2 = insn.opcode.bytes[1]; switch (op1) { + case 0x70 ... 0x7f: /* Jcc.d8 */ + /* See cfi_paranoid. */ + WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); + continue; + case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: + /* Check for cfi_paranoid + ITS */ + dest = addr + insn.length + insn.immediate.value; + if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) { + WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); + continue; + } break; case 0x0f: /* escape */ @@ -761,7 +992,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { - optimize_nops(bytes, len); + optimize_nops(addr, bytes, len); DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); @@ -771,6 +1002,21 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) #ifdef CONFIG_MITIGATION_RETHUNK +bool cpu_wants_rethunk(void) +{ + return cpu_feature_enabled(X86_FEATURE_RETHUNK); +} + +bool cpu_wants_rethunk_at(void *addr) +{ + if (!cpu_feature_enabled(X86_FEATURE_RETHUNK)) + return false; + if (x86_return_thunk != its_return_thunk) + return true; + + return !((unsigned long)addr & 0x20); +} + /* * Rewrite the compiler generated return thunk tail-calls. * @@ -787,7 +1033,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) int i = 0; /* Patch the custom return thunks... */ - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (cpu_wants_rethunk_at(addr)) { i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { @@ -804,7 +1050,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk()) static_call_force_reinit(); for (s = start; s < end; s++) { @@ -840,32 +1086,53 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) } } } -#else +#else /* !CONFIG_MITIGATION_RETHUNK: */ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_MITIGATION_RETHUNK */ +#endif /* !CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ +#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr); +__noendbr bool is_endbr(u32 *val) +{ + u32 endbr; + + __get_kernel_nofault(&endbr, val, u32, Efault); + return __is_endbr(endbr); + +Efault: + return false; +} -static void __init_or_module poison_endbr(void *addr, bool warn) +#ifdef CONFIG_FINEIBT + +static __noendbr bool exact_endbr(u32 *val) { - u32 endbr, poison = gen_endbr_poison(); + u32 endbr; - if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) - return; + __get_kernel_nofault(&endbr, val, u32, Efault); + return endbr == gen_endbr(); - if (!is_endbr(endbr)) { - WARN_ON_ONCE(warn); +Efault: + return false; +} + +#endif + +static void poison_cfi(void *addr); + +static void __init_or_module poison_endbr(void *addr) +{ + u32 poison = gen_endbr_poison(); + + if (WARN_ON_ONCE(!is_endbr(addr))) return; - } DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); @@ -890,28 +1157,32 @@ void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - poison_endbr(addr, true); + poison_endbr(addr); if (IS_ENABLED(CONFIG_FINEIBT)) poison_cfi(addr - 16); } } -#else +#else /* !CONFIG_X86_KERNEL_IBT: */ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } -#endif /* CONFIG_X86_KERNEL_IBT */ +#endif /* !CONFIG_X86_KERNEL_IBT */ -#ifdef CONFIG_FINEIBT -#define __CFI_DEFAULT CFI_DEFAULT +#ifdef CONFIG_CFI_AUTO_DEFAULT +# define __CFI_DEFAULT CFI_AUTO #elif defined(CONFIG_CFI_CLANG) -#define __CFI_DEFAULT CFI_KCFI +# define __CFI_DEFAULT CFI_KCFI #else -#define __CFI_DEFAULT CFI_OFF +# define __CFI_DEFAULT CFI_OFF #endif enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; +#ifdef CONFIG_FINEIBT_BHI +bool cfi_bhi __ro_after_init = false; +#endif + #ifdef CONFIG_CFI_CLANG struct bpf_insn; @@ -919,11 +1190,7 @@ struct bpf_insn; extern unsigned int __bpf_prog_runX(const void *ctx, const struct bpf_insn *insn); -/* - * Force a reference to the external symbol so the compiler generates - * __kcfi_typid. - */ -__ADDRESSABLE(__bpf_prog_runX); +KCFI_REFERENCE(__bpf_prog_runX); /* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */ asm ( @@ -940,7 +1207,7 @@ asm ( /* Must match bpf_callback_t */ extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64); -__ADDRESSABLE(__bpf_callback_fn); +KCFI_REFERENCE(__bpf_callback_fn); /* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */ asm ( @@ -975,6 +1242,21 @@ u32 cfi_get_func_hash(void *func) return hash; } + +int cfi_get_func_arity(void *func) +{ + bhi_thunk *target; + s32 disp; + + if (cfi_mode != CFI_FINEIBT && !cfi_bhi) + return 0; + + if (get_kernel_nofault(disp, func - 4)) + return 0; + + target = func + disp; + return target - __bhi_args; +} #endif #ifdef CONFIG_FINEIBT @@ -989,7 +1271,7 @@ static u32 cfi_seed __ro_after_init; static u32 cfi_rehash(u32 hash) { hash ^= cfi_seed; - while (unlikely(is_endbr(hash) || is_endbr(-hash))) { + while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) { bool lsb = hash & 1; hash >>= 1; if (lsb) @@ -1011,7 +1293,7 @@ static __init int cfi_parse_cmdline(char *str) } if (!strcmp(str, "auto")) { - cfi_mode = CFI_DEFAULT; + cfi_mode = CFI_AUTO; } else if (!strcmp(str, "off")) { cfi_mode = CFI_OFF; cfi_rand = false; @@ -1021,6 +1303,25 @@ static __init int cfi_parse_cmdline(char *str) cfi_mode = CFI_FINEIBT; } else if (!strcmp(str, "norand")) { cfi_rand = false; + } else if (!strcmp(str, "warn")) { + pr_alert("CFI mismatch non-fatal!\n"); + cfi_warn = true; + } else if (!strcmp(str, "paranoid")) { + if (cfi_mode == CFI_FINEIBT) { + cfi_paranoid = true; + } else { + pr_err("Ignoring paranoid; depends on fineibt.\n"); + } + } else if (!strcmp(str, "bhi")) { +#ifdef CONFIG_FINEIBT_BHI + if (cfi_mode == CFI_FINEIBT) { + cfi_bhi = true; + } else { + pr_err("Ignoring bhi; depends on fineibt.\n"); + } +#else + pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n"); +#endif } else { pr_err("Ignoring unknown cfi option (%s).", str); } @@ -1038,9 +1339,9 @@ early_param("cfi", cfi_parse_cmdline); * __cfi_\func: __cfi_\func: * movl $0x12345678,%eax // 5 endbr64 // 4 * nop subl $0x12345678,%r10d // 7 - * nop jz 1f // 2 - * nop ud2 // 2 - * nop 1: nop // 1 + * nop jne __cfi_\func+6 // 2 + * nop nop3 // 3 + * nop * nop * nop * nop @@ -1052,34 +1353,53 @@ early_param("cfi", cfi_parse_cmdline); * * caller: caller: * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 - * addl $-15(%r11),%r10d // 4 sub $16,%r11 // 4 + * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4 * je 1f // 2 nop4 // 4 * ud2 // 2 - * 1: call __x86_indirect_thunk_r11 // 5 call *%r11; nop2; // 5 + * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6 * */ -asm( ".pushsection .rodata \n" - "fineibt_preamble_start: \n" - " endbr64 \n" - " subl $0x12345678, %r10d \n" - " je fineibt_preamble_end \n" - " ud2 \n" - " nop \n" - "fineibt_preamble_end: \n" +/* + * <fineibt_preamble_start>: + * 0: f3 0f 1e fa endbr64 + * 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d + * b: 75 f9 jne 6 <fineibt_preamble_start+0x6> + * d: 0f 1f 00 nopl (%rax) + * + * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as + * (bad) on x86_64 and raises #UD. + */ +asm( ".pushsection .rodata \n" + "fineibt_preamble_start: \n" + " endbr64 \n" + " subl $0x12345678, %r10d \n" + "fineibt_preamble_bhi: \n" + " jne fineibt_preamble_start+6 \n" + ASM_NOP3 + "fineibt_preamble_end: \n" ".popsection\n" ); extern u8 fineibt_preamble_start[]; +extern u8 fineibt_preamble_bhi[]; extern u8 fineibt_preamble_end[]; #define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) +#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start) +#define fineibt_preamble_ud 6 #define fineibt_preamble_hash 7 +/* + * <fineibt_caller_start>: + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11 + * a: 0f 1f 40 00 nopl 0x0(%rax) + */ asm( ".pushsection .rodata \n" "fineibt_caller_start: \n" " movl $0x12345678, %r10d \n" - " sub $16, %r11 \n" + " lea -0x10(%r11), %r11 \n" ASM_NOP4 "fineibt_caller_end: \n" ".popsection \n" @@ -1093,13 +1413,62 @@ extern u8 fineibt_caller_end[]; #define fineibt_caller_jmp (fineibt_caller_size - 2) -static u32 decode_preamble_hash(void *addr) +/* + * Since FineIBT does hash validation on the callee side it is prone to + * circumvention attacks where a 'naked' ENDBR instruction exists that + * is not part of the fineibt_preamble sequence. + * + * Notably the x86 entry points must be ENDBR and equally cannot be + * fineibt_preamble. + * + * The fineibt_paranoid caller sequence adds additional caller side + * hash validation. This stops such circumvention attacks dead, but at the cost + * of adding a load. + * + * <fineibt_paranoid_start>: + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d + * a: 4d 8d 5b <f0> lea -0x10(%r11), %r11 + * e: 75 fd jne d <fineibt_paranoid_start+0xd> + * 10: 41 ff d3 call *%r11 + * 13: 90 nop + * + * Notably LEA does not modify flags and can be reordered with the CMP, + * avoiding a dependency. Again, using a non-taken (backwards) branch + * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the + * Jcc.d8, causing #UD. + */ +asm( ".pushsection .rodata \n" + "fineibt_paranoid_start: \n" + " movl $0x12345678, %r10d \n" + " cmpl -9(%r11), %r10d \n" + " lea -0x10(%r11), %r11 \n" + " jne fineibt_paranoid_start+0xd \n" + "fineibt_paranoid_ind: \n" + " call *%r11 \n" + " nop \n" + "fineibt_paranoid_end: \n" + ".popsection \n" +); + +extern u8 fineibt_paranoid_start[]; +extern u8 fineibt_paranoid_ind[]; +extern u8 fineibt_paranoid_end[]; + +#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start) +#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start) +#define fineibt_paranoid_ud 0xd + +static u32 decode_preamble_hash(void *addr, int *reg) { u8 *p = addr; - /* b8 78 56 34 12 mov $0x12345678,%eax */ - if (p[0] == 0xb8) + /* b8+reg 78 56 34 12 movl $0x12345678,\reg */ + if (p[0] >= 0xb8 && p[0] < 0xc0) { + if (reg) + *reg = p[0] - 0xb8; return *(u32 *)(addr + 1); + } return 0; /* invalid hash value */ } @@ -1108,11 +1477,11 @@ static u32 decode_caller_hash(void *addr) { u8 *p = addr; - /* 41 ba 78 56 34 12 mov $0x12345678,%r10d */ + /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */ if (p[0] == 0x41 && p[1] == 0xba) return -*(u32 *)(addr + 2); - /* e8 0c 78 56 34 12 jmp.d8 +12 */ + /* e8 0c 88 a9 cb ed jmp.d8 +12 */ if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) return -*(u32 *)(addr + 2); @@ -1177,7 +1546,7 @@ static int cfi_rand_preamble(s32 *start, s32 *end) void *addr = (void *)s + *s; u32 hash; - hash = decode_preamble_hash(addr); + hash = decode_preamble_hash(addr, NULL); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; @@ -1189,15 +1558,71 @@ static int cfi_rand_preamble(s32 *start, s32 *end) return 0; } +static void cfi_fineibt_bhi_preamble(void *addr, int arity) +{ + if (!arity) + return; + + if (!cfi_warn && arity == 1) { + /* + * Crazy scheme to allow arity-1 inline: + * + * __cfi_foo: + * 0: f3 0f 1e fa endbr64 + * 4: 41 81 <ea> 78 56 34 12 sub 0x12345678, %r10d + * b: 49 0f 45 fa cmovne %r10, %rdi + * f: 75 f5 jne __cfi_foo+6 + * 11: 0f 1f 00 nopl (%rax) + * + * Code that direct calls to foo()+0, decodes the tail end as: + * + * foo: + * 0: f5 cmc + * 1: 0f 1f 00 nopl (%rax) + * + * which clobbers CF, but does not affect anything ABI + * wise. + * + * Notably, this scheme is incompatible with permissive CFI + * because the CMOVcc is unconditional and RDI will have been + * clobbered. + */ + const u8 magic[9] = { + 0x49, 0x0f, 0x45, 0xfa, + 0x75, 0xf5, + BYTES_NOP3, + }; + + text_poke_early(addr + fineibt_preamble_bhi, magic, 9); + + return; + } + + text_poke_early(addr + fineibt_preamble_bhi, + text_gen_insn(CALL_INSN_OPCODE, + addr + fineibt_preamble_bhi, + __bhi_args[arity]), + CALL_INSN_SIZE); +} + static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + int arity; u32 hash; - hash = decode_preamble_hash(addr); + /* + * When the function doesn't start with ENDBR the compiler will + * have determined there are no indirect calls to it and we + * don't need no CFI either. + */ + if (!is_endbr(addr + 16)) + continue; + + hash = decode_preamble_hash(addr, &arity); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; @@ -1205,6 +1630,13 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + + WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity, + "kCFI preamble has wrong register at: %pS %*ph\n", + addr, 5, addr); + + if (cfi_bhi) + cfi_fineibt_bhi_preamble(addr, arity); } return 0; @@ -1217,7 +1649,10 @@ static void cfi_rewrite_endbr(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - poison_endbr(addr+16, false); + if (!exact_endbr(addr + 16)) + continue; + + poison_endbr(addr + 16); } } @@ -1241,22 +1676,70 @@ static int cfi_rand_callers(s32 *start, s32 *end) return 0; } +static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2; + +#ifdef CONFIG_MITIGATION_ITS + u8 *tmp = its_allocate_thunk(reg); + if (tmp) + thunk = tmp; +#endif + + return __emit_trampoline(addr, insn, bytes, thunk, thunk); +} + static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; + BUG_ON(fineibt_paranoid_size != 20); + for (s = start; s < end; s++) { void *addr = (void *)s + *s; + struct insn insn; + u8 bytes[20]; u32 hash; + int ret; + u8 op; addr -= fineibt_caller_size; hash = decode_caller_hash(addr); - if (hash) { + if (!hash) + continue; + + if (!cfi_paranoid) { text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); text_poke_early(addr + fineibt_caller_hash, &hash, 4); + /* rely on apply_retpolines() */ + continue; + } + + /* cfi_paranoid */ + ret = insn_decode_kernel(&insn, addr + fineibt_caller_size); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op = insn.opcode.bytes[0]; + if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) { + WARN_ON_ONCE(1); + continue; } - /* rely on apply_retpolines() */ + + memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size); + memcpy(bytes + fineibt_caller_hash, &hash, 4); + + if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) { + emit_paranoid_trampoline(addr + fineibt_caller_size, + &insn, 11, bytes + fineibt_caller_size); + } else { + ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); + if (WARN_ON_ONCE(ret != 3)) + continue; + } + + text_poke_early(addr, bytes, fineibt_paranoid_size); } return 0; @@ -1271,10 +1754,17 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) return; - if (cfi_mode == CFI_DEFAULT) { + if (cfi_mode == CFI_AUTO) { cfi_mode = CFI_KCFI; - if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) { + /* + * FRED has much saner context on exception entry and + * is less easy to take advantage of. + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + cfi_paranoid = true; cfi_mode = CFI_FINEIBT; + } } /* @@ -1331,8 +1821,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, /* now that nobody targets func()+0, remove ENDBR there */ cfi_rewrite_endbr(start_cfi, end_cfi); - if (builtin) - pr_info("Using FineIBT CFI\n"); + if (builtin) { + pr_info("Using %sFineIBT%s CFI\n", + cfi_paranoid ? "paranoid " : "", + cfi_bhi ? "+BHI" : ""); + } return; default: @@ -1350,9 +1843,23 @@ static inline void poison_hash(void *addr) static void poison_cfi(void *addr) { + /* + * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes, + * some (static) functions for which they can determine the address + * is never taken do not get a __cfi prefix, but *DO* get an ENDBR. + * + * As such, these functions will get sealed, but we need to be careful + * to not unconditionally scribble the previous function. + */ switch (cfi_mode) { case CFI_FINEIBT: /* + * FineIBT prefix should start with an ENDBR. + */ + if (!is_endbr(addr)) + break; + + /* * __cfi_\func: * osp nopl (%rax) * subl $0, %r10d @@ -1360,12 +1867,18 @@ static void poison_cfi(void *addr) * ud2 * 1: nop */ - poison_endbr(addr, false); + poison_endbr(addr); poison_hash(addr + fineibt_preamble_hash); break; case CFI_KCFI: /* + * kCFI prefix should start with a valid hash. + */ + if (!decode_preamble_hash(addr, NULL)) + break; + + /* * __cfi_\func: * movl $0, %eax * .skip 11, 0x90 @@ -1378,7 +1891,154 @@ static void poison_cfi(void *addr) } } -#else +/* + * When regs->ip points to a 0xEA byte in the FineIBT preamble, + * return true and fill out target and type. + * + * We check the preamble by checking for the ENDBR instruction relative to the + * 0xEA instruction. + */ +static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr = regs->ip - fineibt_preamble_ud; + u32 hash; + + if (!exact_endbr((void *)addr)) + return false; + + *target = addr + fineibt_preamble_size; + + __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); + *type = (u32)regs->r10 + hash; + + /* + * Since regs->ip points to the middle of an instruction; it cannot + * continue with the normal fixup. + */ + regs->ip = *target; + + return true; + +Efault: + return false; +} + +/* + * regs->ip points to one of the UD2 in __bhi_args[]. + */ +static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr; + u32 hash; + + if (!cfi_bhi) + return false; + + if (regs->ip < (unsigned long)__bhi_args || + regs->ip >= (unsigned long)__bhi_args_end) + return false; + + /* + * Fetch the return address from the stack, this points to the + * FineIBT preamble. Since the CALL instruction is in the 5 last + * bytes of the preamble, the return address is in fact the target + * address. + */ + __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault); + *target = addr; + + addr -= fineibt_preamble_size; + if (!exact_endbr((void *)addr)) + return false; + + __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); + *type = (u32)regs->r10 + hash; + + /* + * The UD2 sites are constructed with a RET immediately following, + * as such the non-fatal case can use the regular fixup. + */ + return true; + +Efault: + return false; +} + +static bool is_paranoid_thunk(unsigned long addr) +{ + u32 thunk; + + __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault); + return (thunk & 0x00FFFFFF) == 0xfd75ea; + +Efault: + return false; +} + +/* + * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] + * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS + * thunk. + */ +static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr = regs->ip - fineibt_paranoid_ud; + + if (!cfi_paranoid) + return false; + + if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) { + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + /* + * Since the trapping instruction is the exact, but LOCK prefixed, + * Jcc.d8 that got us here, the normal fixup will work. + */ + return true; + } + + /* + * The cfi_paranoid + ITS thunk combination results in: + * + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d + * a: 4d 8d 5b f0 lea -0x10(%r11), %r11 + * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11 + * + * Where the paranoid_thunk looks like: + * + * 1d: <ea> (bad) + * __x86_indirect_paranoid_thunk_r11: + * 1e: 75 fd jne 1d + * __x86_indirect_its_thunk_r11: + * 20: 41 ff eb jmp *%r11 + * 23: cc int3 + * + */ + if (is_paranoid_thunk(regs->ip)) { + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + regs->ip = *target; + return true; + } + + return false; +} + +bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + if (decode_fineibt_paranoid(regs, target, type)) + return true; + + if (decode_fineibt_bhi(regs, target, type)) + return true; + + return decode_fineibt_preamble(regs, target, type); +} + +#else /* !CONFIG_FINEIBT: */ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi, bool builtin) @@ -1389,7 +2049,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, static void poison_cfi(void *addr) { } #endif -#endif +#endif /* !CONFIG_FINEIBT */ void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, s32 *start_cfi, s32 *end_cfi) @@ -1645,7 +2305,7 @@ __visible noinline void __init __alt_reloc_selftest(void *arg) static noinline void __init alt_reloc_selftest(void) { /* - * Tests apply_relocation(). + * Tests text_poke_apply_relocation(). * * This has a relative immediate (CALL) in a place other than the first * instruction and additionally on x86_64 we get a RIP-relative LEA: @@ -1658,7 +2318,7 @@ static noinline void __init alt_reloc_selftest(void) */ asm_inline volatile ( ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) - : /* output */ + : ASM_CALL_CONSTRAINT : [mem] "m" (__alt_reloc_selftest_addr) : _ASM_ARG1 ); @@ -1666,6 +2326,8 @@ static noinline void __init alt_reloc_selftest(void) void __init alternative_instructions(void) { + u64 ibt; + int3_selftest(); /* @@ -1692,6 +2354,9 @@ void __init alternative_instructions(void) */ paravirt_set_cap(); + /* Keep CET-IBT disabled until caller/callee are patched */ + ibt = ibt_save(/*disable*/ true); + __apply_fineibt(__retpoline_sites, __retpoline_sites_end, __cfi_sites, __cfi_sites_end, true); @@ -1702,19 +2367,23 @@ void __init alternative_instructions(void) apply_retpolines(__retpoline_sites, __retpoline_sites_end); apply_returns(__return_sites, __return_sites_end); - apply_alternatives(__alt_instructions, __alt_instructions_end); + its_fini_core(); /* - * Now all calls are established. Apply the call thunks if - * required. + * Adjust all CALL instructions to point to func()-10, including + * those in .altinstr_replacement. */ callthunks_patch_builtin_calls(); + apply_alternatives(__alt_instructions, __alt_instructions_end); + /* * Seal all functions that do not have their address taken. */ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + ibt_restore(ibt); + #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { @@ -1775,72 +2444,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } } -typedef struct { - struct mm_struct *mm; -} temp_mm_state_t; - -/* - * Using a temporary mm allows to set temporary mappings that are not accessible - * by other CPUs. Such mappings are needed to perform sensitive memory writes - * that override the kernel memory protections (e.g., W^X), without exposing the - * temporary page-table mappings that are required for these write operations to - * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the - * mapping is torn down. - * - * Context: The temporary mm needs to be used exclusively by a single core. To - * harden security IRQs must be disabled while the temporary mm is - * loaded, thereby preventing interrupt handler bugs from overriding - * the kernel memory protection. - */ -static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) -{ - temp_mm_state_t temp_state; - - lockdep_assert_irqs_disabled(); - - /* - * Make sure not to be in TLB lazy mode, as otherwise we'll end up - * with a stale address space WITHOUT being in lazy mode after - * restoring the previous mm. - */ - if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) - leave_mm(); - - temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); - switch_mm_irqs_off(NULL, mm, current); - - /* - * If breakpoints are enabled, disable them while the temporary mm is - * used. Userspace might set up watchpoints on addresses that are used - * in the temporary mm, which would lead to wrong signals being sent or - * crashes. - * - * Note that breakpoints are not disabled selectively, which also causes - * kernel breakpoints (e.g., perf's) to be disabled. This might be - * undesirable, but still seems reasonable as the code that runs in the - * temporary mm should be short. - */ - if (hw_breakpoint_active()) - hw_breakpoint_disable(); - - return temp_state; -} - -static inline void unuse_temporary_mm(temp_mm_state_t prev_state) -{ - lockdep_assert_irqs_disabled(); - switch_mm_irqs_off(NULL, prev_state.mm, current); - - /* - * Restore the breakpoints if they were disabled before the temporary mm - * was loaded. - */ - if (hw_breakpoint_active()) - hw_breakpoint_restore(); -} - -__ro_after_init struct mm_struct *poking_mm; -__ro_after_init unsigned long poking_addr; +__ro_after_init struct mm_struct *text_poke_mm; +__ro_after_init unsigned long text_poke_mm_addr; static void text_poke_memcpy(void *dst, const void *src, size_t len) { @@ -1860,7 +2465,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; - temp_mm_state_t prev; + struct mm_struct *prev_mm; unsigned long flags; pte_t pte, *ptep; spinlock_t *ptl; @@ -1897,7 +2502,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l /* * The lock is not really needed, but this allows to avoid open-coding. */ - ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); /* * This must not fail; preallocated in poking_init(). @@ -1907,21 +2512,21 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l local_irq_save(flags); pte = mk_pte(pages[0], pgprot); - set_pte_at(poking_mm, poking_addr, ptep, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte); if (cross_page_boundary) { pte = mk_pte(pages[1], pgprot); - set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte); } /* * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. */ - prev = use_temporary_mm(poking_mm); + prev_mm = use_temporary_mm(text_poke_mm); kasan_disable_current(); - func((u8 *)poking_addr + offset_in_page(addr), src, len); + func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* @@ -1930,22 +2535,22 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l */ barrier(); - pte_clear(poking_mm, poking_addr, ptep); + pte_clear(text_poke_mm, text_poke_mm_addr, ptep); if (cross_page_boundary) - pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1); /* * Loading the previous page-table hierarchy requires a serializing * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. */ - unuse_temporary_mm(prev); + unuse_temporary_mm(prev_mm); /* * Flushing the TLB might involve IPIs, which would require enabled * IRQs, but not if the mm is not used, as it is in this point. */ - flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); @@ -2081,7 +2686,7 @@ static void do_sync_core(void *info) sync_core(); } -void text_poke_sync(void) +void smp_text_poke_sync_each_cpu(void) { on_each_cpu(do_sync_core, NULL, 1); } @@ -2091,64 +2696,66 @@ void text_poke_sync(void) * this thing. When len == 6 everything is prefixed with 0x0f and we map * opcode to Jcc.d8, using len to distinguish. */ -struct text_poke_loc { +struct smp_text_poke_loc { /* addr := _stext + rel_addr */ s32 rel_addr; s32 disp; u8 len; u8 opcode; - const u8 text[POKE_MAX_OPCODE_SIZE]; - /* see text_poke_bp_batch() */ + const u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; + /* see smp_text_poke_batch_finish() */ u8 old; }; -struct bp_patching_desc { - struct text_poke_loc *vec; +#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) + +static struct smp_text_poke_array { + struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX]; int nr_entries; - atomic_t refs; -}; +} text_poke_array; -static struct bp_patching_desc bp_desc; +static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); -static __always_inline -struct bp_patching_desc *try_get_desc(void) +/* + * These four __always_inline annotations imply noinstr, necessary + * due to smp_text_poke_int3_handler() being noinstr: + */ + +static __always_inline bool try_get_text_poke_array(void) { - struct bp_patching_desc *desc = &bp_desc; + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); - if (!raw_atomic_inc_not_zero(&desc->refs)) - return NULL; + if (!raw_atomic_inc_not_zero(refs)) + return false; - return desc; + return true; } -static __always_inline void put_desc(void) +static __always_inline void put_text_poke_array(void) { - struct bp_patching_desc *desc = &bp_desc; + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); smp_mb__before_atomic(); - raw_atomic_dec(&desc->refs); + raw_atomic_dec(refs); } -static __always_inline void *text_poke_addr(struct text_poke_loc *tp) +static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl) { - return _stext + tp->rel_addr; + return _stext + tpl->rel_addr; } -static __always_inline int patch_cmp(const void *key, const void *elt) +static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b) { - struct text_poke_loc *tp = (struct text_poke_loc *) elt; - - if (key < text_poke_addr(tp)) + if (tpl_a < text_poke_addr(tpl_b)) return -1; - if (key > text_poke_addr(tp)) + if (tpl_a > text_poke_addr(tpl_b)) return 1; return 0; } -noinstr int poke_int3_handler(struct pt_regs *regs) +noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { - struct bp_patching_desc *desc; - struct text_poke_loc *tp; + struct smp_text_poke_loc *tpl; int ret = 0; void *ip; @@ -2157,41 +2764,40 @@ noinstr int poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * bp_desc with non-zero refcount: + * text_poke_array with non-zero refcount: * - * bp_desc.refs = 1 INT3 - * WMB RMB - * write INT3 if (bp_desc.refs != 0) + * text_poke_array_refs = 1 INT3 + * WMB RMB + * write INT3 if (text_poke_array_refs != 0) */ smp_rmb(); - desc = try_get_desc(); - if (!desc) + if (!try_get_text_poke_array()) return 0; /* - * Discount the INT3. See text_poke_bp_batch(). + * Discount the INT3. See smp_text_poke_batch_finish(). */ ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. */ - if (unlikely(desc->nr_entries > 1)) { - tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, - sizeof(struct text_poke_loc), + if (unlikely(text_poke_array.nr_entries > 1)) { + tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries, + sizeof(struct smp_text_poke_loc), patch_cmp); - if (!tp) + if (!tpl) goto out_put; } else { - tp = desc->vec; - if (text_poke_addr(tp) != ip) + tpl = text_poke_array.vec; + if (text_poke_addr(tpl) != ip) goto out_put; } - ip += tp->len; + ip += tpl->len; - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: /* * Someone poked an explicit INT3, they'll want to handle it, @@ -2204,16 +2810,16 @@ noinstr int poke_int3_handler(struct pt_regs *regs) break; case CALL_INSN_OPCODE: - int3_emulate_call(regs, (long)ip + tp->disp); + int3_emulate_call(regs, (long)ip + tpl->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - int3_emulate_jmp(regs, (long)ip + tp->disp); + int3_emulate_jmp(regs, (long)ip + tpl->disp); break; case 0x70 ... 0x7f: /* Jcc */ - int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp); + int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp); break; default: @@ -2223,51 +2829,50 @@ noinstr int poke_int3_handler(struct pt_regs *regs) ret = 1; out_put: - put_desc(); + put_text_poke_array(); return ret; } -#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) -static struct text_poke_loc tp_vec[TP_VEC_MAX]; -static int tp_vec_nr; - /** - * text_poke_bp_batch() -- update instructions on live kernel on SMP - * @tp: vector of instructions to patch - * @nr_entries: number of entries in the vector + * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP + * + * Input state: + * text_poke_array.vec: vector of instructions to patch + * text_poke_array.nr_entries: number of entries in the vector * - * Modify multi-byte instruction by using int3 breakpoint on SMP. - * We completely avoid stop_machine() here, and achieve the - * synchronization using int3 breakpoint. + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls. * * The way it is done: * - For each entry in the vector: - * - add a int3 trap to the address that will be patched - * - sync cores + * - add an INT3 trap to the address that will be patched + * - SMP sync all CPUs * - For each entry in the vector: * - update all but the first byte of the patched range - * - sync cores + * - SMP sync all CPUs * - For each entry in the vector: - * - replace the first byte (int3) by the first byte of + * - replace the first byte (INT3) by the first byte of the * replacing opcode - * - sync cores + * - SMP sync all CPUs */ -static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +void smp_text_poke_batch_finish(void) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; - lockdep_assert_held(&text_mutex); + if (!text_poke_array.nr_entries) + return; - bp_desc.vec = tp; - bp_desc.nr_entries = nr_entries; + lockdep_assert_held(&text_mutex); /* - * Corresponds to the implicit memory barrier in try_get_desc() to - * ensure reading a non-zero refcount provides up to date bp_desc data. + * Corresponds to the implicit memory barrier in try_get_text_poke_array() to + * ensure reading a non-zero refcount provides up to date text_poke_array data. */ - atomic_set_release(&bp_desc.refs, 1); + for_each_possible_cpu(i) + atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1); /* * Function tracing can enable thousands of places that need to be @@ -2280,33 +2885,33 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries cond_resched(); /* - * Corresponding read barrier in int3 notifier for making sure the - * nr_entries and handler are correctly ordered wrt. patching. + * Corresponding read barrier in INT3 notifier for making sure the + * text_poke_array.nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); /* - * First step: add a int3 trap to the address that will be patched. + * First step: add a INT3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) { - tp[i].old = *(u8 *)text_poke_addr(&tp[i]); - text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + for (i = 0; i < text_poke_array.nr_entries; i++) { + text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE); } - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Second step: update all but the first byte of the patched range. */ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, }; - u8 _new[POKE_MAX_OPCODE_SIZE+1]; - const u8 *new = tp[i].text; - int len = tp[i].len; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, }; + u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1]; + const u8 *new = text_poke_array.vec[i].text; + int len = text_poke_array.vec[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, - text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, len - INT3_INSN_SIZE); if (len == 6) { @@ -2315,7 +2920,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries new = _new; } - text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, new + INT3_INSN_SIZE, len - INT3_INSN_SIZE); @@ -2346,7 +2951,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * The old instruction is recorded so that the event can be * processed forwards or backwards. */ - perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len); + perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len); } if (do_sync) { @@ -2355,63 +2960,79 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } /* - * Third step: replace the first byte (int3) by the first byte of + * Third step: replace the first byte (INT3) by the first byte of the * replacing opcode. */ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 byte = tp[i].text[0]; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 byte = text_poke_array.vec[i].text[0]; - if (tp[i].len == 6) + if (text_poke_array.vec[i].len == 6) byte = 0x0f; if (byte == INT3_INSN_OPCODE) continue; - text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE); do_sync++; } if (do_sync) - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Remove and wait for refs to be zero. + * + * Notably, if after step-3 above the INT3 got removed, then the + * smp_text_poke_sync_each_cpu() will have serialized against any running INT3 + * handlers and the below spin-wait will not happen. + * + * IOW. unless the replacement instruction is INT3, this case goes + * unused. */ - if (!atomic_dec_and_test(&bp_desc.refs)) - atomic_cond_read_acquire(&bp_desc.refs, !VAL); + for_each_possible_cpu(i) { + atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i); + + if (unlikely(!atomic_dec_and_test(refs))) + atomic_cond_read_acquire(refs, !VAL); + } + + /* They are all completed: */ + text_poke_array.nr_entries = 0; } -static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, - const void *opcode, size_t len, const void *emulate) +static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { + struct smp_text_poke_loc *tpl; struct insn insn; int ret, i = 0; + tpl = &text_poke_array.vec[text_poke_array.nr_entries++]; + if (len == 6) i = 1; - memcpy((void *)tp->text, opcode+i, len-i); + memcpy((void *)tpl->text, opcode+i, len-i); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); BUG_ON(ret < 0); - tp->rel_addr = addr - (void *)_stext; - tp->len = len; - tp->opcode = insn.opcode.bytes[0]; + tpl->rel_addr = addr - (void *)_stext; + tpl->len = len; + tpl->opcode = insn.opcode.bytes[0]; if (is_jcc32(&insn)) { /* * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. */ - tp->opcode = insn.opcode.bytes[1] - 0x10; + tpl->opcode = insn.opcode.bytes[1] - 0x10; } - switch (tp->opcode) { + switch (tpl->opcode) { case RET_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: @@ -2420,14 +3041,14 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, * next instruction can be padded with INT3. */ for (i = insn.length; i < len; i++) - BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + BUG_ON(tpl->text[i] != INT3_INSN_OPCODE); break; default: BUG_ON(len != insn.length); } - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; @@ -2436,21 +3057,21 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: case 0x70 ... 0x7f: /* Jcc */ - tp->disp = insn.immediate.value; + tpl->disp = insn.immediate.value; break; default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP8_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP8_INSN_OPCODE; + tpl->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP32_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP32_INSN_OPCODE; + tpl->disp = 0; break; default: /* unknown instruction */ @@ -2461,51 +3082,50 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, } /* - * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing * early if needed. */ -static bool tp_order_fail(void *addr) +static bool text_poke_addr_ordered(void *addr) { - struct text_poke_loc *tp; - - if (!tp_vec_nr) - return false; - - if (!addr) /* force */ - return true; + WARN_ON_ONCE(!addr); - tp = &tp_vec[tp_vec_nr - 1]; - if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) + if (!text_poke_array.nr_entries) return true; - return false; -} - -static void text_poke_flush(void *addr) -{ - if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { - text_poke_bp_batch(tp_vec, tp_vec_nr); - tp_vec_nr = 0; - } -} + /* + * If the last current entry's address is higher than the + * new entry's address we'd like to add, then ordering + * is violated and we must first flush all pending patching + * requests: + */ + if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr) + return false; -void text_poke_finish(void) -{ - text_poke_flush(NULL); + return true; } -void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +/** + * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @emulate: instruction to be emulated + * + * Add a new instruction to the current queue of to-be-patched instructions + * the kernel maintains. The patching request will not be executed immediately, + * but becomes part of an array of patching requests, optimized for batched + * execution. All pending patching requests will be executed on the next + * smp_text_poke_batch_finish() call. + */ +void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc *tp; - - text_poke_flush(addr); - - tp = &tp_vec[tp_vec_nr++]; - text_poke_loc_init(tp, addr, opcode, len, emulate); + if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr)) + smp_text_poke_batch_finish(); + __smp_text_poke_batch_add(addr, opcode, len, emulate); } /** - * text_poke_bp() -- update instructions on live kernel on SMP + * smp_text_poke_single() -- update instruction on live kernel on SMP immediately * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy @@ -2513,12 +3133,11 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is - * not possible to allocate memory. + * not possible to allocate memory for a vector. The single instruction + * is patched in immediately. */ -void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) +void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc tp; - - text_poke_loc_init(&tp, addr, opcode, len, emulate); - text_poke_bp_batch(&tp, 1); + smp_text_poke_batch_add(addr, opcode, len, emulate); + smp_text_poke_batch_finish(); } diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 2ae98f754e59..3485d419c2f5 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -39,7 +39,7 @@ #include <asm/gart.h> #include <asm/set_memory.h> #include <asm/dma.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/x86_init.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ @@ -676,7 +676,7 @@ static const struct dma_map_ops gart_dma_ops = { .get_sgtable = dma_common_get_sgtable, .dma_supported = dma_direct_supported, .get_required_mask = dma_direct_get_required_mask, - .alloc_pages = dma_direct_alloc_pages, + .alloc_pages_op = dma_direct_alloc_pages, .free_pages = dma_direct_free_pages, }; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 5bf5f9fc5753..c1acead6227a 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -13,62 +13,11 @@ #include <linux/export.h> #include <linux/spinlock.h> #include <linux/pci_ids.h> -#include <asm/amd_nb.h> - -#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 -#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0 -#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480 -#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630 -#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5 -#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4 -#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5 -#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8 -#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8 -#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a -#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507 -#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb -#define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8 - -#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 -#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec -#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494 -#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c -#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444 -#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728 -#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654 -#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1 -#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d -#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e -#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 -#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 -#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc -#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4 -#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4 -#define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c - -/* Protect the PCI config register pairs used for SMN. */ -static DEFINE_MUTEX(smn_mutex); -static u32 *flush_words; - -static const struct pci_device_id amd_root_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) }, - {} -}; +#include <asm/amd/nb.h> +#include <asm/cpuid/api.h> -#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704 +static u32 *flush_words; static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, @@ -79,65 +28,6 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) }, - {} -}; - -static const struct pci_device_id amd_nb_link_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) }, - {} -}; - -static const struct pci_device_id hygon_root_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_ROOT) }, - {} -}; - -static const struct pci_device_id hygon_nb_misc_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, - {} -}; - -static const struct pci_device_id hygon_nb_link_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F4) }, {} }; @@ -168,144 +58,42 @@ struct amd_northbridge *node_to_amd_nb(int node) } EXPORT_SYMBOL_GPL(node_to_amd_nb); -static struct pci_dev *next_northbridge(struct pci_dev *dev, - const struct pci_device_id *ids) -{ - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(ids, dev)); - return dev; -} - -static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) -{ - struct pci_dev *root; - int err = -ENODEV; - - if (node >= amd_northbridges.num) - goto out; - - root = node_to_amd_nb(node)->root; - if (!root) - goto out; - - mutex_lock(&smn_mutex); - - err = pci_write_config_dword(root, 0x60, address); - if (err) { - pr_warn("Error programming SMN address 0x%x.\n", address); - goto out_unlock; - } - - err = (write ? pci_write_config_dword(root, 0x64, *value) - : pci_read_config_dword(root, 0x64, value)); - if (err) - pr_warn("Error %s SMN address 0x%x.\n", - (write ? "writing to" : "reading from"), address); - -out_unlock: - mutex_unlock(&smn_mutex); - -out: - return err; -} - -int amd_smn_read(u16 node, u32 address, u32 *value) -{ - return __amd_smn_rw(node, address, value, false); -} -EXPORT_SYMBOL_GPL(amd_smn_read); - -int amd_smn_write(u16 node, u32 address, u32 value) -{ - return __amd_smn_rw(node, address, &value, true); -} -EXPORT_SYMBOL_GPL(amd_smn_write); - - static int amd_cache_northbridges(void) { - const struct pci_device_id *misc_ids = amd_nb_misc_ids; - const struct pci_device_id *link_ids = amd_nb_link_ids; - const struct pci_device_id *root_ids = amd_root_ids; - struct pci_dev *root, *misc, *link; struct amd_northbridge *nb; - u16 roots_per_misc = 0; - u16 misc_count = 0; - u16 root_count = 0; - u16 i, j; + u16 i; if (amd_northbridges.num) return 0; - if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - root_ids = hygon_root_ids; - misc_ids = hygon_nb_misc_ids; - link_ids = hygon_nb_link_ids; - } - - misc = NULL; - while ((misc = next_northbridge(misc, misc_ids))) - misc_count++; - - if (!misc_count) - return -ENODEV; - - root = NULL; - while ((root = next_northbridge(root, root_ids))) - root_count++; - - if (root_count) { - roots_per_misc = root_count / misc_count; - - /* - * There should be _exactly_ N roots for each DF/SMN - * interface. - */ - if (!roots_per_misc || (root_count % roots_per_misc)) { - pr_info("Unsupported AMD DF/PCI configuration found\n"); - return -ENODEV; - } - } + amd_northbridges.num = amd_num_nodes(); - nb = kcalloc(misc_count, sizeof(struct amd_northbridge), GFP_KERNEL); + nb = kcalloc(amd_northbridges.num, sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) return -ENOMEM; amd_northbridges.nb = nb; - amd_northbridges.num = misc_count; - link = misc = root = NULL; for (i = 0; i < amd_northbridges.num; i++) { - node_to_amd_nb(i)->root = root = - next_northbridge(root, root_ids); - node_to_amd_nb(i)->misc = misc = - next_northbridge(misc, misc_ids); - node_to_amd_nb(i)->link = link = - next_northbridge(link, link_ids); + node_to_amd_nb(i)->misc = amd_node_get_func(i, 3); /* - * If there are more PCI root devices than data fabric/ - * system management network interfaces, then the (N) - * PCI roots per DF/SMN interface are functionally the - * same (for DF/SMN access) and N-1 are redundant. N-1 - * PCI roots should be skipped per DF/SMN interface so - * the following DF/SMN interfaces get mapped to - * correct PCI roots. + * Each Northbridge must have a 'misc' device. + * If not, then uninitialize everything. */ - for (j = 1; j < roots_per_misc; j++) - root = next_northbridge(root, root_ids); + if (!node_to_amd_nb(i)->misc) { + amd_northbridges.num = 0; + kfree(nb); + return -ENODEV; + } + + node_to_amd_nb(i)->link = amd_node_get_func(i, 4); } if (amd_gart_present()) amd_northbridges.flags |= AMD_NB_GART; - /* - * Check for L3 cache presence. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return 0; /* @@ -334,7 +122,6 @@ static int amd_cache_northbridges(void) */ bool __init early_is_amd_nb(u32 device) { - const struct pci_device_id *misc_ids = amd_nb_misc_ids; const struct pci_device_id *id; u32 vendor = device & 0xffff; @@ -342,11 +129,11 @@ bool __init early_is_amd_nb(u32 device) boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return false; - if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - misc_ids = hygon_nb_misc_ids; + if (cpu_feature_enabled(X86_FEATURE_ZEN)) + return false; device >>= 16; - for (id = misc_ids; id->vendor; id++) + for (id = amd_nb_misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) return true; return false; @@ -354,7 +141,6 @@ bool __init early_is_amd_nb(u32 device) struct resource *amd_get_mmconfig_range(struct resource *res) { - u32 address; u64 base, msr; unsigned int segn_busn_bits; @@ -362,13 +148,11 @@ struct resource *amd_get_mmconfig_range(struct resource *res) boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return NULL; - /* assume all cpus from fam10h have mmconfig */ - if (boot_cpu_data.x86 < 0x10) + /* Assume CPUs from Fam10h have mmconfig, although not all VMs do */ + if (boot_cpu_data.x86 < 0x10 || + rdmsrq_safe(MSR_FAM10H_MMIO_CONF_BASE, &msr)) return NULL; - address = MSR_FAM10H_MMIO_CONF_BASE; - rdmsrl(address, msr); - /* mmconfig is not enabled */ if (!(msr & FAM10H_MMIO_CONF_ENABLE)) return NULL; @@ -531,6 +315,10 @@ static __init void fix_erratum_688(void) static __init int init_amd_nbs(void) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return 0; + amd_cache_northbridges(); amd_cache_gart(); diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c new file mode 100644 index 000000000000..a40176b62eb5 --- /dev/null +++ b/arch/x86/kernel/amd_node.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * AMD Node helper functions and common defines + * + * Copyright (c) 2024, Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Author: Yazen Ghannam <Yazen.Ghannam@amd.com> + */ + +#include <linux/debugfs.h> +#include <asm/amd/node.h> + +/* + * AMD Nodes are a physical collection of I/O devices within an SoC. There can be one + * or more nodes per package. + * + * The nodes are software-visible through PCI config space. All nodes are enumerated + * on segment 0 bus 0. The device (slot) numbers range from 0x18 to 0x1F (maximum 8 + * nodes) with 0x18 corresponding to node 0, 0x19 to node 1, etc. Each node can be a + * multi-function device. + * + * On legacy systems, these node devices represent integrated Northbridge functionality. + * On Zen-based systems, these node devices represent Data Fabric functionality. + * + * See "Configuration Space Accesses" section in BKDGs or + * "Processor x86 Core" -> "Configuration Space" section in PPRs. + */ +struct pci_dev *amd_node_get_func(u16 node, u8 func) +{ + if (node >= MAX_AMD_NUM_NODES) + return NULL; + + return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func)); +} + +#define DF_BLK_INST_CNT 0x040 +#define DF_CFG_ADDR_CNTL_LEGACY 0x084 +#define DF_CFG_ADDR_CNTL_DF4 0xC04 + +#define DF_MAJOR_REVISION GENMASK(27, 24) + +static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0) +{ + u32 reg; + + /* + * Revision fields added for DF4 and later. + * + * Major revision of '0' is found pre-DF4. Field is Read-as-Zero. + */ + if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, ®)) + return 0; + + if (reg & DF_MAJOR_REVISION) + return DF_CFG_ADDR_CNTL_DF4; + + return DF_CFG_ADDR_CNTL_LEGACY; +} + +struct pci_dev *amd_node_get_root(u16 node) +{ + struct pci_dev *root; + u16 cntl_off; + u8 bus; + + if (!cpu_feature_enabled(X86_FEATURE_ZEN)) + return NULL; + + /* + * D18F0xXXX [Config Address Control] (DF::CfgAddressCntl) + * Bits [7:0] (SecBusNum) holds the bus number of the root device for + * this Data Fabric instance. The segment, device, and function will be 0. + */ + struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0); + if (!df_f0) + return NULL; + + cntl_off = get_cfg_addr_cntl_offset(df_f0); + if (!cntl_off) + return NULL; + + if (pci_read_config_byte(df_f0, cntl_off, &bus)) + return NULL; + + /* Grab the pointer for the actual root device instance. */ + root = pci_get_domain_bus_and_slot(0, bus, 0); + + pci_dbg(root, "is root for AMD node %u\n", node); + return root; +} + +static struct pci_dev **amd_roots; + +/* Protect the PCI config register pairs used for SMN. */ +static DEFINE_MUTEX(smn_mutex); +static bool smn_exclusive; + +#define SMN_INDEX_OFFSET 0x60 +#define SMN_DATA_OFFSET 0x64 + +#define HSMP_INDEX_OFFSET 0xc4 +#define HSMP_DATA_OFFSET 0xc8 + +/* + * SMN accesses may fail in ways that are difficult to detect here in the called + * functions amd_smn_read() and amd_smn_write(). Therefore, callers must do + * their own checking based on what behavior they expect. + * + * For SMN reads, the returned value may be zero if the register is Read-as-Zero. + * Or it may be a "PCI Error Response", e.g. all 0xFFs. The "PCI Error Response" + * can be checked here, and a proper error code can be returned. + * + * But the Read-as-Zero response cannot be verified here. A value of 0 may be + * correct in some cases, so callers must check that this correct is for the + * register/fields they need. + * + * For SMN writes, success can be determined through a "write and read back" + * However, this is not robust when done here. + * + * Possible issues: + * + * 1) Bits that are "Write-1-to-Clear". In this case, the read value should + * *not* match the write value. + * + * 2) Bits that are "Read-as-Zero"/"Writes-Ignored". This information cannot be + * known here. + * + * 3) Bits that are "Reserved / Set to 1". Ditto above. + * + * Callers of amd_smn_write() should do the "write and read back" check + * themselves, if needed. + * + * For #1, they can see if their target bits got cleared. + * + * For #2 and #3, they can check if their target bits got set as intended. + * + * This matches what is done for RDMSR/WRMSR. As long as there's no #GP, then + * the operation is considered a success, and the caller does their own + * checking. + */ +static int __amd_smn_rw(u8 i_off, u8 d_off, u16 node, u32 address, u32 *value, bool write) +{ + struct pci_dev *root; + int err = -ENODEV; + + if (node >= amd_num_nodes()) + return err; + + root = amd_roots[node]; + if (!root) + return err; + + if (!smn_exclusive) + return err; + + guard(mutex)(&smn_mutex); + + err = pci_write_config_dword(root, i_off, address); + if (err) { + pr_warn("Error programming SMN address 0x%x.\n", address); + return pcibios_err_to_errno(err); + } + + err = (write ? pci_write_config_dword(root, d_off, *value) + : pci_read_config_dword(root, d_off, value)); + + return pcibios_err_to_errno(err); +} + +int __must_check amd_smn_read(u16 node, u32 address, u32 *value) +{ + int err = __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, value, false); + + if (PCI_POSSIBLE_ERROR(*value)) { + err = -ENODEV; + *value = 0; + } + + return err; +} +EXPORT_SYMBOL_GPL(amd_smn_read); + +int __must_check amd_smn_write(u16 node, u32 address, u32 value) +{ + return __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, &value, true); +} +EXPORT_SYMBOL_GPL(amd_smn_write); + +int __must_check amd_smn_hsmp_rdwr(u16 node, u32 address, u32 *value, bool write) +{ + return __amd_smn_rw(HSMP_INDEX_OFFSET, HSMP_DATA_OFFSET, node, address, value, write); +} +EXPORT_SYMBOL_GPL(amd_smn_hsmp_rdwr); + +static struct dentry *debugfs_dir; +static u16 debug_node; +static u32 debug_address; + +static ssize_t smn_node_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + u16 node; + int ret; + + ret = kstrtou16_from_user(userbuf, count, 0, &node); + if (ret) + return ret; + + if (node >= amd_num_nodes()) + return -ENODEV; + + debug_node = node; + return count; +} + +static int smn_node_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%08x\n", debug_node); + return 0; +} + +static ssize_t smn_address_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + int ret; + + ret = kstrtouint_from_user(userbuf, count, 0, &debug_address); + if (ret) + return ret; + + return count; +} + +static int smn_address_show(struct seq_file *m, void *v) +{ + seq_printf(m, "0x%08x\n", debug_address); + return 0; +} + +static int smn_value_show(struct seq_file *m, void *v) +{ + u32 val; + int ret; + + ret = amd_smn_read(debug_node, debug_address, &val); + if (ret) + return ret; + + seq_printf(m, "0x%08x\n", val); + return 0; +} + +static ssize_t smn_value_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + u32 val; + int ret; + + ret = kstrtouint_from_user(userbuf, count, 0, &val); + if (ret) + return ret; + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + + ret = amd_smn_write(debug_node, debug_address, val); + if (ret) + return ret; + + return count; +} + +DEFINE_SHOW_STORE_ATTRIBUTE(smn_node); +DEFINE_SHOW_STORE_ATTRIBUTE(smn_address); +DEFINE_SHOW_STORE_ATTRIBUTE(smn_value); + +static int amd_cache_roots(void) +{ + u16 node, num_nodes = amd_num_nodes(); + + amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL); + if (!amd_roots) + return -ENOMEM; + + for (node = 0; node < num_nodes; node++) + amd_roots[node] = amd_node_get_root(node); + + return 0; +} + +static int reserve_root_config_spaces(void) +{ + struct pci_dev *root = NULL; + struct pci_bus *bus = NULL; + + while ((bus = pci_find_next_bus(bus))) { + /* Root device is Device 0 Function 0 on each Primary Bus. */ + root = pci_get_slot(bus, 0); + if (!root) + continue; + + if (root->vendor != PCI_VENDOR_ID_AMD && + root->vendor != PCI_VENDOR_ID_HYGON) + continue; + + pci_dbg(root, "Reserving PCI config space\n"); + + /* + * There are a few SMN index/data pairs and other registers + * that shouldn't be accessed by user space. + * So reserve the entire PCI config space for simplicity rather + * than covering specific registers piecemeal. + */ + if (!pci_request_config_region_exclusive(root, 0, PCI_CFG_SPACE_SIZE, NULL)) { + pci_err(root, "Failed to reserve config space\n"); + return -EEXIST; + } + } + + smn_exclusive = true; + return 0; +} + +static bool enable_dfs; + +static int __init amd_smn_enable_dfs(char *str) +{ + enable_dfs = true; + return 1; +} +__setup("amd_smn_debugfs_enable", amd_smn_enable_dfs); + +static int __init amd_smn_init(void) +{ + int err; + + if (!cpu_feature_enabled(X86_FEATURE_ZEN)) + return 0; + + guard(mutex)(&smn_mutex); + + if (amd_roots) + return 0; + + err = amd_cache_roots(); + if (err) + return err; + + err = reserve_root_config_spaces(); + if (err) + return err; + + if (enable_dfs) { + debugfs_dir = debugfs_create_dir("amd_smn", arch_debugfs_dir); + + debugfs_create_file("node", 0600, debugfs_dir, NULL, &smn_node_fops); + debugfs_create_file("address", 0600, debugfs_dir, NULL, &smn_address_fops); + debugfs_create_file("value", 0600, debugfs_dir, NULL, &smn_value_fops); + } + + return 0; +} + +fs_initcall(amd_smn_init); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 89c0c8a3fc7e..769321185a08 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -29,7 +29,7 @@ #include <asm/gart.h> #include <asm/pci-direct.h> #include <asm/dma.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/x86_init.h> #include <linux/crash_dump.h> diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 3bf0487cf3b7..52d1808ee360 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -23,8 +23,5 @@ obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o endif -# APIC probe will depend on the listing order here -obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o - # For 32bit, probe_32 need to be listed last obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a42d8a6f7149..d73ba5a7b623 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -59,6 +59,7 @@ #include <asm/time.h> #include <asm/smp.h> #include <asm/mce.h> +#include <asm/msr.h> #include <asm/tsc.h> #include <asm/hypervisor.h> #include <asm/cpu_device_id.h> @@ -425,7 +426,7 @@ static int lapic_next_deadline(unsigned long delta, weak_wrmsr_fence(); tsc = rdtsc(); - wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); + wrmsrq(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); return 0; } @@ -440,7 +441,19 @@ static int lapic_timer_shutdown(struct clock_event_device *evt) v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, v); - apic_write(APIC_TMICT, 0); + + /* + * Setting APIC_LVT_MASKED (above) should be enough to tell + * the hardware that this timer will never fire. But AMD + * erratum 411 and some Intel CPU behavior circa 2024 say + * otherwise. Time for belt and suspenders programming: mask + * the timer _and_ zero the counter registers: + */ + if (v & APIC_LVT_TIMER_TSCDEADLINE) + wrmsrq(MSR_IA32_TSC_DEADLINE, 0); + else + apic_write(APIC_TMICT, 0); + return 0; } @@ -497,32 +510,32 @@ static struct clock_event_device lapic_clockevent = { static DEFINE_PER_CPU(struct clock_event_device, lapic_events); static const struct x86_cpu_id deadline_match[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */ - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */ + X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x2, 0x2, 0x3a), /* EP */ + X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x4, 0x4, 0x0f), /* EX */ - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020), + X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x2, 0x2, 0x00000011), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x3, 0x3, 0x0700000e), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x4, 0x4, 0x0f00000c), + X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x5, 0x5, 0x0e000003), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014), - X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x3, 0x3, 0x01000136), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x4, 0x4, 0x02000014), + X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x5, 0xf, 0), - X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22), - X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20), - X86_MATCH_INTEL_FAM6_MODEL( HASWELL_G, 0x17), + X86_MATCH_VFM(INTEL_HASWELL, 0x22), + X86_MATCH_VFM(INTEL_HASWELL_L, 0x20), + X86_MATCH_VFM(INTEL_HASWELL_G, 0x17), - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL, 0x25), - X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_G, 0x17), + X86_MATCH_VFM(INTEL_BROADWELL, 0x25), + X86_MATCH_VFM(INTEL_BROADWELL_G, 0x17), - X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_L, 0xb2), - X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE, 0xb2), + X86_MATCH_VFM(INTEL_SKYLAKE_L, 0xb2), + X86_MATCH_VFM(INTEL_SKYLAKE, 0xb2), - X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE_L, 0x52), - X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE, 0x52), + X86_MATCH_VFM(INTEL_KABYLAKE_L, 0x52), + X86_MATCH_VFM(INTEL_KABYLAKE, 0x52), {}, }; @@ -631,7 +644,7 @@ void lapic_update_tsc_freq(void) static __initdata int lapic_cal_loops = -1; static __initdata long lapic_cal_t1, lapic_cal_t2; static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; -static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata u32 lapic_cal_pm1, lapic_cal_pm2; static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; /* @@ -641,7 +654,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) { unsigned long long tsc = 0; long tapic = apic_read(APIC_TMCCT); - unsigned long pm = acpi_pm_read_early(); + u32 pm = acpi_pm_read_early(); if (boot_cpu_has(X86_FEATURE_TSC)) tsc = rdtsc(); @@ -666,7 +679,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) } static int __init -calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) +calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc) { const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; const long pm_thresh = pm_100ms / 100; @@ -677,7 +690,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) return -1; #endif - apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm); + apic_pr_verbose("... PM-Timer delta = %u\n", deltapm); /* Check, if the PM timer is available */ if (!deltapm) @@ -687,14 +700,14 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) if (deltapm > (pm_100ms - pm_thresh) && deltapm < (pm_100ms + pm_thresh)) { - apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n"); + apic_pr_verbose("... PM-Timer result ok\n"); return 0; } res = (((u64)deltapm) * mult) >> 22; do_div(res, 1000000); - pr_warn("APIC calibration not consistent " - "with PM-Timer: %ldms instead of 100ms\n", (long)res); + pr_warn("APIC calibration not consistent with PM-Timer: %ldms instead of 100ms\n", + (long)res); /* Correct the lapic counter value */ res = (((u64)(*delta)) * pm_100ms); @@ -707,9 +720,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) if (boot_cpu_has(X86_FEATURE_TSC)) { res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); - apic_printk(APIC_VERBOSE, "TSC delta adjusted to " - "PM-Timer: %lu (%ld)\n", - (unsigned long)res, *deltatsc); + apic_pr_verbose("TSC delta adjusted to PM-Timer: %lu (%ld)\n", + (unsigned long)res, *deltatsc); *deltatsc = (long)res; } @@ -792,8 +804,7 @@ static int __init calibrate_APIC_clock(void) * in the clockevent structure and return. */ if (!lapic_init_clockevent()) { - apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", - lapic_timer_period); + apic_pr_verbose("lapic timer already calibrated %d\n", lapic_timer_period); /* * Direct calibration methods must have an always running * local APIC timer, no need for broadcast timer. @@ -802,8 +813,7 @@ static int __init calibrate_APIC_clock(void) return 0; } - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); + apic_pr_verbose("Using local APIC timer interrupts. Calibrating APIC timer ...\n"); /* * There are platforms w/o global clockevent devices. Instead of @@ -866,7 +876,7 @@ static int __init calibrate_APIC_clock(void) /* Build delta t1-t2 as apic timer counts down */ delta = lapic_cal_t1 - lapic_cal_t2; - apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + apic_pr_verbose("... lapic delta = %ld\n", delta); deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); @@ -877,22 +887,19 @@ static int __init calibrate_APIC_clock(void) lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; lapic_init_clockevent(); - apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); - apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); - apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", - lapic_timer_period); + apic_pr_verbose("..... delta %ld\n", delta); + apic_pr_verbose("..... mult: %u\n", lapic_clockevent.mult); + apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period); if (boot_cpu_has(X86_FEATURE_TSC)) { - apic_printk(APIC_VERBOSE, "..... CPU clock speed is " - "%ld.%04ld MHz.\n", - (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), - (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + apic_pr_verbose("..... CPU clock speed is %ld.%04ld MHz.\n", + (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ)); } - apic_printk(APIC_VERBOSE, "..... host bus clock speed is " - "%u.%04u MHz.\n", - lapic_timer_period / (1000000 / HZ), - lapic_timer_period % (1000000 / HZ)); + apic_pr_verbose("..... host bus clock speed is %u.%04u MHz.\n", + lapic_timer_period / (1000000 / HZ), + lapic_timer_period % (1000000 / HZ)); /* * Do a sanity check on the APIC calibration result @@ -911,7 +918,7 @@ static int __init calibrate_APIC_clock(void) * available. */ if (!pm_referenced && global_clock_event) { - apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + apic_pr_verbose("... verify APIC timer\n"); /* * Setup the apic timer manually @@ -932,11 +939,11 @@ static int __init calibrate_APIC_clock(void) /* Jiffies delta */ deltaj = lapic_cal_j2 - lapic_cal_j1; - apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + apic_pr_verbose("... jiffies delta = %lu\n", deltaj); /* Check, if the jiffies result is consistent */ if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) - apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + apic_pr_verbose("... jiffies result ok\n"); else levt->features |= CLOCK_EVT_FEAT_DUMMY; } @@ -1221,9 +1228,8 @@ void __init sync_Arb_IDs(void) */ apic_wait_icr_idle(); - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | - APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_pr_debug("Synchronizing Arb IDs.\n"); + apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); } enum apic_intr_mode_id apic_intr_mode __ro_after_init; @@ -1366,8 +1372,6 @@ void __init apic_intr_mode_init(void) x86_64_probe_apic(); - x86_32_install_bigsmp(); - if (x86_platform.apic_post_init) x86_platform.apic_post_init(); @@ -1409,10 +1413,10 @@ static void lapic_setup_esr(void) if (maxlvt > 3) apic_write(APIC_ESR, 0); value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08x after: 0x%08x\n", - oldvalue, value); + if (value != oldvalue) { + apic_pr_verbose("ESR value before enabling vector: 0x%08x after: 0x%08x\n", + oldvalue, value); + } } #define APIC_IR_REGS APIC_ISR_NR @@ -1599,10 +1603,10 @@ static void setup_local_APIC(void) value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; if (!cpu && (pic_mode || !value || ioapic_is_disabled)) { value = APIC_DM_EXTINT; - apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); + apic_pr_verbose("Enabled ExtINT on CPU#%d\n", cpu); } else { value = APIC_DM_EXTINT | APIC_LVT_MASKED; - apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu); + apic_pr_verbose("Masked ExtINT on CPU#%d\n", cpu); } apic_write(APIC_LVT0, value); @@ -1669,7 +1673,6 @@ static __init void apic_read_boot_cpu_id(bool x2apic) boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } topology_register_boot_apic(boot_cpu_physical_apicid); - x86_32_probe_bigsmp_early(); } #ifdef CONFIG_X86_X2APIC @@ -1687,12 +1690,12 @@ static int x2apic_state; static bool x2apic_hw_locked(void) { - u64 ia32_cap; + u64 x86_arch_cap_msr; u64 msr; - ia32_cap = x86_read_arch_cap_msr(); - if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) { - rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); + x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) { + rdmsrq(MSR_IA32_XAPIC_DISABLE_STATUS, msr); return (msr & LEGACY_XAPIC_DISABLED); } return false; @@ -1705,12 +1708,12 @@ static void __x2apic_disable(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return; - rdmsrl(MSR_IA32_APICBASE, msr); + rdmsrq(MSR_IA32_APICBASE, msr); if (!(msr & X2APIC_ENABLE)) return; /* Disable xapic and x2apic first and then reenable xapic mode */ - wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); - wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); + wrmsrq(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); + wrmsrq(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); printk_once(KERN_INFO "x2apic disabled\n"); } @@ -1718,10 +1721,10 @@ static void __x2apic_enable(void) { u64 msr; - rdmsrl(MSR_IA32_APICBASE, msr); + rdmsrq(MSR_IA32_APICBASE, msr); if (msr & X2APIC_ENABLE) return; - wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); + wrmsrq(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); printk_once(KERN_INFO "x2apic enabled\n"); } @@ -1771,16 +1774,13 @@ void x2apic_setup(void) __x2apic_enable(); } -static __init void apic_set_fixmap(void); +static __init void apic_set_fixmap(bool read_apic); static __init void x2apic_disable(void) { - u32 x2apic_id, state = x2apic_state; + u32 x2apic_id; - x2apic_mode = 0; - x2apic_state = X2APIC_DISABLED; - - if (state != X2APIC_ON) + if (x2apic_state < X2APIC_ON) return; x2apic_id = read_apic_id(); @@ -1793,7 +1793,16 @@ static __init void x2apic_disable(void) } __x2apic_disable(); - apic_set_fixmap(); + + x2apic_mode = 0; + x2apic_state = X2APIC_DISABLED; + + /* + * Don't reread the APIC ID as it was already done from + * check_x2apic() and the APIC driver still is a x2APIC variant, + * which fails to do the read after x2APIC was disabled. + */ + apic_set_fixmap(false); } static __init void x2apic_enable(void) @@ -2003,8 +2012,8 @@ static bool __init detect_init_APIC(void) case X86_VENDOR_HYGON: break; case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || - (boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC))) + if ((boot_cpu_data.x86 == 5 && boot_cpu_has(X86_FEATURE_APIC)) || + boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO) break; goto no_apic; default: @@ -2057,13 +2066,13 @@ void __init init_apic_mappings(void) } } -static __init void apic_set_fixmap(void) +static __init void apic_set_fixmap(bool read_apic) { set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); apic_mmio_base = APIC_BASE; - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - apic_mmio_base, mp_lapic_addr); - apic_read_boot_cpu_id(false); + apic_pr_verbose("Mapped APIC to %16lx (%16lx)\n", apic_mmio_base, mp_lapic_addr); + if (read_apic) + apic_read_boot_cpu_id(false); } void __init register_lapic_address(unsigned long address) @@ -2073,7 +2082,7 @@ void __init register_lapic_address(unsigned long address) mp_lapic_addr = address; if (!x2apic_mode) - apic_set_fixmap(); + apic_set_fixmap(true); } /* @@ -2164,18 +2173,17 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt) apic_eoi(); atomic_inc(&irq_err_count); - apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", - smp_processor_id(), v); + apic_pr_debug("APIC error on CPU%d: %02x", smp_processor_id(), v); v &= 0xff; while (v) { if (v & 0x1) - apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); + apic_pr_debug_cont(" : %s", error_interrupt_reason[i]); i++; v >>= 1; } - apic_printk(APIC_DEBUG, KERN_CONT "\n"); + apic_pr_debug_cont("\n"); trace_error_apic_exit(ERROR_APIC_VECTOR); } @@ -2195,8 +2203,7 @@ static void __init connect_bsp_APIC(void) * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's * local APIC to INT and NMI lines. */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); + apic_pr_verbose("Leaving PIC mode, enabling APIC mode.\n"); imcr_pic_to_apic(); } #endif @@ -2221,8 +2228,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) * IPIs, won't work beyond this point! The only exception are * INIT IPIs. */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); + apic_pr_verbose("Disabling APIC mode, entering PIC mode.\n"); imcr_apic_to_pic(); return; } @@ -2574,19 +2580,12 @@ int apic_is_clustered_box(void) /* * APIC command line parameters */ -static int __init setup_disableapic(char *arg) +static int __init setup_nolapic(char *arg) { apic_is_disabled = true; setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } -early_param("disableapic", setup_disableapic); - -/* same as disableapic, for compatibility */ -static int __init setup_nolapic(char *arg) -{ - return setup_disableapic(arg); -} early_param("nolapic", setup_nolapic); static int __init parse_lapic_timer_c2_ok(char *arg) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index f37ad3392fec..e0308d8c4e6c 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -8,129 +8,25 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ -#include <linux/cpumask.h> #include <linux/export.h> -#include <linux/acpi.h> -#include <asm/jailhouse_para.h> #include <asm/apic.h> #include "local.h" -static struct apic apic_physflat; -static struct apic apic_flat; - -struct apic *apic __ro_after_init = &apic_flat; -EXPORT_SYMBOL_GPL(apic); - -static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ - return 1; -} - -static void _flat_send_IPI_mask(unsigned long mask, int vector) -{ - unsigned long flags; - - local_irq_save(flags); - __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); - local_irq_restore(flags); -} - -static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) -{ - unsigned long mask = cpumask_bits(cpumask)[0]; - - _flat_send_IPI_mask(mask, vector); -} - -static void -flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) -{ - unsigned long mask = cpumask_bits(cpumask)[0]; - int cpu = smp_processor_id(); - - if (cpu < BITS_PER_LONG) - __clear_bit(cpu, &mask); - - _flat_send_IPI_mask(mask, vector); -} - -static u32 flat_get_apic_id(u32 x) +static u32 physflat_get_apic_id(u32 x) { return (x >> 24) & 0xFF; } -static int flat_probe(void) +static int physflat_probe(void) { return 1; } -static struct apic apic_flat __ro_after_init = { - .name = "flat", - .probe = flat_probe, - .acpi_madt_oem_check = flat_acpi_madt_oem_check, - - .dest_mode_logical = true, - - .disable_esr = 0, - - .init_apic_ldr = default_init_apic_ldr, - .cpu_present_to_apicid = default_cpu_present_to_apicid, - - .max_apic_id = 0xFE, - .get_apic_id = flat_get_apic_id, - - .calc_dest_apicid = apic_flat_calc_apicid, - - .send_IPI = default_send_IPI_single, - .send_IPI_mask = flat_send_IPI_mask, - .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, - .send_IPI_allbutself = default_send_IPI_allbutself, - .send_IPI_all = default_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - .nmi_to_offline_cpu = true, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi = native_apic_mem_eoi, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = apic_mem_wait_icr_idle, - .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, -}; - -/* - * Physflat mode is used when there are more than 8 CPUs on a system. - * We cannot use logical delivery in this case because the mask - * overflows, so use physical mode. - */ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { -#ifdef CONFIG_ACPI - /* - * Quirk: some x86_64 machines can only use physical APIC mode - * regardless of how many processors are present (x86_64 ES7000 - * is an example). - */ - if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && - (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { - printk(KERN_DEBUG "system APIC only can use physical flat"); - return 1; - } - - if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) { - printk(KERN_DEBUG "IBM Summit detected, will use apic physical"); - return 1; - } -#endif - - return 0; -} - -static int physflat_probe(void) -{ - return apic == &apic_physflat || num_possible_cpus() > 8 || jailhouse_paravirt(); + return 1; } static struct apic apic_physflat __ro_after_init = { @@ -146,7 +42,7 @@ static struct apic apic_physflat __ro_after_init = { .cpu_present_to_apicid = default_cpu_present_to_apicid, .max_apic_id = 0xFE, - .get_apic_id = flat_get_apic_id, + .get_apic_id = physflat_get_apic_id, .calc_dest_apicid = apic_default_calc_apicid, @@ -166,8 +62,7 @@ static struct apic apic_physflat __ro_after_init = { .wait_icr_idle = apic_mem_wait_icr_idle, .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, }; +apic_driver(apic_physflat); -/* - * We need to check for physflat first, so this order is important. - */ -apic_drivers(apic_physflat, apic_flat); +struct apic *apic __ro_after_init = &apic_physflat; +EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index b5bb7a2e8340..58abb941c45b 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -27,7 +27,13 @@ static void noop_send_IPI_allbutself(int vector) { } static void noop_send_IPI_all(int vector) { } static void noop_send_IPI_self(int vector) { } static void noop_apic_icr_write(u32 low, u32 id) { } -static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; } + +static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip, + unsigned int cpu) +{ + return -1; +} + static u64 noop_apic_icr_read(void) { return 0; } static u32 noop_get_apic_id(u32 apicid) { return 0; } static void noop_apic_eoi(void) { } diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 16410f087b7a..5c5be2d58242 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/pgtable.h> +#include <asm/msr.h> #include <asm/numachip/numachip.h> #include <asm/numachip/numachip_csr.h> @@ -31,7 +32,7 @@ static u32 numachip1_get_apic_id(u32 x) unsigned int id = (x >> 24) & 0xff; if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { - rdmsrl(MSR_FAM10H_NODE_ID, value); + rdmsrq(MSR_FAM10H_NODE_ID, value); id |= (value << 2) & 0xff00; } @@ -42,7 +43,7 @@ static u32 numachip2_get_apic_id(u32 x) { u64 mcfg; - rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg); + rdmsrq(MSR_FAM10H_MMIO_CONF_BASE, mcfg); return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); } @@ -56,7 +57,7 @@ static void numachip2_apic_icr_write(int apicid, unsigned int val) numachip2_write32_lcsr(NUMACHIP2_APIC_ICR, (apicid << 12) | val); } -static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip) +static int numachip_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu) { numachip_apic_icr_write(phys_apicid, APIC_DM_INIT); numachip_apic_icr_write(phys_apicid, APIC_DM_STARTUP | @@ -150,7 +151,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) /* Account for nodes per socket in multi-core-module processors */ if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { - rdmsrl(MSR_FAM10H_NODE_ID, val); + rdmsrq(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c deleted file mode 100644 index 9285d500d5b4..000000000000 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs. - * - * Drives the local APIC in "clustered mode". - */ -#include <linux/cpumask.h> -#include <linux/dmi.h> -#include <linux/smp.h> - -#include <asm/apic.h> -#include <asm/io_apic.h> - -#include "local.h" - -static u32 bigsmp_get_apic_id(u32 x) -{ - return (x >> 24) & 0xFF; -} - -static void bigsmp_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); -} - -static void bigsmp_send_IPI_all(int vector) -{ - default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); -} - -static int dmi_bigsmp; /* can be set by dmi scanners */ - -static int hp_ht_bigsmp(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); - dmi_bigsmp = 1; - - return 0; -} - - -static const struct dmi_system_id bigsmp_dmi_table[] = { - { hp_ht_bigsmp, "HP ProLiant DL760 G2", - { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), - DMI_MATCH(DMI_BIOS_VERSION, "P44-"), - } - }, - - { hp_ht_bigsmp, "HP ProLiant DL740", - { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), - DMI_MATCH(DMI_BIOS_VERSION, "P47-"), - } - }, - { } /* NULL entry stops DMI scanning */ -}; - -static int probe_bigsmp(void) -{ - return dmi_check_system(bigsmp_dmi_table); -} - -static struct apic apic_bigsmp __ro_after_init = { - - .name = "bigsmp", - .probe = probe_bigsmp, - - .dest_mode_logical = false, - - .disable_esr = 1, - - .cpu_present_to_apicid = default_cpu_present_to_apicid, - - .max_apic_id = 0xFE, - .get_apic_id = bigsmp_get_apic_id, - - .calc_dest_apicid = apic_default_calc_apicid, - - .send_IPI = default_send_IPI_single_phys, - .send_IPI_mask = default_send_IPI_mask_sequence_phys, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = bigsmp_send_IPI_allbutself, - .send_IPI_all = bigsmp_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi = native_apic_mem_eoi, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = apic_mem_wait_icr_idle, - .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, -}; - -bool __init apic_bigsmp_possible(bool cmdline_override) -{ - return apic == &apic_bigsmp || !cmdline_override; -} - -void __init apic_bigsmp_force(void) -{ - if (apic != &apic_bigsmp) - apic_install_driver(&apic_bigsmp); -} - -apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 477b740b2f26..5ba2feb2c04c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -86,8 +86,8 @@ static unsigned int ioapic_dynirq_base; static int ioapic_initialized; struct irq_pin_list { - struct list_head list; - int apic, pin; + struct list_head list; + int apic, pin; }; struct mp_chip_data { @@ -96,7 +96,7 @@ struct mp_chip_data { bool is_level; bool active_low; bool isa_irq; - u32 count; + u32 count; }; struct mp_ioapic_gsi { @@ -105,21 +105,17 @@ struct mp_ioapic_gsi { }; static struct ioapic { - /* - * # of IRQ routing registers - */ - int nr_registers; - /* - * Saved state during suspend/resume, or while enabling intr-remap. - */ - struct IO_APIC_route_entry *saved_registers; + /* # of IRQ routing registers */ + int nr_registers; + /* Saved state during suspend/resume, or while enabling intr-remap. */ + struct IO_APIC_route_entry *saved_registers; /* I/O APIC config */ - struct mpc_ioapic mp_config; + struct mpc_ioapic mp_config; /* IO APIC gsi routing info */ - struct mp_ioapic_gsi gsi_config; - struct ioapic_domain_cfg irqdomain_cfg; - struct irq_domain *irqdomain; - struct resource *iomem_res; + struct mp_ioapic_gsi gsi_config; + struct ioapic_domain_cfg irqdomain_cfg; + struct irq_domain *irqdomain; + struct resource *iomem_res; } ioapics[MAX_IO_APICS]; #define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver @@ -205,10 +201,9 @@ void mp_save_irq(struct mpc_intsrc *m) { int i; - apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," - " IRQ %02x, APIC ID %x, APIC INT %02x\n", - m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, - m->srcbusirq, m->dstapic, m->dstirq); + apic_pr_verbose("Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); for (i = 0; i < mp_irq_entries; i++) { if (!memcmp(&mp_irqs[i], m, sizeof(*m))) @@ -269,12 +264,14 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) static inline void io_apic_eoi(unsigned int apic, unsigned int vector) { struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(vector, &io_apic->eoi); } unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) { struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); return readl(&io_apic->data); } @@ -300,14 +297,8 @@ static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { - struct IO_APIC_route_entry entry; - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - entry = __ioapic_read_entry(apic, pin); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - - return entry; + guard(raw_spinlock_irqsave)(&ioapic_lock); + return __ioapic_read_entry(apic, pin); } /* @@ -324,11 +315,8 @@ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); __ioapic_write_entry(apic, pin, e); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -339,12 +327,10 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) static void ioapic_mask_entry(int apic, int pin) { struct IO_APIC_route_entry e = { .masked = true }; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); io_apic_write(apic, 0x10 + 2*pin, e.w1); io_apic_write(apic, 0x11 + 2*pin, e.w2); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -352,68 +338,39 @@ static void ioapic_mask_entry(int apic, int pin) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static int __add_pin_to_irq_node(struct mp_chip_data *data, - int node, int apic, int pin) +static bool add_pin_to_irq_node(struct mp_chip_data *data, int node, int apic, int pin) { struct irq_pin_list *entry; - /* don't allow duplicates */ - for_each_irq_pin(entry, data->irq_2_pin) + /* Don't allow duplicates */ + for_each_irq_pin(entry, data->irq_2_pin) { if (entry->apic == apic && entry->pin == pin) - return 0; + return true; + } entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node); if (!entry) { - pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", - node, apic, pin); - return -ENOMEM; + pr_err("Cannot allocate irq_pin_list (%d,%d,%d)\n", node, apic, pin); + return false; } + entry->apic = apic; entry->pin = pin; list_add_tail(&entry->list, &data->irq_2_pin); - - return 0; + return true; } static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin) { struct irq_pin_list *tmp, *entry; - list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) + list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) { if (entry->apic == apic && entry->pin == pin) { list_del(&entry->list); kfree(entry); return; } -} - -static void add_pin_to_irq_node(struct mp_chip_data *data, - int node, int apic, int pin) -{ - if (__add_pin_to_irq_node(data, node, apic, pin)) - panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); -} - -/* - * Reroute an IRQ to a different pin. - */ -static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, - int oldapic, int oldpin, - int newapic, int newpin) -{ - struct irq_pin_list *entry; - - for_each_irq_pin(entry, data->irq_2_pin) { - if (entry->apic == oldapic && entry->pin == oldpin) { - entry->apic = newapic; - entry->pin = newpin; - /* every one is different, right? */ - return; - } } - - /* old apic/pin didn't exist, so just add new ones */ - add_pin_to_irq_node(data, node, newapic, newpin); } static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, @@ -430,12 +387,12 @@ static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, } } +/* + * Synchronize the IO-APIC and the CPU by doing a dummy read from the + * IO-APIC + */ static void io_apic_sync(struct irq_pin_list *entry) { - /* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ struct io_apic __iomem *io_apic; io_apic = io_apic_base(entry->apic); @@ -445,11 +402,9 @@ static void io_apic_sync(struct irq_pin_list *entry) static void mask_ioapic_irq(struct irq_data *irq_data) { struct mp_chip_data *data = irq_data->chip_data; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); io_apic_modify_irq(data, true, &io_apic_sync); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void __unmask_ioapic(struct mp_chip_data *data) @@ -460,11 +415,9 @@ static void __unmask_ioapic(struct mp_chip_data *data) static void unmask_ioapic_irq(struct irq_data *irq_data) { struct mp_chip_data *data = irq_data->chip_data; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); __unmask_ioapic(data); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -492,30 +445,24 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) entry = entry1 = __ioapic_read_entry(apic, pin); - /* - * Mask the entry and change the trigger mode to edge. - */ + /* Mask the entry and change the trigger mode to edge. */ entry1.masked = true; entry1.is_level = false; __ioapic_write_entry(apic, pin, entry1); - /* - * Restore the previous level triggered entry. - */ + /* Restore the previous level triggered entry. */ __ioapic_write_entry(apic, pin, entry); } } static void eoi_ioapic_pin(int vector, struct mp_chip_data *data) { - unsigned long flags; struct irq_pin_list *entry; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); for_each_irq_pin(entry, data->irq_2_pin) __eoi_ioapic_pin(entry->apic, entry->pin, vector); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) @@ -538,8 +485,6 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) } if (entry.irr) { - unsigned long flags; - /* * Make sure the trigger mode is set to level. Explicit EOI * doesn't clear the remote-IRR if the trigger mode is not @@ -549,9 +494,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) entry.is_level = true; ioapic_write_entry(apic, pin, entry); } - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); __eoi_ioapic_pin(apic, pin, entry.vector); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -586,28 +530,23 @@ static int pirq_entries[MAX_PIRQS] = { static int __init ioapic_pirq_setup(char *str) { - int i, max; - int ints[MAX_PIRQS+1]; + int i, max, ints[MAX_PIRQS+1]; get_options(str, ARRAY_SIZE(ints), ints); - apic_printk(APIC_VERBOSE, KERN_INFO - "PIRQ redirection, working around broken MP-BIOS.\n"); + apic_pr_verbose("PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; if (ints[0] < MAX_PIRQS) max = ints[0]; for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ + apic_pr_verbose("... PIRQ%d -> IRQ %d\n", i, ints[i + 1]); + /* PIRQs are mapped upside down, usually */ pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; } return 1; } - __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ @@ -626,8 +565,7 @@ int save_ioapic_entries(void) } for_each_pin(apic, pin) - ioapics[apic].saved_registers[pin] = - ioapic_read_entry(apic, pin); + ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin); } return err; @@ -668,8 +606,7 @@ int restore_ioapic_entries(void) continue; for_each_pin(apic, pin) - ioapic_write_entry(apic, pin, - ioapics[apic].saved_registers[pin]); + ioapic_write_entry(apic, pin, ioapics[apic].saved_registers[pin]); } return 0; } @@ -681,12 +618,13 @@ static int find_irq_entry(int ioapic_idx, int pin, int type) { int i; - for (i = 0; i < mp_irq_entries; i++) + for (i = 0; i < mp_irq_entries; i++) { if (mp_irqs[i].irqtype == type && (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) || mp_irqs[i].dstapic == MP_APIC_ALL) && mp_irqs[i].dstirq == pin) return i; + } return -1; } @@ -701,10 +639,8 @@ static int __init find_isa_irq_pin(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].srcbus; - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].irqtype == type) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) && (mp_irqs[i].srcbusirq == irq)) - return mp_irqs[i].dstirq; } return -1; @@ -717,8 +653,7 @@ static int __init find_isa_irq_apic(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].srcbus; - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].irqtype == type) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) && (mp_irqs[i].srcbusirq == irq)) break; } @@ -726,9 +661,10 @@ static int __init find_isa_irq_apic(int irq, int type) if (i < mp_irq_entries) { int ioapic_idx; - for_each_ioapic(ioapic_idx) + for_each_ioapic(ioapic_idx) { if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic) return ioapic_idx; + } } return -1; @@ -769,8 +705,7 @@ static bool EISA_ELCR(unsigned int irq) unsigned int port = PIC_ELCR1 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); + apic_pr_verbose("Broken MPtable reports ISA irq %d\n", irq); return false; } @@ -947,9 +882,9 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, struct irq_alloc_info *info) { + int type = ioapics[ioapic].irqdomain_cfg.type; bool legacy = false; int irq = -1; - int type = ioapics[ioapic].irqdomain_cfg.type; switch (type) { case IOAPIC_DOMAIN_LEGACY: @@ -971,8 +906,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, return -1; } - return __irq_domain_alloc_irqs(domain, irq, 1, - ioapic_alloc_attr_node(info), + return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info), info, legacy, NULL); } @@ -986,13 +920,12 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be * multiple pins sharing the same legacy IRQ number when ACPI is disabled. */ -static int alloc_isa_irq_from_domain(struct irq_domain *domain, - int irq, int ioapic, int pin, +static int alloc_isa_irq_from_domain(struct irq_domain *domain, int irq, int ioapic, int pin, struct irq_alloc_info *info) { - struct mp_chip_data *data; struct irq_data *irq_data = irq_get_irq_data(irq); int node = ioapic_alloc_attr_node(info); + struct mp_chip_data *data; /* * Legacy ISA IRQ has already been allocated, just add pin to @@ -1002,13 +935,11 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, if (irq_data && irq_data->parent_data) { if (!mp_check_pin_attr(irq, info)) return -EBUSY; - if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic, - info->ioapic.pin)) + if (!add_pin_to_irq_node(irq_data->chip_data, node, ioapic, info->ioapic.pin)) return -ENOMEM; } else { info->flags |= X86_IRQ_ALLOC_LEGACY; - irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, - NULL); + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, NULL); if (irq >= 0) { irq_data = irq_domain_get_irq_data(domain, irq); data = irq_data->chip_data; @@ -1022,11 +953,11 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, unsigned int flags, struct irq_alloc_info *info) { - int irq; - bool legacy = false; + struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); struct irq_alloc_info tmp; struct mp_chip_data *data; - struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + bool legacy = false; + int irq; if (!domain) return -ENOSYS; @@ -1046,7 +977,7 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, return -EINVAL; } - mutex_lock(&ioapic_mutex); + guard(mutex)(&ioapic_mutex); if (!(flags & IOAPIC_MAP_ALLOC)) { if (!legacy) { irq = irq_find_mapping(domain, pin); @@ -1067,8 +998,6 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin, data->count++; } } - mutex_unlock(&ioapic_mutex); - return irq; } @@ -1076,26 +1005,20 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags) { u32 gsi = mp_pin_to_gsi(ioapic, pin); - /* - * Debugging check, we are in big trouble if this message pops up! - */ + /* Debugging check, we are in big trouble if this message pops up! */ if (mp_irqs[idx].dstirq != pin) pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); #ifdef CONFIG_X86_32 - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ + /* PCI IRQ command line redirection. Yes, limits are hardcoded. */ if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "disabling PIRQ%d\n", pin-16); + if (pirq_entries[pin - 16] != -1) { + if (!pirq_entries[pin - 16]) { + apic_pr_verbose("Disabling PIRQ%d\n", pin - 16); } else { int irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, KERN_DEBUG - "using PIRQ%d -> IRQ %d\n", - pin-16, irq); + + apic_pr_verbose("Using PIRQ%d -> IRQ %d\n", pin - 16, irq); return irq; } } @@ -1133,10 +1056,9 @@ void mp_unmap_irq(int irq) if (!data || data->isa_irq) return; - mutex_lock(&ioapic_mutex); + guard(mutex)(&ioapic_mutex); if (--data->count == 0) irq_domain_free_irqs(irq, 1); - mutex_unlock(&ioapic_mutex); } /* @@ -1147,12 +1069,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) { int irq, i, best_ioapic = -1, best_idx = -1; - apic_printk(APIC_DEBUG, - "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); + apic_pr_debug("Querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, - "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + apic_pr_verbose("PCI BIOS passed nonexistent PCI bus %d!\n", bus); return -1; } @@ -1197,8 +1117,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return -1; out: - return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, - IOAPIC_MAP_ALLOC); + return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, IOAPIC_MAP_ALLOC); } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); @@ -1209,17 +1128,16 @@ static void __init setup_IO_APIC_irqs(void) unsigned int ioapic, pin; int idx; - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + apic_pr_verbose("Init IO_APIC IRQs\n"); for_each_ioapic_pin(ioapic, pin) { idx = find_irq_entry(ioapic, pin, mp_INT); - if (idx < 0) - apic_printk(APIC_VERBOSE, - KERN_DEBUG " apic %d pin %d not connected\n", - mpc_ioapic_id(ioapic), pin); - else - pin_2_irq(idx, ioapic, pin, - ioapic ? 0 : IOAPIC_MAP_ALLOC); + if (idx < 0) { + apic_pr_verbose("apic %d pin %d not connected\n", + mpc_ioapic_id(ioapic), pin); + } else { + pin_2_irq(idx, ioapic, pin, ioapic ? 0 : IOAPIC_MAP_ALLOC); + } } } @@ -1234,26 +1152,21 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) char buf[256]; int i; - printk(KERN_DEBUG "IOAPIC %d:\n", apic); + apic_dbg("IOAPIC %d:\n", apic); for (i = 0; i <= nr_entries; i++) { entry = ioapic_read_entry(apic, i); - snprintf(buf, sizeof(buf), - " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", - i, - entry.masked ? "disabled" : "enabled ", + snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", + i, entry.masked ? "disabled" : "enabled ", entry.is_level ? "level" : "edge ", entry.active_low ? "low " : "high", entry.vector, entry.irr, entry.delivery_status); if (entry.ir_format) { - printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, - (entry.ir_index_15 << 15) | entry.ir_index_0_14, - entry.ir_zero); + apic_dbg("%s, remapped, I(%04X), Z(%X)\n", buf, + (entry.ir_index_15 << 15) | entry.ir_index_0_14, entry.ir_zero); } else { - printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n", buf, - entry.dest_mode_logical ? "logical " : "physical", - entry.virt_destid_8_14, entry.destid_0_7, - entry.delivery_mode); + apic_dbg("%s, %s, D(%02X%02X), M(%1d)\n", buf, + entry.dest_mode_logical ? "logical " : "physical", + entry.virt_destid_8_14, entry.destid_0_7, entry.delivery_mode); } } } @@ -1264,30 +1177,25 @@ static void __init print_IO_APIC(int ioapic_idx) union IO_APIC_reg_01 reg_01; union IO_APIC_reg_02 reg_02; union IO_APIC_reg_03 reg_03; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic_idx, 0); - reg_01.raw = io_apic_read(ioapic_idx, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(ioapic_idx, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(ioapic_idx, 3); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - - printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); - printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - - printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); - printk(KERN_DEBUG "....... : max redirection entries: %02X\n", - reg_01.bits.entries); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %02X\n", - reg_01.bits.version); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { + reg_00.raw = io_apic_read(ioapic_idx, 0); + reg_01.raw = io_apic_read(ioapic_idx, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(ioapic_idx, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(ioapic_idx, 3); + } + + apic_dbg("IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); + apic_dbg(".... register #00: %08X\n", reg_00.raw); + apic_dbg("....... : physical APIC id: %02X\n", reg_00.bits.ID); + apic_dbg("....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + apic_dbg("....... : LTS : %X\n", reg_00.bits.LTS); + apic_dbg(".... register #01: %08X\n", *(int *)®_01); + apic_dbg("....... : max redirection entries: %02X\n", reg_01.bits.entries); + apic_dbg("....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + apic_dbg("....... : IO APIC version: %02X\n", reg_01.bits.version); /* * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, @@ -1295,8 +1203,8 @@ static void __init print_IO_APIC(int ioapic_idx) * value, so ignore it if reg_02 == reg_01. */ if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + apic_dbg(".... register #02: %08X\n", reg_02.raw); + apic_dbg("....... : arbitration: %02X\n", reg_02.bits.arbitration); } /* @@ -1306,11 +1214,11 @@ static void __init print_IO_APIC(int ioapic_idx) */ if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && reg_03.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); - printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + apic_dbg(".... register #03: %08X\n", reg_03.raw); + apic_dbg("....... : Boot DT : %X\n", reg_03.bits.boot_DT); } - printk(KERN_DEBUG ".... IRQ redirection table:\n"); + apic_dbg(".... IRQ redirection table:\n"); io_apic_print_entries(ioapic_idx, reg_01.bits.entries); } @@ -1319,11 +1227,11 @@ void __init print_IO_APICs(void) int ioapic_idx; unsigned int irq; - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for_each_ioapic(ioapic_idx) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mpc_ioapic_id(ioapic_idx), - ioapics[ioapic_idx].nr_registers); + apic_dbg("number of MP IRQ sources: %d.\n", mp_irq_entries); + for_each_ioapic(ioapic_idx) { + apic_dbg("number of IO-APIC #%d registers: %d.\n", + mpc_ioapic_id(ioapic_idx), ioapics[ioapic_idx].nr_registers); + } /* * We are a bit conservative about what we expect. We have to @@ -1334,7 +1242,7 @@ void __init print_IO_APICs(void) for_each_ioapic(ioapic_idx) print_IO_APIC(ioapic_idx); - printk(KERN_DEBUG "IRQ to pin mappings:\n"); + apic_dbg("IRQ to pin mappings:\n"); for_each_active_irq(irq) { struct irq_pin_list *entry; struct irq_chip *chip; @@ -1349,7 +1257,7 @@ void __init print_IO_APICs(void) if (list_empty(&data->irq_2_pin)) continue; - printk(KERN_DEBUG "IRQ%d ", irq); + apic_dbg("IRQ%d ", irq); for_each_irq_pin(entry, data->irq_2_pin) pr_cont("-> %d:%d", entry->apic, entry->pin); pr_cont("\n"); @@ -1363,8 +1271,7 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; void __init enable_IO_APIC(void) { - int i8259_apic, i8259_pin; - int apic, pin; + int i8259_apic, i8259_pin, apic, pin; if (ioapic_is_disabled) nr_ioapics = 0; @@ -1376,19 +1283,21 @@ void __init enable_IO_APIC(void) /* See if any of the pins is in ExtINT mode */ struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin); - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. + /* + * If the interrupt line is enabled and in ExtInt mode I + * have found the pin where the i8259 is connected. */ - if (!entry.masked && - entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { + if (!entry.masked && entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { ioapic_i8259.apic = apic; ioapic_i8259.pin = pin; - goto found_i8259; + break; } } - found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - /* If we could not find the appropriate pin by looking at the ioapic + + /* + * Look to see what if the MP table has reported the ExtINT + * + * If we could not find the appropriate pin by looking at the ioapic * the i8259 probably is not connected the ioapic but give the * mptable a chance anyway. */ @@ -1396,29 +1305,24 @@ void __init enable_IO_APIC(void) i8259_apic = find_isa_irq_apic(0, mp_ExtINT); /* Trust the MP table if nothing is setup in the hardware */ if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { - printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + pr_warn("ExtINT not setup in hardware but reported by MP table\n"); ioapic_i8259.pin = i8259_pin; ioapic_i8259.apic = i8259_apic; } /* Complain if the MP table and the hardware disagree */ if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && - (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) - { - printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); - } + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + pr_warn("ExtINT in hardware and MP table differ\n"); - /* - * Do not trust the IO-APIC being empty at bootup - */ + /* Do not trust the IO-APIC being empty at bootup */ clear_IO_APIC(); } void native_restore_boot_irq_mode(void) { /* - * If the i8259 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrupts can be delivered. + * If the i8259 is routed through an IOAPIC Put that IOAPIC in + * virtual wire mode so legacy interrupts can be delivered. */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; @@ -1433,9 +1337,7 @@ void native_restore_boot_irq_mode(void) entry.destid_0_7 = apic_id & 0xFF; entry.virt_destid_8_14 = apic_id >> 8; - /* - * Add it to the IO-APIC irq-routing table: - */ + /* Add it to the IO-APIC irq-routing table */ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } @@ -1464,7 +1366,6 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void) const u32 broadcast_id = 0xF; union IO_APIC_reg_00 reg_00; unsigned char old_id; - unsigned long flags; int ioapic_idx, i; /* @@ -1478,9 +1379,8 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void) */ for_each_ioapic(ioapic_idx) { /* Read the register 0 value */ - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic_idx, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) + reg_00.raw = io_apic_read(ioapic_idx, 0); old_id = mpc_ioapic_id(ioapic_idx); @@ -1508,47 +1408,42 @@ static void __init setup_ioapic_ids_from_mpc_nocheck(void) set_bit(i, phys_id_present_map); ioapics[ioapic_idx].mp_config.apicid = i; } else { - apic_printk(APIC_VERBOSE, "Setting %d in the phys_id_present_map\n", - mpc_ioapic_id(ioapic_idx)); + apic_pr_verbose("Setting %d in the phys_id_present_map\n", + mpc_ioapic_id(ioapic_idx)); set_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map); } /* - * We need to adjust the IRQ routing table - * if the ID changed. + * We need to adjust the IRQ routing table if the ID + * changed. */ - if (old_id != mpc_ioapic_id(ioapic_idx)) - for (i = 0; i < mp_irq_entries; i++) + if (old_id != mpc_ioapic_id(ioapic_idx)) { + for (i = 0; i < mp_irq_entries; i++) { if (mp_irqs[i].dstapic == old_id) - mp_irqs[i].dstapic - = mpc_ioapic_id(ioapic_idx); + mp_irqs[i].dstapic = mpc_ioapic_id(ioapic_idx); + } + } /* - * Update the ID register according to the right value - * from the MPC table if they are different. + * Update the ID register according to the right value from + * the MPC table if they are different. */ if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID) continue; - apic_printk(APIC_VERBOSE, KERN_INFO - "...changing IO-APIC physical APIC ID to %d ...", - mpc_ioapic_id(ioapic_idx)); + apic_pr_verbose("...changing IO-APIC physical APIC ID to %d ...", + mpc_ioapic_id(ioapic_idx)); reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); - raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic_idx, 0, reg_00.raw); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic_idx, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { + io_apic_write(ioapic_idx, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic_idx, 0); + } + /* Sanity check */ if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) pr_cont("could not set ID!\n"); else - apic_printk(APIC_VERBOSE, " ok.\n"); + apic_pr_verbose(" ok.\n"); } } @@ -1591,10 +1486,9 @@ static void __init delay_with_tsc(void) * 1 GHz == 40 jiffies */ do { - rep_nop(); + native_pause(); now = rdtsc(); - } while ((now - start) < 40000000000ULL / HZ && - time_before_eq(jiffies, end)); + } while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end)); } static void __init delay_without_tsc(void) @@ -1655,36 +1549,29 @@ static int __init timer_irq_works(void) * so we 'resend' these IRQs via IPIs, to the same CPU. It's much * better to do it this way as thus we do not have to be aware of * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... + * + * Edge triggered needs to resend any interrupt that was delayed but this + * is now handled in the device independent code. + * + * Starting up a edge-triggered IO-APIC interrupt is nasty - we need to + * make sure that we get the edge. If it is already asserted for some + * reason, we need return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake an edge even if it + * isn't on the 8259A... */ static unsigned int startup_ioapic_irq(struct irq_data *data) { int was_pending = 0, irq = data->irq; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); if (irq < nr_legacy_irqs()) { legacy_pic->mask(irq); if (legacy_pic->irq_pending(irq)) was_pending = 1; } __unmask_ioapic(data->chip_data); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return was_pending; } @@ -1694,9 +1581,8 @@ atomic_t irq_mis_count; static bool io_apic_level_ack_pending(struct mp_chip_data *data) { struct irq_pin_list *entry; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); for_each_irq_pin(entry, data->irq_2_pin) { struct IO_APIC_route_entry e; int pin; @@ -1704,13 +1590,9 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) pin = entry->pin; e.w1 = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ - if (e.irr) { - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + if (e.irr) return true; - } } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return false; } @@ -1728,7 +1610,8 @@ static inline bool ioapic_prepare_move(struct irq_data *data) static inline void ioapic_finish_move(struct irq_data *data, bool moveit) { if (unlikely(moveit)) { - /* Only migrate the irq if the ack has been received. + /* + * Only migrate the irq if the ack has been received. * * On rare occasions the broadcast level triggered ack gets * delayed going to ioapics, and if we reprogram the @@ -1911,18 +1794,16 @@ static void ioapic_configure_entry(struct irq_data *irqd) __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); } -static int ioapic_set_affinity(struct irq_data *irq_data, - const struct cpumask *mask, bool force) +static int ioapic_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force) { struct irq_data *parent = irq_data->parent_data; - unsigned long flags; int ret; ret = parent->chip->irq_set_affinity(parent, mask, force); - raw_spin_lock_irqsave(&ioapic_lock, flags); + + guard(raw_spinlock_irqsave)(&ioapic_lock); if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) ioapic_configure_entry(irq_data); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; } @@ -1941,9 +1822,8 @@ static int ioapic_set_affinity(struct irq_data *irq_data, * * Verify that the corresponding Remote-IRR bits are clear. */ -static int ioapic_irq_get_chip_state(struct irq_data *irqd, - enum irqchip_irq_state which, - bool *state) +static int ioapic_irq_get_chip_state(struct irq_data *irqd, enum irqchip_irq_state which, + bool *state) { struct mp_chip_data *mcd = irqd->chip_data; struct IO_APIC_route_entry rentry; @@ -1953,7 +1833,8 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd, return -EINVAL; *state = false; - raw_spin_lock(&ioapic_lock); + + guard(raw_spinlock)(&ioapic_lock); for_each_irq_pin(p, mcd->irq_2_pin) { rentry = __ioapic_read_entry(p->apic, p->pin); /* @@ -1967,7 +1848,6 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd, break; } } - raw_spin_unlock(&ioapic_lock); return 0; } @@ -1981,7 +1861,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_get_irqchip_state = ioapic_irq_get_chip_state, - .flags = IRQCHIP_SKIP_SET_WAKE | + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED | IRQCHIP_AFFINITY_PRE_STARTUP, }; @@ -2008,14 +1888,13 @@ static inline void init_IO_APIC_traps(void) cfg = irq_cfg(irq); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. + * Hmm.. We don't have an entry for this, so + * default to an old-fashioned 8259 interrupt if we + * can. Otherwise set the dummy interrupt chip. */ if (irq < nr_legacy_irqs()) legacy_pic->make_irq(irq); else - /* Strange. Oh, well.. */ irq_set_chip(irq, &no_irq_chip); } } @@ -2024,20 +1903,17 @@ static inline void init_IO_APIC_traps(void) /* * The local APIC irq-chip implementation: */ - static void mask_lapic_irq(struct irq_data *data) { - unsigned long v; + unsigned long v = apic_read(APIC_LVT0); - v = apic_read(APIC_LVT0); apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } static void unmask_lapic_irq(struct irq_data *data) { - unsigned long v; + unsigned long v = apic_read(APIC_LVT0); - v = apic_read(APIC_LVT0); apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } @@ -2056,8 +1932,7 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq) { irq_clear_status_flags(irq, IRQ_LEVEL); - irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, - "edge"); + irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } /* @@ -2069,9 +1944,9 @@ static void lapic_register_intr(int irq) */ static inline void __init unlock_ExtINT_logic(void) { - int apic, pin, i; - struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; + struct IO_APIC_route_entry entry0, entry1; + int apic, pin, i; u32 apic_id; pin = find_isa_irq_pin(8, mp_INT); @@ -2131,10 +2006,10 @@ static int __init disable_timer_pin_setup(char *arg) } early_param("disable_timer_pin_1", disable_timer_pin_setup); -static int mp_alloc_timer_irq(int ioapic, int pin) +static int __init mp_alloc_timer_irq(int ioapic, int pin) { - int irq = -1; struct irq_domain *domain = mp_ioapic_irqdomain(ioapic); + int irq = -1; if (domain) { struct irq_alloc_info info; @@ -2142,21 +2017,36 @@ static int mp_alloc_timer_irq(int ioapic, int pin) ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0); info.devid = mpc_ioapic_id(ioapic); info.ioapic.pin = pin; - mutex_lock(&ioapic_mutex); + guard(mutex)(&ioapic_mutex); irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info); - mutex_unlock(&ioapic_mutex); } return irq; } +static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, data->irq_2_pin) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + return; + } + } + + /* Old apic/pin didn't exist, so just add a new one */ + add_pin_to_irq_node(data, node, newapic, newpin); +} + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. - * - * FIXME: really need to revamp this for all platforms. */ static inline void __init check_timer(void) { @@ -2194,9 +2084,8 @@ static inline void __init check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " - "apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); + pr_info("..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -2240,13 +2129,10 @@ static inline void __init check_timer(void) panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC"); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) - apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); + pr_err("..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " - "(IRQ0) through the 8259A ...\n"); - apic_printk(APIC_QUIET, KERN_INFO - "..... (found apic %d pin %d) ...\n", apic2, pin2); + pr_info("...trying to set up timer (IRQ0) through the 8259A ...\n"); + pr_info("..... (found apic %d pin %d) ...\n", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ @@ -2255,7 +2141,7 @@ static inline void __init check_timer(void) irq_domain_activate_irq(irq_data, false); legacy_pic->unmask(0); if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); + pr_info("....... works.\n"); goto out; } /* @@ -2263,26 +2149,24 @@ static inline void __init check_timer(void) */ legacy_pic->mask(0); clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); + pr_info("....... failed.\n"); } - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as Virtual Wire IRQ...\n"); + pr_info("...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ legacy_pic->unmask(0); if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + pr_info("..... works.\n"); goto out; } legacy_pic->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); + pr_info("..... failed.\n"); - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as ExtINT IRQ...\n"); + pr_info("...trying to set up timer as ExtINT IRQ...\n"); legacy_pic->init(0); legacy_pic->make_irq(0); @@ -2292,14 +2176,15 @@ static inline void __init check_timer(void) unlock_ExtINT_logic(); if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + pr_info("..... works.\n"); goto out; } - apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); - if (apic_is_x2apic_enabled()) - apic_printk(APIC_QUIET, KERN_INFO - "Perhaps problem with the pre-enabled x2apic mode\n" - "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); + + pr_info("..... failed :\n"); + if (apic_is_x2apic_enabled()) { + pr_info("Perhaps problem with the pre-enabled x2apic mode\n" + "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); + } panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option.\n"); out: @@ -2327,11 +2212,11 @@ out: static int mp_irqdomain_create(int ioapic) { - struct irq_domain *parent; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; - struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); + struct irq_domain *parent; struct fwnode_handle *fn; struct irq_fwspec fwspec; @@ -2340,7 +2225,7 @@ static int mp_irqdomain_create(int ioapic) /* Handle device tree enumerated APICs proper */ if (cfg->dev) { - fn = of_node_to_fwnode(cfg->dev); + fn = of_fwnode_handle(cfg->dev); } else { fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic)); if (!fn) @@ -2367,10 +2252,8 @@ static int mp_irqdomain_create(int ioapic) return -ENOMEM; } - if (cfg->type == IOAPIC_DOMAIN_LEGACY || - cfg->type == IOAPIC_DOMAIN_STRICT) - ioapic_dynirq_base = max(ioapic_dynirq_base, - gsi_cfg->gsi_end + 1); + if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT) + ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1); return 0; } @@ -2397,13 +2280,11 @@ void __init setup_IO_APIC(void) io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL; - apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + apic_pr_verbose("ENABLING IO-APIC IRQs\n"); for_each_ioapic(ioapic) BUG_ON(mp_irqdomain_create(ioapic)); - /* - * Set up IO-APIC IRQ routing. - */ + /* Set up IO-APIC IRQ routing. */ x86_init.mpparse.setup_ioapic_ids(); sync_Arb_IDs(); @@ -2417,16 +2298,14 @@ void __init setup_IO_APIC(void) static void resume_ioapic_id(int ioapic_idx) { - unsigned long flags; union IO_APIC_reg_00 reg_00; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); reg_00.raw = io_apic_read(ioapic_idx, 0); if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) { reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); io_apic_write(ioapic_idx, 0, reg_00.raw); } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void ioapic_resume(void) @@ -2440,8 +2319,8 @@ static void ioapic_resume(void) } static struct syscore_ops ioapic_syscore_ops = { - .suspend = save_ioapic_entries, - .resume = ioapic_resume, + .suspend = save_ioapic_entries, + .resume = ioapic_resume, }; static int __init ioapic_init_ops(void) @@ -2456,15 +2335,13 @@ device_initcall(ioapic_init_ops); static int io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; - unsigned long flags; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); reg_01.raw = io_apic_read(ioapic, 1); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - /* The register returns the maximum index redir index - * supported, which is one less than the total number of redir - * entries. + /* + * The register returns the maximum index redir index supported, + * which is one less than the total number of redir entries. */ return reg_01.bits.entries + 1; } @@ -2494,16 +2371,14 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) static DECLARE_BITMAP(apic_id_map, MAX_LOCAL_APIC); const u32 broadcast_id = 0xF; union IO_APIC_reg_00 reg_00; - unsigned long flags; int i = 0; /* Initialize the ID map */ if (bitmap_empty(apic_id_map, MAX_LOCAL_APIC)) copy_phys_cpu_present_map(apic_id_map); - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) + reg_00.raw = io_apic_read(ioapic, 0); if (apic_id >= broadcast_id) { pr_warn("IOAPIC[%d]: Invalid apic_id %d, trying %d\n", @@ -2530,21 +2405,19 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; - raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + } /* Sanity check */ if (reg_00.bits.ID != apic_id) { - pr_err("IOAPIC[%d]: Unable to change apic_id!\n", - ioapic); + pr_err("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); return -1; } } - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + apic_pr_verbose("IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); return apic_id; } @@ -2560,7 +2433,6 @@ static u8 io_apic_unique_id(int idx, u8 id) { union IO_APIC_reg_00 reg_00; DECLARE_BITMAP(used, 256); - unsigned long flags; u8 new_id; int i; @@ -2576,26 +2448,23 @@ static u8 io_apic_unique_id(int idx, u8 id) * Read the current id from the ioapic and keep it if * available. */ - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(idx, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) + reg_00.raw = io_apic_read(idx, 0); + new_id = reg_00.bits.ID; if (!test_bit(new_id, used)) { - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Using reg apic_id %d instead of %d\n", - idx, new_id, id); + apic_pr_verbose("IOAPIC[%d]: Using reg apic_id %d instead of %d\n", + idx, new_id, id); return new_id; } - /* - * Get the next free id and write it to the ioapic. - */ + /* Get the next free id and write it to the ioapic. */ new_id = find_first_zero_bit(used, 256); reg_00.bits.ID = new_id; - raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(idx, 0, reg_00.raw); - reg_00.raw = io_apic_read(idx, 0); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); + scoped_guard (raw_spinlock_irqsave, &ioapic_lock) { + io_apic_write(idx, 0, reg_00.raw); + reg_00.raw = io_apic_read(idx, 0); + } /* Sanity check */ BUG_ON(reg_00.bits.ID != new_id); @@ -2605,12 +2474,10 @@ static u8 io_apic_unique_id(int idx, u8 id) static int io_apic_get_version(int ioapic) { - union IO_APIC_reg_01 reg_01; - unsigned long flags; + union IO_APIC_reg_01 reg_01; - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); reg_01.raw = io_apic_read(ioapic, 1); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.version; } @@ -2625,8 +2492,8 @@ static struct resource *ioapic_resources; static struct resource * __init ioapic_setup_resources(void) { - unsigned long n; struct resource *res; + unsigned long n; char *mem; int i; @@ -2636,9 +2503,7 @@ static struct resource * __init ioapic_setup_resources(void) n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); n *= nr_ioapics; - mem = memblock_alloc(n, SMP_CACHE_BYTES); - if (!mem) - panic("%s: Failed to allocate %lu bytes\n", __func__, n); + mem = memblock_alloc_or_panic(n, SMP_CACHE_BYTES); res = (void *)mem; mem += sizeof(struct resource) * nr_ioapics; @@ -2686,9 +2551,7 @@ void __init io_apic_init_mappings(void) ioapic_phys = mpc_ioapic_addr(i); #ifdef CONFIG_X86_32 if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " + pr_err("WARNING: bogus zero IO-APIC address found in MPTABLE, " "disabling IO/APIC support!\n"); smp_found_config = 0; ioapic_is_disabled = true; @@ -2699,17 +2562,13 @@ void __init io_apic_init_mappings(void) #ifdef CONFIG_X86_32 fake_ioapic_page: #endif - ioapic_phys = (unsigned long)memblock_alloc(PAGE_SIZE, + ioapic_phys = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - if (!ioapic_phys) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } io_apic_set_fixmap(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), - ioapic_phys); + apic_pr_verbose("mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys); idx++; ioapic_res->start = ioapic_phys; @@ -2720,13 +2579,12 @@ fake_ioapic_page: void __init ioapic_insert_resources(void) { - int i; struct resource *r = ioapic_resources; + int i; if (!r) { if (nr_ioapics > 0) - printk(KERN_ERR - "IO APIC resources couldn't be allocated.\n"); + pr_err("IO APIC resources couldn't be allocated.\n"); return; } @@ -2746,11 +2604,12 @@ int mp_find_ioapic(u32 gsi) /* Find the IOAPIC that manages this GSI. */ for_each_ioapic(i) { struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); + if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end) return i; } - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + pr_err("ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); return -1; } @@ -2789,12 +2648,10 @@ static int bad_ioapic_register(int idx) static int find_free_ioapic_entry(void) { - int idx; - - for (idx = 0; idx < MAX_IO_APICS; idx++) + for (int idx = 0; idx < MAX_IO_APICS; idx++) { if (ioapics[idx].nr_registers == 0) return idx; - + } return MAX_IO_APICS; } @@ -2805,8 +2662,7 @@ static int find_free_ioapic_entry(void) * @gsi_base: base of GSI associated with the IOAPIC * @cfg: configuration information for the IOAPIC */ -int mp_register_ioapic(int id, u32 address, u32 gsi_base, - struct ioapic_domain_cfg *cfg) +int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg) { bool hotplug = !!ioapic_initialized; struct mp_ioapic_gsi *gsi_cfg; @@ -2817,12 +2673,13 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, pr_warn("Bogus (zero) I/O APIC address found, skipping!\n"); return -EINVAL; } - for_each_ioapic(ioapic) + + for_each_ioapic(ioapic) { if (ioapics[ioapic].mp_config.apicaddr == address) { - pr_warn("address 0x%x conflicts with IOAPIC%d\n", - address, ioapic); + pr_warn("address 0x%x conflicts with IOAPIC%d\n", address, ioapic); return -EEXIST; } + } idx = find_free_ioapic_entry(); if (idx >= MAX_IO_APICS) { @@ -2857,8 +2714,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, (gsi_end >= gsi_cfg->gsi_base && gsi_end <= gsi_cfg->gsi_end)) { pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n", - gsi_base, gsi_end, - gsi_cfg->gsi_base, gsi_cfg->gsi_end); + gsi_base, gsi_end, gsi_cfg->gsi_base, gsi_cfg->gsi_end); clear_fixmap(FIX_IO_APIC_BASE_0 + idx); return -ENOSPC; } @@ -2892,8 +2748,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base, ioapics[idx].nr_registers = entries; pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", - idx, mpc_ioapic_id(idx), - mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + idx, mpc_ioapic_id(idx), mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), gsi_cfg->gsi_base, gsi_cfg->gsi_end); return 0; @@ -2904,11 +2759,13 @@ int mp_unregister_ioapic(u32 gsi_base) int ioapic, pin; int found = 0; - for_each_ioapic(ioapic) + for_each_ioapic(ioapic) { if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) { found = 1; break; } + } + if (!found) { pr_warn("can't find IOAPIC for GSI %d\n", gsi_base); return -ENODEV; @@ -2922,8 +2779,7 @@ int mp_unregister_ioapic(u32 gsi_base) if (irq >= 0) { data = irq_get_chip_data(irq); if (data && data->count) { - pr_warn("pin%d on IOAPIC%d is still in use.\n", - pin, ioapic); + pr_warn("pin%d on IOAPIC%d is still in use.\n", pin, ioapic); return -EBUSY; } } @@ -2958,8 +2814,7 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, if (info && info->ioapic.valid) { data->is_level = info->ioapic.is_level; data->active_low = info->ioapic.active_low; - } else if (__acpi_get_override_irq(gsi, &data->is_level, - &data->active_low) < 0) { + } else if (__acpi_get_override_irq(gsi, &data->is_level, &data->active_low) < 0) { /* PCI interrupts are always active low level triggered. */ data->is_level = true; data->active_low = true; @@ -3017,10 +2872,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -ENOMEM; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); - if (ret < 0) { - kfree(data); - return ret; - } + if (ret < 0) + goto free_data; INIT_LIST_HEAD(&data->irq_2_pin); irq_data->hwirq = info->ioapic.pin; @@ -3029,7 +2882,10 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, irq_data->chip_data = data; mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); - add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); + if (!add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin)) { + ret = -ENOMEM; + goto free_irqs; + } mp_preconfigure_entry(data); mp_register_handler(virq, data->is_level); @@ -3039,11 +2895,15 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, legacy_pic->mask(virq); local_irq_restore(flags); - apic_printk(APIC_VERBOSE, KERN_DEBUG - "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", - ioapic, mpc_ioapic_id(ioapic), pin, virq, - data->is_level, data->active_low); + apic_pr_verbose("IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", + ioapic, mpc_ioapic_id(ioapic), pin, virq, data->is_level, data->active_low); return 0; + +free_irqs: + irq_domain_free_irqs_parent(domain, virq, nr_irqs); +free_data: + kfree(data); + return ret; } void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, @@ -3056,22 +2916,17 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, irq_data = irq_domain_get_irq_data(domain, virq); if (irq_data && irq_data->chip_data) { data = irq_data->chip_data; - __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), - (int)irq_data->hwirq); + __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq); WARN_ON(!list_empty(&data->irq_2_pin)); kfree(irq_data->chip_data); } irq_domain_free_irqs_top(domain, virq, nr_irqs); } -int mp_irqdomain_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool reserve) +int mp_irqdomain_activate(struct irq_domain *domain, struct irq_data *irq_data, bool reserve) { - unsigned long flags; - - raw_spin_lock_irqsave(&ioapic_lock, flags); + guard(raw_spinlock_irqsave)(&ioapic_lock); ioapic_configure_entry(irq_data); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); return 0; } @@ -3079,8 +2934,7 @@ void mp_irqdomain_deactivate(struct irq_domain *domain, struct irq_data *irq_data) { /* It won't be called for IRQ with multiple IOAPIC pins associated */ - ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), - (int)irq_data->hwirq); + ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq); } int mp_irqdomain_ioapic_idx(struct irq_domain *domain) diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 5da693d633b7..98a57cb4aa86 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -3,6 +3,7 @@ #include <linux/cpumask.h> #include <linux/delay.h> #include <linux/smp.h> +#include <linux/string_choices.h> #include <asm/io_apic.h> @@ -23,7 +24,7 @@ __setup("no_ipi_broadcast=", apic_ipi_shorthand); static int __init print_ipi_mode(void) { pr_info("IPI shorthand broadcast: %s\n", - apic_ipi_shorthand_off ? "disabled" : "enabled"); + str_disabled_enabled(apic_ipi_shorthand_off)); return 0; } late_initcall(print_ipi_mode); @@ -287,34 +288,4 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } - -#ifdef CONFIG_SMP -static int convert_apicid_to_cpu(u32 apic_id) -{ - int i; - - for_each_possible_cpu(i) { - if (per_cpu(x86_cpu_to_apicid, i) == apic_id) - return i; - } - return -1; -} - -int safe_smp_processor_id(void) -{ - u32 apicid; - int cpuid; - - if (!boot_cpu_has(X86_FEATURE_APIC)) - return 0; - - apicid = read_apic_id(); - if (apicid == BAD_APICID) - return 0; - - cpuid = convert_apicid_to_cpu(apicid); - - return cpuid >= 0 ? cpuid : 0; -} -#endif #endif diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index 842fe28496be..bdcf609eb283 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -65,17 +65,4 @@ void default_send_IPI_self(int vector); void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); -void x86_32_probe_bigsmp_early(void); -void x86_32_install_bigsmp(void); -#else -static inline void x86_32_probe_bigsmp_early(void) { } -static inline void x86_32_install_bigsmp(void) { } -#endif - -#ifdef CONFIG_X86_BIGSMP -bool apic_bigsmp_possible(bool cmdline_selected); -void apic_bigsmp_force(void); -#else -static inline bool apic_bigsmp_possible(bool cmdline_selected) { return false; }; -static inline void apic_bigsmp_force(void) { } #endif diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index d9651f15ae4f..66bc5d3e79db 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -184,7 +184,6 @@ static int x86_msi_prepare(struct irq_domain *domain, struct device *dev, alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; return 0; case DOMAIN_BUS_PCI_DEVICE_MSIX: - case DOMAIN_BUS_PCI_DEVICE_IMS: alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; return 0; default: @@ -215,6 +214,7 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain, if (WARN_ON_ONCE(domain != real_parent)) return false; info->chip->irq_set_affinity = msi_set_affinity; + info->chip->flags |= IRQCHIP_MOVE_DEFERRED; break; case DOMAIN_BUS_DMAR: case DOMAIN_BUS_AMDVI: @@ -229,10 +229,6 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain, case DOMAIN_BUS_PCI_DEVICE_MSI: case DOMAIN_BUS_PCI_DEVICE_MSIX: break; - case DOMAIN_BUS_PCI_DEVICE_IMS: - if (!(pops->supported_flags & MSI_FLAG_PCI_IMS)) - return false; - break; default: WARN_ON_ONCE(1); return false; @@ -320,7 +316,7 @@ static struct irq_chip dmar_msi_controller = { .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = dmar_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE | + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED | IRQCHIP_AFFINITY_PRE_STARTUP, }; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index f75ee345c02d..87bc9e7ca5d6 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -93,35 +93,6 @@ static int __init parse_apic(char *arg) } early_param("apic", parse_apic); -void __init x86_32_probe_bigsmp_early(void) -{ - if (nr_cpu_ids <= 8 || xen_pv_domain()) - return; - - if (IS_ENABLED(CONFIG_X86_BIGSMP)) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(boot_cpu_apic_version)) - break; - /* P4 and above */ - fallthrough; - case X86_VENDOR_HYGON: - case X86_VENDOR_AMD: - if (apic_bigsmp_possible(cmdline_apic)) - return; - break; - } - } - pr_info("Limiting to 8 possible CPUs\n"); - set_nr_cpu_ids(8); -} - -void __init x86_32_install_bigsmp(void) -{ - if (nr_cpu_ids > 8 && !xen_pv_domain()) - apic_bigsmp_force(); -} - void __init x86_32_probe_apic(void) { if (!cmdline_apic) { diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 185738c72766..93069b13d3af 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -712,8 +712,8 @@ int __init arch_probe_nr_irqs(void) { int nr; - if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) - nr_irqs = NR_VECTORS * nr_cpu_ids; + if (irq_get_nr_irqs() > NR_VECTORS * nr_cpu_ids) + irq_set_nr_irqs(NR_VECTORS * nr_cpu_ids); nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids; #if defined(CONFIG_PCI_MSI) @@ -725,8 +725,8 @@ int __init arch_probe_nr_irqs(void) else nr += gsi_top * 16; #endif - if (nr < nr_irqs) - nr_irqs = nr; + if (nr < irq_get_nr_irqs()) + irq_set_nr_irqs(nr); /* * We don't know if PIC is present at this point so we need to do @@ -799,7 +799,7 @@ int __init arch_early_irq_init(void) x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops, NULL); BUG_ON(x86_vector_domain == NULL); - irq_set_default_host(x86_vector_domain); + irq_set_default_domain(x86_vector_domain); BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL)); @@ -864,7 +864,7 @@ void lapic_offline(void) __vector_cleanup(cl, false); irq_matrix_offline(vector_matrix); - WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0); + WARN_ON_ONCE(timer_delete_sync_try(&cl->timer) < 0); WARN_ON_ONCE(!hlist_empty(&cl->head)); unlock_vector_lock(); @@ -888,8 +888,109 @@ static int apic_set_affinity(struct irq_data *irqd, return err ? err : IRQ_SET_MASK_OK; } +static void free_moved_vector(struct apic_chip_data *apicd) +{ + unsigned int vector = apicd->prev_vector; + unsigned int cpu = apicd->prev_cpu; + bool managed = apicd->is_managed; + + /* + * Managed interrupts are usually not migrated away + * from an online CPU, but CPU isolation 'managed_irq' + * can make that happen. + * 1) Activation does not take the isolation into account + * to keep the code simple + * 2) Migration away from an isolated CPU can happen when + * a non-isolated CPU which is in the calculated + * affinity mask comes online. + */ + trace_vector_free_moved(apicd->irq, cpu, vector, managed); + irq_matrix_free(vector_matrix, cpu, vector, managed); + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; + hlist_del_init(&apicd->clist); + apicd->prev_vector = 0; + apicd->move_in_progress = 0; +} + +/* + * Called from fixup_irqs() with @desc->lock held and interrupts disabled. + */ +static void apic_force_complete_move(struct irq_data *irqd) +{ + unsigned int cpu = smp_processor_id(); + struct apic_chip_data *apicd; + unsigned int vector; + + guard(raw_spinlock)(&vector_lock); + apicd = apic_chip_data(irqd); + if (!apicd) + return; + + /* + * If prev_vector is empty or the descriptor is neither currently + * nor previously on the outgoing CPU no action required. + */ + vector = apicd->prev_vector; + if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu)) + return; + + /* + * This is tricky. If the cleanup of the old vector has not been + * done yet, then the following setaffinity call will fail with + * -EBUSY. This can leave the interrupt in a stale state. + * + * All CPUs are stuck in stop machine with interrupts disabled so + * calling __irq_complete_move() would be completely pointless. + * + * 1) The interrupt is in move_in_progress state. That means that we + * have not seen an interrupt since the io_apic was reprogrammed to + * the new vector. + * + * 2) The interrupt has fired on the new vector, but the cleanup IPIs + * have not been processed yet. + */ + if (apicd->move_in_progress) { + /* + * In theory there is a race: + * + * set_ioapic(new_vector) <-- Interrupt is raised before update + * is effective, i.e. it's raised on + * the old vector. + * + * So if the target cpu cannot handle that interrupt before + * the old vector is cleaned up, we get a spurious interrupt + * and in the worst case the ioapic irq line becomes stale. + * + * But in case of cpu hotplug this should be a non issue + * because if the affinity update happens right before all + * cpus rendezvous in stop machine, there is no way that the + * interrupt can be blocked on the target cpu because all cpus + * loops first with interrupts enabled in stop machine, so the + * old vector is not yet cleaned up when the interrupt fires. + * + * So the only way to run into this issue is if the delivery + * of the interrupt on the apic/system bus would be delayed + * beyond the point where the target cpu disables interrupts + * in stop machine. I doubt that it can happen, but at least + * there is a theoretical chance. Virtualization might be + * able to expose this, but AFAICT the IOAPIC emulation is not + * as stupid as the real hardware. + * + * Anyway, there is nothing we can do about that at this point + * w/o refactoring the whole fixup_irq() business completely. + * We print at least the irq number and the old vector number, + * so we have the necessary information when a problem in that + * area arises. + */ + pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", + irqd->irq, vector); + } + free_moved_vector(apicd); +} + #else -# define apic_set_affinity NULL +# define apic_set_affinity NULL +# define apic_force_complete_move NULL #endif static int apic_retrigger_irq(struct irq_data *irqd) @@ -923,39 +1024,16 @@ static void x86_vector_msi_compose_msg(struct irq_data *data, } static struct irq_chip lapic_controller = { - .name = "APIC", - .irq_ack = apic_ack_edge, - .irq_set_affinity = apic_set_affinity, - .irq_compose_msi_msg = x86_vector_msi_compose_msg, - .irq_retrigger = apic_retrigger_irq, + .name = "APIC", + .irq_ack = apic_ack_edge, + .irq_set_affinity = apic_set_affinity, + .irq_compose_msi_msg = x86_vector_msi_compose_msg, + .irq_force_complete_move = apic_force_complete_move, + .irq_retrigger = apic_retrigger_irq, }; #ifdef CONFIG_SMP -static void free_moved_vector(struct apic_chip_data *apicd) -{ - unsigned int vector = apicd->prev_vector; - unsigned int cpu = apicd->prev_cpu; - bool managed = apicd->is_managed; - - /* - * Managed interrupts are usually not migrated away - * from an online CPU, but CPU isolation 'managed_irq' - * can make that happen. - * 1) Activation does not take the isolation into account - * to keep the code simple - * 2) Migration away from an isolated CPU can happen when - * a non-isolated CPU which is in the calculated - * affinity mask comes online. - */ - trace_vector_free_moved(apicd->irq, cpu, vector, managed); - irq_matrix_free(vector_matrix, cpu, vector, managed); - per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED; - hlist_del_init(&apicd->clist); - apicd->prev_vector = 0; - apicd->move_in_progress = 0; -} - static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) { struct apic_chip_data *apicd; @@ -965,7 +1043,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) lockdep_assert_held(&vector_lock); hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) { - unsigned int irr, vector = apicd->prev_vector; + unsigned int vector = apicd->prev_vector; /* * Paranoia: Check if the vector that needs to be cleaned @@ -979,8 +1057,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr) * fixup_irqs() was just called to scan IRR for set bits and * forward them to new destination CPUs via IPIs. */ - irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0; - if (irr & (1U << (vector % 32))) { + if (check_irr && is_vector_pending(vector)) { pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq); rearm = true; continue; @@ -1036,7 +1113,8 @@ static void __vector_schedule_cleanup(struct apic_chip_data *apicd) add_timer_on(&cl->timer, cpu); } } else { - apicd->prev_vector = 0; + pr_warn("IRQ %u schedule cleanup for offline CPU %u\n", apicd->irq, cpu); + free_moved_vector(apicd); } raw_spin_unlock(&vector_lock); } @@ -1068,97 +1146,6 @@ void irq_complete_move(struct irq_cfg *cfg) __vector_schedule_cleanup(apicd); } -/* - * Called from fixup_irqs() with @desc->lock held and interrupts disabled. - */ -void irq_force_complete_move(struct irq_desc *desc) -{ - struct apic_chip_data *apicd; - struct irq_data *irqd; - unsigned int vector; - - /* - * The function is called for all descriptors regardless of which - * irqdomain they belong to. For example if an IRQ is provided by - * an irq_chip as part of a GPIO driver, the chip data for that - * descriptor is specific to the irq_chip in question. - * - * Check first that the chip_data is what we expect - * (apic_chip_data) before touching it any further. - */ - irqd = irq_domain_get_irq_data(x86_vector_domain, - irq_desc_get_irq(desc)); - if (!irqd) - return; - - raw_spin_lock(&vector_lock); - apicd = apic_chip_data(irqd); - if (!apicd) - goto unlock; - - /* - * If prev_vector is empty, no action required. - */ - vector = apicd->prev_vector; - if (!vector) - goto unlock; - - /* - * This is tricky. If the cleanup of the old vector has not been - * done yet, then the following setaffinity call will fail with - * -EBUSY. This can leave the interrupt in a stale state. - * - * All CPUs are stuck in stop machine with interrupts disabled so - * calling __irq_complete_move() would be completely pointless. - * - * 1) The interrupt is in move_in_progress state. That means that we - * have not seen an interrupt since the io_apic was reprogrammed to - * the new vector. - * - * 2) The interrupt has fired on the new vector, but the cleanup IPIs - * have not been processed yet. - */ - if (apicd->move_in_progress) { - /* - * In theory there is a race: - * - * set_ioapic(new_vector) <-- Interrupt is raised before update - * is effective, i.e. it's raised on - * the old vector. - * - * So if the target cpu cannot handle that interrupt before - * the old vector is cleaned up, we get a spurious interrupt - * and in the worst case the ioapic irq line becomes stale. - * - * But in case of cpu hotplug this should be a non issue - * because if the affinity update happens right before all - * cpus rendezvous in stop machine, there is no way that the - * interrupt can be blocked on the target cpu because all cpus - * loops first with interrupts enabled in stop machine, so the - * old vector is not yet cleaned up when the interrupt fires. - * - * So the only way to run into this issue is if the delivery - * of the interrupt on the apic/system bus would be delayed - * beyond the point where the target cpu disables interrupts - * in stop machine. I doubt that it can happen, but at least - * there is a theoretical chance. Virtualization might be - * able to expose this, but AFAICT the IOAPIC emulation is not - * as stupid as the real hardware. - * - * Anyway, there is nothing we can do about that at this point - * w/o refactoring the whole fixup_irq() business completely. - * We print at least the irq number and the old vector number, - * so we have the necessary information when a problem in that - * area arises. - */ - pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n", - irqd->irq, vector); - } - free_moved_vector(apicd); -unlock: - raw_spin_unlock(&vector_lock); -} - #ifdef CONFIG_HOTPLUG_CPU /* * Note, this is not accurate accounting, but at least good enough to diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 567dbd2fe4b6..7db83212effb 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -178,13 +178,16 @@ static int x2apic_prepare_cpu(unsigned int cpu) u32 phys_apicid = apic->cpu_present_to_apicid(cpu); u32 cluster = apic_cluster(phys_apicid); u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf)); + int node = cpu_to_node(cpu); x86_cpu_to_logical_apicid[cpu] = logical_apicid; - if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0) + if (alloc_clustermask(cpu, cluster, node) < 0) return -ENOMEM; - if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) + + if (!zalloc_cpumask_var_node(&per_cpu(ipi_mask, cpu), GFP_KERNEL, node)) return -ENOMEM; + return 0; } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 7fef504ca508..15209f220e1f 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -667,7 +667,7 @@ static __init void build_uv_gr_table(void) } } -static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip) +static int uv_wakeup_secondary(u32 phys_apicid, unsigned long start_rip, unsigned int cpu) { unsigned long val; int pnode; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index a98020bf31bb..6259b474073b 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -33,6 +33,14 @@ static void __used common(void) { + OFFSET(CPUINFO_x86, cpuinfo_x86, x86); + OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); + OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); + OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping); + OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); + OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); + OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); + BLANK(); OFFSET(TASK_threadsp, task_struct, thread.sp); #ifdef CONFIG_STACKPROTECTOR @@ -107,11 +115,6 @@ static void __used common(void) OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); - OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack); - OFFSET(X86_current_task, pcpu_hot, current_task); -#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING - OFFSET(X86_call_depth, pcpu_hot, call_depth); -#endif #if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64) /* Offset for fields in aria_ctx */ BLANK(); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 2b411cd00a4e..e0a292db97b2 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -12,15 +12,6 @@ void foo(void); void foo(void) { - OFFSET(CPUINFO_x86, cpuinfo_x86, x86); - OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); - OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); - OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping); - OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); - OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); - OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index bb65371ea9df..590b6cd0eac0 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -54,11 +54,5 @@ int main(void) BLANK(); #undef ENTRY - BLANK(); - -#ifdef CONFIG_STACKPROTECTOR - OFFSET(FIXED_stack_canary, fixed_percpu_data, stack_canary); - BLANK(); -#endif return 0; } diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c index 3fed7ae58b60..73274d76ce16 100644 --- a/arch/x86/kernel/bootflag.c +++ b/arch/x86/kernel/bootflag.c @@ -8,6 +8,7 @@ #include <linux/string.h> #include <linux/spinlock.h> #include <linux/acpi.h> +#include <linux/bitops.h> #include <asm/io.h> #include <linux/mc146818rtc.h> @@ -20,27 +21,13 @@ int sbf_port __initdata = -1; /* set via acpi_boot_init() */ -static int __init parity(u8 v) -{ - int x = 0; - int i; - - for (i = 0; i < 8; i++) { - x ^= (v & 1); - v >>= 1; - } - - return x; -} - static void __init sbf_write(u8 v) { unsigned long flags; if (sbf_port != -1) { - v &= ~SBF_PARITY; - if (!parity(v)) - v |= SBF_PARITY; + if (!parity8(v)) + v ^= SBF_PARITY; printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v); @@ -66,14 +53,14 @@ static u8 __init sbf_read(void) return v; } -static int __init sbf_value_valid(u8 v) +static bool __init sbf_value_valid(u8 v) { if (v & SBF_RESERVED) /* Reserved bits */ - return 0; - if (!parity(v)) - return 0; + return false; + if (!parity8(v)) + return false; - return 1; + return true; } static int __init sbf_init(void) diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 30335182b6b0..a951333c5995 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -98,11 +98,10 @@ static inline bool within_module_coretext(void *addr) #ifdef CONFIG_MODULES struct module *mod; - preempt_disable(); + guard(rcu)(); mod = __module_address((unsigned long)addr); if (mod && within_module_core((unsigned long)addr, mod)) ret = true; - preempt_enable(); #endif return ret; } @@ -139,14 +138,15 @@ static bool skip_addr(void *dest) return true; #endif #ifdef CONFIG_KEXEC_CORE +# ifdef CONFIG_X86_64 + if (dest >= (void *)__relocate_kernel_start && + dest < (void *)__relocate_kernel_end) + return true; +# else if (dest >= (void *)relocate_kernel && dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE) return true; -#endif -#ifdef CONFIG_XEN - if (dest >= (void *)hypercall_page && - dest < (void*)hypercall_page + PAGE_SIZE) - return true; +# endif #endif return false; } @@ -185,8 +185,7 @@ static void *patch_dest(void *dest, bool direct) u8 *pad = dest - tsize; memcpy(insn_buff, skl_call_thunk_template, tsize); - apply_relocation(insn_buff, tsize, pad, - skl_call_thunk_template, tsize); + text_poke_apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); /* Already patched? */ if (!bcmp(pad, insn_buff, tsize)) @@ -240,21 +239,10 @@ patch_call_sites(s32 *start, s32 *end, const struct core_text *ct) } static __init_or_module void -patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end, - const struct core_text *ct) -{ - struct alt_instr *a; - - for (a = start; a < end; a++) - patch_call((void *)&a->instr_offset + a->instr_offset, ct); -} - -static __init_or_module void callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct) { prdbg("Patching call sites %s\n", ct->name); patch_call_sites(cs->call_start, cs->call_end, ct); - patch_alt_call_sites(cs->alt_start, cs->alt_end, ct); prdbg("Patching call sites done%s\n", ct->name); } @@ -263,8 +251,6 @@ void __init callthunks_patch_builtin_calls(void) struct callthunk_sites cs = { .call_start = __call_sites, .call_end = __call_sites_end, - .alt_start = __alt_instructions, - .alt_end = __alt_instructions_end }; if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) @@ -308,13 +294,12 @@ static bool is_callthunk(void *addr) pad = (void *)(dest - tmpl_size); memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, tmpl_size, pad, - skl_call_thunk_template, tmpl_size); + text_poke_apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); return !bcmp(pad, insn_buff, tmpl_size); } -int x86_call_depth_emit_accounting(u8 **pprog, void *func) +int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) { unsigned int tmpl_size = SKL_TMPL_SIZE; u8 insn_buff[MAX_PATCH_LEN]; @@ -327,8 +312,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func) return 0; memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, tmpl_size, *pprog, - skl_call_thunk_template, tmpl_size); + text_poke_apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); memcpy(*pprog, insn_buff, tmpl_size); *pprog += tmpl_size; diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c index d2c732a34e5d..99444409c026 100644 --- a/arch/x86/kernel/cet.c +++ b/arch/x86/kernel/cet.c @@ -2,6 +2,7 @@ #include <linux/ptrace.h> #include <asm/bugs.h> +#include <asm/msr.h> #include <asm/traps.h> enum cp_error_code { @@ -55,7 +56,7 @@ static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) * will be whatever is live in userspace. So read the SSP before enabling * interrupts so locking the fpregs to do it later is not required. */ - rdmsrl(MSR_IA32_PL3_SSP, ssp); + rdmsrq(MSR_IA32_PL3_SSP, ssp); cond_local_irq_enable(regs); @@ -81,6 +82,34 @@ static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) static __ro_after_init bool ibt_fatal = true; +/* + * By definition, all missing-ENDBRANCH #CPs are a result of WFE && !ENDBR. + * + * For the kernel IBT no ENDBR selftest where #CPs are deliberately triggered, + * the WFE state of the interrupted context needs to be cleared to let execution + * continue. Otherwise when the CPU resumes from the instruction that just + * caused the previous #CP, another missing-ENDBRANCH #CP is raised and the CPU + * enters a dead loop. + * + * This is not a problem with IDT because it doesn't preserve WFE and IRET doesn't + * set WFE. But FRED provides space on the entry stack (in an expanded CS area) + * to save and restore the WFE state, thus the WFE state is no longer clobbered, + * so software must clear it. + */ +static void ibt_clear_fred_wfe(struct pt_regs *regs) +{ + /* + * No need to do any FRED checks. + * + * For IDT event delivery, the high-order 48 bits of CS are pushed + * as 0s into the stack, and later IRET ignores these bits. + * + * For FRED, a test to check if fred_cs.wfe is set would be dropped + * by compilers. + */ + regs->fred_cs.wfe = 0; +} + static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) { if ((error_code & CP_EC) != CP_ENDBR) { @@ -90,6 +119,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) { regs->ax = 0; + ibt_clear_fred_wfe(regs); return; } @@ -97,6 +127,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code) if (!ibt_fatal) { printk(KERN_DEFAULT CUT_HERE); __warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL); + ibt_clear_fred_wfe(regs); return; } BUG(); diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c index e6bf78fac146..77086cf565ec 100644 --- a/arch/x86/kernel/cfi.c +++ b/arch/x86/kernel/cfi.c @@ -67,16 +67,30 @@ static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target, */ enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) { - unsigned long target; + unsigned long target, addr = regs->ip; u32 type; - if (!is_cfi_trap(regs->ip)) - return BUG_TRAP_TYPE_NONE; + switch (cfi_mode) { + case CFI_KCFI: + if (!is_cfi_trap(addr)) + return BUG_TRAP_TYPE_NONE; + + if (!decode_cfi_insn(regs, &target, &type)) + return report_cfi_failure_noaddr(regs, addr); + + break; - if (!decode_cfi_insn(regs, &target, &type)) - return report_cfi_failure_noaddr(regs, regs->ip); + case CFI_FINEIBT: + if (!decode_fineibt_insn(regs, &target, &type)) + return BUG_TRAP_TYPE_NONE; + + break; + + default: + return BUG_TRAP_TYPE_NONE; + } - return report_cfi_failure(regs, regs->ip, &target, type); + return report_cfi_failure(regs, addr, &target, type); } /* diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index eb4dbcdf41f1..1e26179ff18c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -24,7 +24,7 @@ obj-y += rdrand.o obj-y += match.o obj-y += bugs.o obj-y += aperfmperf.o -obj-y += cpuid-deps.o +obj-y += cpuid-deps.o cpuid_0x2_table.o obj-y += umwait.o obj-y += capflags.o powerflags.o @@ -34,10 +34,13 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o ifdef CONFIG_CPU_SUP_INTEL -obj-y += intel.o intel_pconfig.o tsx.o +obj-y += intel.o tsx.o obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o +ifeq ($(CONFIG_AMD_NB)$(CONFIG_SYSFS),yy) +obj-y += amd_cache_disable.o +endif obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o @@ -59,8 +62,10 @@ obj-$(CONFIG_ACRN_GUEST) += acrn.o obj-$(CONFIG_DEBUG_FS) += debugfs.o +obj-$(CONFIG_X86_BUS_LOCK_DETECT) += bus_lock.o + quiet_cmd_mkcapflags = MKCAP $@ - cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^ + cmd_mkcapflags = $(CONFIG_SHELL) $(src)/mkcapflags.sh $@ $^ cpufeature = $(src)/../../include/asm/cpufeatures.h vmxfeature = $(src)/../../include/asm/vmxfeatures.h diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6d8677e80ddb..b2ad8d13211a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -9,10 +9,12 @@ #include <linux/sched/clock.h> #include <linux/random.h> #include <linux/topology.h> +#include <asm/amd/fch.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cacheinfo.h> #include <asm/cpu.h> +#include <asm/cpu_device_id.h> #include <asm/spec-ctrl.h> #include <asm/smp.h> #include <asm/numa.h> @@ -20,6 +22,7 @@ #include <asm/delay.h> #include <asm/debugreg.h> #include <asm/resctrl.h> +#include <asm/msr.h> #include <asm/sev.h> #ifdef CONFIG_X86_64 @@ -28,7 +31,9 @@ #include "cpu.h" -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +u16 invlpgb_count_max __ro_after_init = 1; + +static inline int rdmsrq_amd_safe(unsigned msr, u64 *p) { u32 gprs[8] = { 0 }; int err; @@ -46,7 +51,7 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) return err; } -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +static inline int wrmsrq_amd_safe(unsigned msr, u64 val) { u32 gprs[8] = { 0 }; @@ -345,6 +350,33 @@ static void srat_detect_node(struct cpuinfo_x86 *c) #endif } +static void bsp_determine_snp(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_ARCH_HAS_CC_PLATFORM + cc_vendor = CC_VENDOR_AMD; + + if (cpu_has(c, X86_FEATURE_SEV_SNP)) { + /* + * RMP table entry format is not architectural and is defined by the + * per-processor PPR. Restrict SNP support on the known CPU models + * for which the RMP table entry format is currently defined or for + * processors which support the architecturally defined RMPREAD + * instruction. + */ + if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && + (cpu_feature_enabled(X86_FEATURE_ZEN3) || + cpu_feature_enabled(X86_FEATURE_ZEN4) || + cpu_feature_enabled(X86_FEATURE_RMPREAD)) && + snp_probe_rmptable_info()) { + cc_platform_set(CC_ATTR_HOST_SEV_SNP); + } else { + setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); + cc_platform_clear(CC_ATTR_HOST_SEV_SNP); + } + } +#endif +} + static void bsp_init_amd(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { @@ -353,7 +385,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) (c->x86 == 0x10 && c->x86_model >= 0x2)) { u64 val; - rdmsrl(MSR_K7_HWCR, val); + rdmsrq(MSR_K7_HWCR, val); if (!(val & BIT(24))) pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } @@ -392,7 +424,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) * Try to cache the base value so further operations can * avoid RMW. If that faults, do not enable SSBD. */ - if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); setup_force_cpu_cap(X86_FEATURE_SSBD); x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; @@ -437,12 +469,16 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) case 0x1a: switch (c->x86_model) { - case 0x00 ... 0x0f: - case 0x20 ... 0x2f: + case 0x00 ... 0x2f: case 0x40 ... 0x4f: - case 0x70 ... 0x7f: + case 0x60 ... 0x7f: setup_force_cpu_cap(X86_FEATURE_ZEN5); break; + case 0x50 ... 0x5f: + case 0x90 ... 0xaf: + case 0xc0 ... 0xcf: + setup_force_cpu_cap(X86_FEATURE_ZEN6); + break; default: goto warn; } @@ -452,21 +488,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) break; } - if (cpu_has(c, X86_FEATURE_SEV_SNP)) { - /* - * RMP table entry format is not architectural and it can vary by processor - * and is defined by the per-processor PPR. Restrict SNP support on the - * known CPU model and family for which the RMP table entry format is - * currently defined for. - */ - if (!boot_cpu_has(X86_FEATURE_ZEN3) && - !boot_cpu_has(X86_FEATURE_ZEN4) && - !boot_cpu_has(X86_FEATURE_ZEN5)) - setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); - else if (!snp_probe_rmptable_info()) - setup_clear_cpu_cap(X86_FEATURE_SEV_SNP); - } - + bsp_determine_snp(c); return; warn: @@ -493,7 +515,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) */ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { /* Check if memory encryption is enabled */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) goto clear_all; @@ -510,7 +532,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) if (!sme_me_mask) setup_clear_cpu_cap(X86_FEATURE_SME); - rdmsrl(MSR_K7_HWCR, msr); + rdmsrq(MSR_K7_HWCR, msr); if (!(msr & MSR_K7_HWCR_SMMLOCK)) goto clear_sev; @@ -527,7 +549,6 @@ clear_sev: static void early_init_amd(struct cpuinfo_x86 *c) { - u64 value; u32 dummy; if (c->x86 >= 0xf) @@ -595,24 +616,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_detect_mem_encrypt(c); - /* Re-enable TopologyExtensions if switched off by BIOS */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x6f) && - !cpu_has(c, X86_FEATURE_TOPOEXT)) { - - if (msr_set_bit(0xc0011005, 54) > 0) { - rdmsrl(0xc0011005, value); - if (value & BIT_64(54)) { - set_cpu_cap(c, X86_FEATURE_TOPOEXT); - pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); - } - } - } - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); - else if (c->x86 >= 0x19 && !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { + else if (c->x86 >= 0x19 && !wrmsrq_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); setup_force_cpu_cap(X86_FEATURE_SBPB); } @@ -634,16 +641,16 @@ static void init_amd_k8(struct cpuinfo_x86 *c) * (model = 0x14) and later actually support it. * (AMD Erratum #110, docId: 25759). */ - if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) { clear_cpu_cap(c, X86_FEATURE_LAHF_LM); - if (!rdmsrl_amd_safe(0xc001100d, &value)) { + if (!rdmsrq_amd_safe(0xc001100d, &value)) { value &= ~BIT_64(32); - wrmsrl_amd_safe(0xc001100d, value); + wrmsrq_amd_safe(0xc001100d, value); } } if (!c->x86_model_id[0]) - strcpy(c->x86_model_id, "Hammer"); + strscpy(c->x86_model_id, "Hammer"); #ifdef CONFIG_SMP /* @@ -788,9 +795,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c) * Disable it on the affected CPUs. */ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) { - if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { + if (!rdmsrq_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { value |= 0x1E; - wrmsrl_safe(MSR_F15H_IC_CFG, value); + wrmsrq_safe(MSR_F15H_IC_CFG, value); } } @@ -802,6 +809,12 @@ static void init_amd_bd(struct cpuinfo_x86 *c) clear_rdrand_cpuid_bit(c); } +static const struct x86_cpu_id erratum_1386_microcode[] = { + X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e), + X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052), + {} +}; + static void fix_erratum_1386(struct cpuinfo_x86 *c) { /* @@ -811,7 +824,13 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c) * * Affected parts all have no supervisor XSAVE states, meaning that * the XSAVEC instruction (which works fine) is equivalent. + * + * Clear the feature flag only on microcode revisions which + * don't have the fix. */ + if (x86_match_min_microcode_rev(erratum_1386_microcode)) + return; + clear_cpu_cap(c, X86_FEATURE_XSAVES); } @@ -827,9 +846,9 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) * suppresses non-branch predictions. */ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { - if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { + if (!rdmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; - wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); + wrmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); } } #endif @@ -857,6 +876,16 @@ static void init_amd_zen1(struct cpuinfo_x86 *c) pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); setup_force_cpu_bug(X86_BUG_DIV0); + + /* + * Turn off the Instructions Retired free counter on machines that are + * susceptible to erratum #1054 "Instructions Retired Performance + * Counter May Be Inaccurate". + */ + if (c->x86_model < 0x30) { + msr_clear_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); + clear_cpu_cap(c, X86_FEATURE_IRPERF); + } } static bool cpu_has_zenbleed_microcode(void) @@ -920,6 +949,17 @@ static void init_amd_zen4(struct cpuinfo_x86 *c) { if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); + + /* + * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE + * in some BIOS versions but they can lead to random host reboots. + */ + switch (c->x86_model) { + case 0x18 ... 0x1f: + case 0x60 ... 0x7f: + clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD); + break; + } } static void init_amd_zen5(struct cpuinfo_x86 *c) @@ -992,7 +1032,7 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); if (cpu_has(c, X86_FEATURE_SVM)) { - rdmsrl(MSR_VM_CR, vm_cr); + rdmsrq(MSR_VM_CR, vm_cr); if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); clear_cpu_cap(c, X86_FEATURE_SVM); @@ -1029,13 +1069,8 @@ static void init_amd(struct cpuinfo_x86 *c) if (!cpu_feature_enabled(X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); - /* - * Turn on the Instructions Retired free counter on machines not - * susceptible to erratum #1054 "Instructions Retired Performance - * Counter May Be Inaccurate". - */ - if (cpu_has(c, X86_FEATURE_IRPERF) && - (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f)) + /* Enable the Instructions Retired free counter */ + if (cpu_has(c, X86_FEATURE_IRPERF)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); @@ -1049,10 +1084,14 @@ static void init_amd(struct cpuinfo_x86 *c) */ if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && cpu_has(c, X86_FEATURE_AUTOIBRS)) - WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); + WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS) < 0); /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); + + /* Enable Translation Cache Extension */ + if (cpu_has(c, X86_FEATURE_TCE)) + msr_set_bit(MSR_EFER, _EFER_TCE); } #ifdef CONFIG_X86_32 @@ -1085,8 +1124,8 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; - tlb_lli_4k[ENTRIES] = ebx & mask; + tlb_lld_4k = (ebx >> 16) & mask; + tlb_lli_4k = ebx & mask; /* * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB @@ -1099,26 +1138,30 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!((eax >> 16) & mask)) - tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff; else - tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + tlb_lld_2m = (eax >> 16) & mask; /* a 4M entry uses two 2M entries */ - tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + tlb_lld_4m = tlb_lld_2m >> 1; /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!(eax & mask)) { /* Erratum 658 */ if (c->x86 == 0x15 && c->x86_model <= 0x1f) { - tlb_lli_2m[ENTRIES] = 1024; + tlb_lli_2m = 1024; } else { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); - tlb_lli_2m[ENTRIES] = eax & 0xff; + tlb_lli_2m = eax & 0xff; } } else - tlb_lli_2m[ENTRIES] = eax & mask; + tlb_lli_2m = eax & mask; - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + tlb_lli_4m = tlb_lli_2m >> 1; + + /* Max number of pages INVLPGB can invalidate in one shot */ + if (cpu_has(c, X86_FEATURE_INVLPGB)) + invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1; } static const struct cpu_dev amd_cpu_dev = { @@ -1170,7 +1213,7 @@ void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask) return; - wrmsr(amd_msr_dr_addr_masks[dr], mask, 0); + wrmsrq(amd_msr_dr_addr_masks[dr], mask); per_cpu(amd_dr_addr_mask, cpu)[dr] = mask; } @@ -1186,22 +1229,6 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr) } EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask); -u32 amd_get_highest_perf(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) || - (c->x86_model >= 0x70 && c->x86_model < 0x80))) - return 166; - - if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) || - (c->x86_model >= 0x40 && c->x86_model < 0x70))) - return 166; - - return 255; -} -EXPORT_SYMBOL_GPL(amd_get_highest_perf); - static void zenbleed_check_cpu(void *unused) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); @@ -1214,16 +1241,59 @@ void amd_check_microcode(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return; - on_each_cpu(zenbleed_check_cpu, NULL, 1); + if (cpu_feature_enabled(X86_FEATURE_ZEN2)) + on_each_cpu(zenbleed_check_cpu, NULL, 1); } -/* - * Issue a DIV 0/1 insn to clear any division data from previous DIV - * operations. - */ -void noinstr amd_clear_divider(void) +static const char * const s5_reset_reason_txt[] = { + [0] = "thermal pin BP_THERMTRIP_L was tripped", + [1] = "power button was pressed for 4 seconds", + [2] = "shutdown pin was tripped", + [4] = "remote ASF power off command was received", + [9] = "internal CPU thermal limit was tripped", + [16] = "system reset pin BP_SYS_RST_L was tripped", + [17] = "software issued PCI reset", + [18] = "software wrote 0x4 to reset control register 0xCF9", + [19] = "software wrote 0x6 to reset control register 0xCF9", + [20] = "software wrote 0xE to reset control register 0xCF9", + [21] = "ACPI power state transition occurred", + [22] = "keyboard reset pin KB_RST_L was tripped", + [23] = "internal CPU shutdown event occurred", + [24] = "system failed to boot before failed boot timer expired", + [25] = "hardware watchdog timer expired", + [26] = "remote ASF reset command was received", + [27] = "an uncorrected error caused a data fabric sync flood event", + [29] = "FCH and MP1 failed warm reset handshake", + [30] = "a parity error occurred", + [31] = "a software sync flood event occurred", +}; + +static __init int print_s5_reset_status_mmio(void) { - asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0) - :: "a" (0), "d" (0), "r" (1)); + unsigned long value; + void __iomem *addr; + int i; + + if (!cpu_feature_enabled(X86_FEATURE_ZEN)) + return 0; + + addr = ioremap(FCH_PM_BASE + FCH_PM_S5_RESET_STATUS, sizeof(value)); + if (!addr) + return 0; + + value = ioread32(addr); + iounmap(addr); + + for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) { + if (!(value & BIT(i))) + continue; + + if (s5_reset_reason_txt[i]) { + pr_info("x86/amd: Previous system reset reason [0x%08lx]: %s\n", + value, s5_reset_reason_txt[i]); + } + } + + return 0; } -EXPORT_SYMBOL_GPL(amd_clear_divider); +late_initcall(print_s5_reset_status_mmio); diff --git a/arch/x86/kernel/cpu/amd_cache_disable.c b/arch/x86/kernel/cpu/amd_cache_disable.c new file mode 100644 index 000000000000..8843b9557aea --- /dev/null +++ b/arch/x86/kernel/cpu/amd_cache_disable.c @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD L3 cache_disable_{0,1} sysfs handling + * Documentation/ABI/testing/sysfs-devices-system-cpu + */ + +#include <linux/cacheinfo.h> +#include <linux/capability.h> +#include <linux/pci.h> +#include <linux/sysfs.h> + +#include <asm/amd/nb.h> + +#include "cpu.h" + +/* + * L3 cache descriptors + */ +static void amd_calc_l3_indices(struct amd_northbridge *nb) +{ + struct amd_l3_cache *l3 = &nb->l3_cache; + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; + + pci_read_config_dword(nb->misc, 0x1C4, &val); + + /* calculate subcache sizes */ + l3->subcaches[0] = sc0 = !(val & BIT(0)); + l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); + l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; +} + +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned int slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + +static ssize_t show_cache_disable(struct cacheinfo *ci, char *buf, unsigned int slot) +{ + int index; + struct amd_northbridge *nb = ci->priv; + + index = amd_get_l3_disable_slot(nb, slot); + if (index >= 0) + return sysfs_emit(buf, "%d\n", index); + + return sysfs_emit(buf, "FREE\n"); +} + +#define SHOW_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct cacheinfo *ci = dev_get_drvdata(dev); \ + return show_cache_disable(ci, buf, slot); \ +} + +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, + unsigned int slot, unsigned long idx) +{ + int i; + + idx |= BIT(30); + + /* + * disable index in all 4 subcaches + */ + for (i = 0; i < 4; i++) { + u32 reg = idx | (i << 20); + + if (!nb->l3_cache.subcaches[i]) + continue; + + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + + /* + * We need to WBINVD on a core on the node containing the L3 + * cache which indices we disable therefore a simple wbinvd() + * is not sufficient. + */ + wbinvd_on_cpu(cpu); + + reg |= BIT(31); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + } +} + +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, + unsigned int slot, unsigned long index) +{ + int ret = 0; + + /* check if @slot is already used or the index is already disabled */ + ret = amd_get_l3_disable_slot(nb, slot); + if (ret >= 0) + return -EEXIST; + + if (index > nb->l3_cache.indices) + return -EINVAL; + + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(nb, !slot)) + return -EEXIST; + + amd_l3_disable_index(nb, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct cacheinfo *ci, const char *buf, + size_t count, unsigned int slot) +{ + struct amd_northbridge *nb = ci->priv; + unsigned long val = 0; + int cpu, err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cpu = cpumask_first(&ci->shared_cpu_map); + + if (kstrtoul(buf, 10, &val) < 0) + return -EINVAL; + + err = amd_set_l3_disable_slot(nb, cpu, slot, val); + if (err) { + if (err == -EEXIST) + pr_warn("L3 slot %d in use/index already disabled!\n", + slot); + return err; + } + return count; +} + +#define STORE_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + struct cacheinfo *ci = dev_get_drvdata(dev); \ + return store_cache_disable(ci, buf, count, slot); \ +} + +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static ssize_t subcaches_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cacheinfo *ci = dev_get_drvdata(dev); + int cpu = cpumask_first(&ci->shared_cpu_map); + + return sysfs_emit(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t subcaches_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cacheinfo *ci = dev_get_drvdata(dev); + int cpu = cpumask_first(&ci->shared_cpu_map); + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (kstrtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static DEVICE_ATTR_RW(cache_disable_0); +static DEVICE_ATTR_RW(cache_disable_1); +static DEVICE_ATTR_RW(subcaches); + +static umode_t cache_private_attrs_is_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct device *dev = kobj_to_dev(kobj); + struct cacheinfo *ci = dev_get_drvdata(dev); + umode_t mode = attr->mode; + + if (!ci->priv) + return 0; + + if ((attr == &dev_attr_subcaches.attr) && + amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return mode; + + if ((attr == &dev_attr_cache_disable_0.attr || + attr == &dev_attr_cache_disable_1.attr) && + amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return mode; + + return 0; +} + +static struct attribute_group cache_private_group = { + .is_visible = cache_private_attrs_is_visible, +}; + +static void init_amd_l3_attrs(void) +{ + static struct attribute **amd_l3_attrs; + int n = 1; + + if (amd_l3_attrs) /* already initialized */ + return; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); + if (!amd_l3_attrs) + return; + + n = 0; + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; + amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; + } + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + amd_l3_attrs[n++] = &dev_attr_subcaches.attr; + + cache_private_group.attrs = amd_l3_attrs; +} + +const struct attribute_group *cache_get_priv_group(struct cacheinfo *ci) +{ + struct amd_northbridge *nb = ci->priv; + + if (ci->level < 3 || !nb) + return NULL; + + if (nb && nb->l3_cache.indices) + init_amd_l3_attrs(); + + return &cache_private_group; +} + +struct amd_northbridge *amd_init_l3_cache(int index) +{ + struct amd_northbridge *nb; + int node; + + /* only for L3, and not in virtualized environments */ + if (index < 3) + return NULL; + + node = topology_amd_node_id(smp_processor_id()); + nb = node_to_amd_nb(node); + if (nb && !nb->l3_cache.indices) + amd_calc_l3_indices(nb); + + return nb; +} diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index fdbb5f07448f..a315b0627dfb 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -20,6 +20,7 @@ #include <asm/cpu.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/msr.h> #include "cpu.h" @@ -40,8 +41,8 @@ static void init_counter_refs(void) { u64 aperf, mperf; - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); + rdmsrq(MSR_IA32_APERF, aperf); + rdmsrq(MSR_IA32_MPERF, mperf); this_cpu_write(cpu_samples.aperf, aperf); this_cpu_write(cpu_samples.mperf, mperf); @@ -99,7 +100,7 @@ static bool __init turbo_disabled(void) u64 misc_en; int err; - err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en); if (err) return false; @@ -110,11 +111,11 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { int err; - err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq); if (err) return false; - err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); if (err) return false; @@ -124,25 +125,24 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) return true; } -#define X86_MATCH(model) \ - X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ - INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) +#define X86_MATCH(vfm) \ + X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL) static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { - X86_MATCH(XEON_PHI_KNL), - X86_MATCH(XEON_PHI_KNM), + X86_MATCH(INTEL_XEON_PHI_KNL), + X86_MATCH(INTEL_XEON_PHI_KNM), {} }; static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { - X86_MATCH(SKYLAKE_X), + X86_MATCH(INTEL_SKYLAKE_X), {} }; static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { - X86_MATCH(ATOM_GOLDMONT), - X86_MATCH(ATOM_GOLDMONT_D), - X86_MATCH(ATOM_GOLDMONT_PLUS), + X86_MATCH(INTEL_ATOM_GOLDMONT), + X86_MATCH(INTEL_ATOM_GOLDMONT_D), + X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS), {} }; @@ -153,13 +153,13 @@ static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int err, i; u64 msr; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; @@ -191,17 +191,17 @@ static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int s u32 group_size; int err, i; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios); if (err) return false; - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts); if (err) return false; @@ -221,11 +221,11 @@ static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) u64 msr; int err; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; @@ -307,7 +307,7 @@ static void freq_invariance_enable(void) WARN_ON_ONCE(1); return; } - static_branch_enable(&arch_scale_freq_key); + static_branch_enable_cpuslocked(&arch_scale_freq_key); register_freq_invariance_syscore_ops(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); } @@ -324,8 +324,10 @@ static void __init bp_init_freq_invariance(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; - if (intel_set_max_freq_ratio()) + if (intel_set_max_freq_ratio()) { + guard(cpus_read_lock)(); freq_invariance_enable(); + } } static void disable_freq_invariance_workfn(struct work_struct *work) @@ -346,10 +348,91 @@ static DECLARE_WORK(disable_freq_invariance_work, disable_freq_invariance_workfn); DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; +EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale); + +static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key); + +struct arch_hybrid_cpu_scale { + unsigned long capacity; + unsigned long freq_ratio; +}; + +static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale; + +/** + * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling + * + * Allocate memory for per-CPU data used by hybrid CPU capacity scaling, + * initialize it and set the static key controlling its code paths. + * + * Must be called before arch_set_cpu_capacity(). + */ +bool arch_enable_hybrid_capacity_scale(void) +{ + int cpu; + + if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) { + WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled"); + return true; + } + + arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale); + if (!arch_cpu_scale) + return false; + + for_each_possible_cpu(cpu) { + per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE; + per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio; + } + + static_branch_enable(&arch_hybrid_cap_scale_key); + + pr_info("Hybrid CPU capacity scaling enabled\n"); + + return true; +} + +/** + * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU + * @cpu: Target CPU. + * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap. + * @max_cap: System-wide maximum CPU capacity. + * @cap_freq: Frequency of @cpu corresponding to @cap. + * @base_freq: Frequency of @cpu at which MPERF counts. + * + * The units in which @cap and @max_cap are expressed do not matter, so long + * as they are consistent, because the former is effectively divided by the + * latter. Analogously for @cap_freq and @base_freq. + * + * After calling this function for all CPUs, call arch_rebuild_sched_domains() + * to let the scheduler know that capacity-aware scheduling can be used going + * forward. + */ +void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap, + unsigned long cap_freq, unsigned long base_freq) +{ + if (static_branch_likely(&arch_hybrid_cap_scale_key)) { + WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity, + div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap)); + WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio, + div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq)); + } else { + WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled"); + } +} + +unsigned long arch_scale_cpu_capacity(int cpu) +{ + if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) + return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity); + + return SCHED_CAPACITY_SCALE; +} +EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity); static void scale_freq_tick(u64 acnt, u64 mcnt) { - u64 freq_scale; + u64 freq_scale, freq_ratio; if (!arch_scale_freq_invariant()) return; @@ -357,7 +440,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt) if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; - if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) + freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio); + else + freq_ratio = arch_max_freq_ratio; + + if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt) goto error; freq_scale = div64_u64(acnt, mcnt); @@ -387,8 +475,8 @@ void arch_scale_freq_tick(void) if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); + rdmsrq(MSR_IA32_APERF, aperf); + rdmsrq(MSR_IA32_MPERF, mperf); acnt = aperf - s->aperf; mcnt = mperf - s->mperf; @@ -411,7 +499,7 @@ void arch_scale_freq_tick(void) */ #define MAX_SAMPLE_AGE ((unsigned long)HZ / 50) -unsigned int arch_freq_get_on_cpu(int cpu) +int arch_freq_get_on_cpu(int cpu) { struct aperfmperf *s = per_cpu_ptr(&cpu_samples, cpu); unsigned int seq, freq; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e7ba936d798b..7f94e6a5497d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -26,7 +26,7 @@ #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/e820/api.h> #include <asm/hypervisor.h> #include <asm/tlbflush.h> @@ -34,21 +34,66 @@ #include "cpu.h" +/* + * Speculation Vulnerability Handling + * + * Each vulnerability is handled with the following functions: + * <vuln>_select_mitigation() -- Selects a mitigation to use. This should + * take into account all relevant command line + * options. + * <vuln>_update_mitigation() -- This is called after all vulnerabilities have + * selected a mitigation, in case the selection + * may want to change based on other choices + * made. This function is optional. + * <vuln>_apply_mitigation() -- Enable the selected mitigation. + * + * The compile-time mitigation in all cases should be AUTO. An explicit + * command-line option can override AUTO. If no such option is + * provided, <vuln>_select_mitigation() will override AUTO to the best + * mitigation option. + */ + static void __init spectre_v1_select_mitigation(void); +static void __init spectre_v1_apply_mitigation(void); static void __init spectre_v2_select_mitigation(void); +static void __init spectre_v2_update_mitigation(void); +static void __init spectre_v2_apply_mitigation(void); static void __init retbleed_select_mitigation(void); +static void __init retbleed_update_mitigation(void); +static void __init retbleed_apply_mitigation(void); static void __init spectre_v2_user_select_mitigation(void); +static void __init spectre_v2_user_update_mitigation(void); +static void __init spectre_v2_user_apply_mitigation(void); static void __init ssb_select_mitigation(void); +static void __init ssb_apply_mitigation(void); static void __init l1tf_select_mitigation(void); +static void __init l1tf_apply_mitigation(void); static void __init mds_select_mitigation(void); -static void __init md_clear_update_mitigation(void); -static void __init md_clear_select_mitigation(void); +static void __init mds_update_mitigation(void); +static void __init mds_apply_mitigation(void); static void __init taa_select_mitigation(void); +static void __init taa_update_mitigation(void); +static void __init taa_apply_mitigation(void); static void __init mmio_select_mitigation(void); +static void __init mmio_update_mitigation(void); +static void __init mmio_apply_mitigation(void); +static void __init rfds_select_mitigation(void); +static void __init rfds_update_mitigation(void); +static void __init rfds_apply_mitigation(void); static void __init srbds_select_mitigation(void); +static void __init srbds_apply_mitigation(void); static void __init l1d_flush_select_mitigation(void); static void __init srso_select_mitigation(void); +static void __init srso_update_mitigation(void); +static void __init srso_apply_mitigation(void); static void __init gds_select_mitigation(void); +static void __init gds_apply_mitigation(void); +static void __init bhi_select_mitigation(void); +static void __init bhi_update_mitigation(void); +static void __init bhi_apply_mitigation(void); +static void __init its_select_mitigation(void); +static void __init its_update_mitigation(void); +static void __init its_apply_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -59,17 +104,26 @@ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; -EXPORT_SYMBOL_GPL(x86_pred_cmd); + +static u64 __ro_after_init x86_arch_cap_msr; static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; +static void __init set_return_thunk(void *thunk) +{ + if (x86_return_thunk != __x86_return_thunk) + pr_warn("x86/bugs: return thunk changed\n"); + + x86_return_thunk = thunk; +} + /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) { this_cpu_write(x86_spec_ctrl_current, val); - wrmsrl(MSR_IA32_SPEC_CTRL, val); + wrmsrq(MSR_IA32_SPEC_CTRL, val); } /* @@ -88,7 +142,7 @@ void update_spec_ctrl_cond(u64 val) * forced the update can be delayed until that time. */ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) - wrmsrl(MSR_IA32_SPEC_CTRL, val); + wrmsrq(MSR_IA32_SPEC_CTRL, val); } noinstr u64 spec_ctrl_current(void) @@ -111,6 +165,10 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); +/* Control IBPB on vCPU load */ +DEFINE_STATIC_KEY_FALSE(switch_vcpu_ibpb); +EXPORT_SYMBOL_GPL(switch_vcpu_ibpb); + /* Control MDS CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); @@ -122,9 +180,13 @@ EXPORT_SYMBOL_GPL(mds_idle_clear); */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ -DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); -EXPORT_SYMBOL_GPL(mmio_stale_data_clear); +/* + * Controls CPU Fill buffer clear before VMenter. This is a subset of + * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only + * mitigation is required. + */ +DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); +EXPORT_SYMBOL_GPL(cpu_buf_vm_clear); void __init cpu_select_mitigations(void) { @@ -134,7 +196,7 @@ void __init cpu_select_mitigations(void) * init code as it is not enumerated and depends on the family. */ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { - rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); /* * Previously running kernel (kexec), may have some controls @@ -144,33 +206,72 @@ void __init cpu_select_mitigations(void) x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; } + x86_arch_cap_msr = x86_read_arch_cap_msr(); + /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); - /* - * retbleed_select_mitigation() relies on the state set by - * spectre_v2_select_mitigation(); specifically it wants to know about - * spectre_v2=ibrs. - */ retbleed_select_mitigation(); - /* - * spectre_v2_user_select_mitigation() relies on the state set by - * retbleed_select_mitigation(); specifically the STIBP selection is - * forced for UNRET or IBPB. - */ spectre_v2_user_select_mitigation(); ssb_select_mitigation(); l1tf_select_mitigation(); - md_clear_select_mitigation(); + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + rfds_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); + srso_select_mitigation(); + gds_select_mitigation(); + its_select_mitigation(); + bhi_select_mitigation(); /* - * srso_select_mitigation() depends and must run after - * retbleed_select_mitigation(). + * After mitigations are selected, some may need to update their + * choices. */ - srso_select_mitigation(); - gds_select_mitigation(); + spectre_v2_update_mitigation(); + /* + * retbleed_update_mitigation() relies on the state set by + * spectre_v2_update_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_update_mitigation(); + /* + * its_update_mitigation() depends on spectre_v2_update_mitigation() + * and retbleed_update_mitigation(). + */ + its_update_mitigation(); + + /* + * spectre_v2_user_update_mitigation() depends on + * retbleed_update_mitigation(), specifically the STIBP + * selection is forced for UNRET or IBPB. + */ + spectre_v2_user_update_mitigation(); + mds_update_mitigation(); + taa_update_mitigation(); + mmio_update_mitigation(); + rfds_update_mitigation(); + bhi_update_mitigation(); + /* srso_update_mitigation() depends on retbleed_update_mitigation(). */ + srso_update_mitigation(); + + spectre_v1_apply_mitigation(); + spectre_v2_apply_mitigation(); + retbleed_apply_mitigation(); + spectre_v2_user_apply_mitigation(); + ssb_apply_mitigation(); + l1tf_apply_mitigation(); + mds_apply_mitigation(); + taa_apply_mitigation(); + mmio_apply_mitigation(); + rfds_apply_mitigation(); + srbds_apply_mitigation(); + srso_apply_mitigation(); + gds_apply_mitigation(); + its_apply_mitigation(); + bhi_apply_mitigation(); } /* @@ -220,16 +321,17 @@ static void x86_amd_ssb_disable(void) u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) - wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); + wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) - wrmsrl(MSR_AMD64_LS_CFG, msrval); + wrmsrq(MSR_AMD64_LS_CFG, msrval); } #undef pr_fmt #define pr_fmt(fmt) "MDS: " fmt /* Default mitigation for MDS-affected CPUs */ -static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; +static enum mds_mitigations mds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_AUTO : MDS_MITIGATION_OFF; static bool mds_nosmt __ro_after_init = false; static const char * const mds_strings[] = { @@ -238,6 +340,46 @@ static const char * const mds_strings[] = { [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", }; +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_AUTO, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_AUTO : TAA_MITIGATION_OFF; + +enum mmio_mitigations { + MMIO_MITIGATION_OFF, + MMIO_MITIGATION_AUTO, + MMIO_MITIGATION_UCODE_NEEDED, + MMIO_MITIGATION_VERW, +}; + +/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ +static enum mmio_mitigations mmio_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_AUTO : MMIO_MITIGATION_OFF; + +enum rfds_mitigations { + RFDS_MITIGATION_OFF, + RFDS_MITIGATION_AUTO, + RFDS_MITIGATION_VERW, + RFDS_MITIGATION_UCODE_NEEDED, +}; + +/* Default mitigation for Register File Data Sampling */ +static enum rfds_mitigations rfds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF; + +/* + * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing + * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry. + */ +static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init; + static void __init mds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { @@ -245,12 +387,37 @@ static void __init mds_select_mitigation(void) return; } + if (mds_mitigation == MDS_MITIGATION_AUTO) + mds_mitigation = MDS_MITIGATION_FULL; + + if (mds_mitigation == MDS_MITIGATION_OFF) + return; + + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init mds_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) + return; + + /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */ + if (verw_clear_cpu_buf_mitigation_selected) + mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_FULL) { if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; + } - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + pr_info("%s\n", mds_strings[mds_mitigation]); +} +static void __init mds_apply_mitigation(void) +{ + if (mds_mitigation == MDS_MITIGATION_FULL || + mds_mitigation == MDS_MITIGATION_VMWERV) { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); @@ -281,15 +448,6 @@ early_param("mds", mds_cmdline); #undef pr_fmt #define pr_fmt(fmt) "TAA: " fmt -enum taa_mitigations { - TAA_MITIGATION_OFF, - TAA_MITIGATION_UCODE_NEEDED, - TAA_MITIGATION_VERW, - TAA_MITIGATION_TSX_DISABLED, -}; - -/* Default mitigation for TAA-affected CPUs */ -static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; static bool taa_nosmt __ro_after_init; static const char * const taa_strings[] = { @@ -299,10 +457,13 @@ static const char * const taa_strings[] = { [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", }; -static void __init taa_select_mitigation(void) +static bool __init taa_vulnerable(void) { - u64 ia32_cap; + return boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM); +} +static void __init taa_select_mitigation(void) +{ if (!boot_cpu_has_bug(X86_BUG_TAA)) { taa_mitigation = TAA_MITIGATION_OFF; return; @@ -314,49 +475,63 @@ static void __init taa_select_mitigation(void) return; } - if (cpu_mitigations_off()) { + if (cpu_mitigations_off()) taa_mitigation = TAA_MITIGATION_OFF; - return; - } - /* - * TAA mitigation via VERW is turned off if both - * tsx_async_abort=off and mds=off are specified. - */ - if (taa_mitigation == TAA_MITIGATION_OFF && - mds_mitigation == MDS_MITIGATION_OFF) + /* Microcode will be checked in taa_update_mitigation(). */ + if (taa_mitigation == TAA_MITIGATION_AUTO) + taa_mitigation = TAA_MITIGATION_VERW; + + if (taa_mitigation != TAA_MITIGATION_OFF) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init taa_update_mitigation(void) +{ + if (!taa_vulnerable() || cpu_mitigations_off()) return; - if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + if (verw_clear_cpu_buf_mitigation_selected) taa_mitigation = TAA_MITIGATION_VERW; - else - taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; - /* - * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. - * A microcode update fixes this behavior to clear CPU buffers. It also - * adds support for MSR_IA32_TSX_CTRL which is enumerated by the - * ARCH_CAP_TSX_CTRL_MSR bit. - * - * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode - * update is required. - */ - ia32_cap = x86_read_arch_cap_msr(); - if ( (ia32_cap & ARCH_CAP_MDS_NO) && - !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) - taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + if (taa_mitigation == TAA_MITIGATION_VERW) { + /* Check if the requisite ucode is available. */ + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; - /* - * TSX is enabled, select alternate mitigation for TAA which is - * the same as MDS. Enable MDS static branch to clear CPU buffers. - * - * For guests that can't determine whether the correct microcode is - * present on host, enable the mitigation for UCODE_NEEDED as well. - */ - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && + !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + } - if (taa_nosmt || cpu_mitigations_auto_nosmt()) - cpu_smt_disable(false); + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static void __init taa_apply_mitigation(void) +{ + if (taa_mitigation == TAA_MITIGATION_VERW || + taa_mitigation == TAA_MITIGATION_UCODE_NEEDED) { + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + } } static int __init tsx_async_abort_parse_cmdline(char *str) @@ -383,14 +558,6 @@ early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "MMIO Stale Data: " fmt -enum mmio_mitigations { - MMIO_MITIGATION_OFF, - MMIO_MITIGATION_UCODE_NEEDED, - MMIO_MITIGATION_VERW, -}; - -/* Default mitigation for Processor MMIO Stale Data vulnerabilities */ -static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; static bool mmio_nosmt __ro_after_init = false; static const char * const mmio_strings[] = { @@ -401,60 +568,77 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { - u64 ia32_cap; - if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || - boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || cpu_mitigations_off()) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } + /* Microcode will be checked in mmio_update_mitigation(). */ + if (mmio_mitigation == MMIO_MITIGATION_AUTO) + mmio_mitigation = MMIO_MITIGATION_VERW; + if (mmio_mitigation == MMIO_MITIGATION_OFF) return; - ia32_cap = x86_read_arch_cap_msr(); - /* * Enable CPU buffer clear mitigation for host and VMM, if also affected - * by MDS or TAA. Otherwise, enable mitigation for VMM only. + * by MDS or TAA. */ - if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && - boot_cpu_has(X86_FEATURE_RTM))) - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + if (boot_cpu_has_bug(X86_BUG_MDS) || taa_vulnerable()) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init mmio_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || cpu_mitigations_off()) + return; + + if (verw_clear_cpu_buf_mitigation_selected) + mmio_mitigation = MMIO_MITIGATION_VERW; + + if (mmio_mitigation == MMIO_MITIGATION_VERW) { + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. + */ + if (!((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)))) + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + } + + pr_info("%s\n", mmio_strings[mmio_mitigation]); +} + +static void __init mmio_apply_mitigation(void) +{ + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; /* - * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based - * mitigations, disable KVM-only mitigation in that case. + * Only enable the VMM mitigation if the CPU buffer clear mitigation is + * not being used. */ - if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) - static_branch_disable(&mmio_stale_data_clear); - else - static_branch_enable(&mmio_stale_data_clear); + if (verw_clear_cpu_buf_mitigation_selected) { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + static_branch_disable(&cpu_buf_vm_clear); + } else { + static_branch_enable(&cpu_buf_vm_clear); + } /* * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can * be propagated to uncore buffers, clearing the Fill buffers on idle * is required irrespective of SMT state. */ - if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) + if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) static_branch_enable(&mds_idle_clear); - /* - * Check if the system has the right microcode. - * - * CPU Fill buffer clear mitigation is enumerated by either an explicit - * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS - * affected systems. - */ - if ((ia32_cap & ARCH_CAP_FB_CLEAR) || - (boot_cpu_has(X86_FEATURE_MD_CLEAR) && - boot_cpu_has(X86_FEATURE_FLUSH_L1D) && - !(ia32_cap & ARCH_CAP_MDS_NO))) - mmio_mitigation = MMIO_MITIGATION_VERW; - else - mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; - if (mmio_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); } @@ -483,35 +667,54 @@ early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "Register File Data Sampling: " fmt -enum rfds_mitigations { - RFDS_MITIGATION_OFF, - RFDS_MITIGATION_VERW, - RFDS_MITIGATION_UCODE_NEEDED, -}; - -/* Default mitigation for Register File Data Sampling */ -static enum rfds_mitigations rfds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF; - static const char * const rfds_strings[] = { [RFDS_MITIGATION_OFF] = "Vulnerable", [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File", [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", }; +static inline bool __init verw_clears_cpu_reg_file(void) +{ + return (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR); +} + static void __init rfds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) { rfds_mitigation = RFDS_MITIGATION_OFF; return; } + + if (rfds_mitigation == RFDS_MITIGATION_AUTO) + rfds_mitigation = RFDS_MITIGATION_VERW; + if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (x86_read_arch_cap_msr() & ARCH_CAP_RFDS_CLEAR) + if (verw_clears_cpu_reg_file()) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init rfds_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) + return; + + if (verw_clear_cpu_buf_mitigation_selected) + rfds_mitigation = RFDS_MITIGATION_VERW; + + if (rfds_mitigation == RFDS_MITIGATION_VERW) { + if (!verw_clears_cpu_reg_file()) + rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; + } + + pr_info("%s\n", rfds_strings[rfds_mitigation]); +} + +static void __init rfds_apply_mitigation(void) +{ + if (rfds_mitigation == RFDS_MITIGATION_VERW) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); - else - rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; } static __init int rfds_parse_cmdline(char *str) @@ -532,83 +735,19 @@ static __init int rfds_parse_cmdline(char *str) early_param("reg_file_data_sampling", rfds_parse_cmdline); #undef pr_fmt -#define pr_fmt(fmt) "" fmt - -static void __init md_clear_update_mitigation(void) -{ - if (cpu_mitigations_off()) - return; - - if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) - goto out; - - /* - * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO - * Stale Data mitigation, if necessary. - */ - if (mds_mitigation == MDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MDS)) { - mds_mitigation = MDS_MITIGATION_FULL; - mds_select_mitigation(); - } - if (taa_mitigation == TAA_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_TAA)) { - taa_mitigation = TAA_MITIGATION_VERW; - taa_select_mitigation(); - } - /* - * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear - * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state. - */ - if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { - mmio_mitigation = MMIO_MITIGATION_VERW; - mmio_select_mitigation(); - } - if (rfds_mitigation == RFDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_RFDS)) { - rfds_mitigation = RFDS_MITIGATION_VERW; - rfds_select_mitigation(); - } -out: - if (boot_cpu_has_bug(X86_BUG_MDS)) - pr_info("MDS: %s\n", mds_strings[mds_mitigation]); - if (boot_cpu_has_bug(X86_BUG_TAA)) - pr_info("TAA: %s\n", taa_strings[taa_mitigation]); - if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) - pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); - else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - pr_info("MMIO Stale Data: Unknown: No mitigations\n"); - if (boot_cpu_has_bug(X86_BUG_RFDS)) - pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]); -} - -static void __init md_clear_select_mitigation(void) -{ - mds_select_mitigation(); - taa_select_mitigation(); - mmio_select_mitigation(); - rfds_select_mitigation(); - - /* - * As these mitigations are inter-related and rely on VERW instruction - * to clear the microarchitural buffers, update and print their status - * after mitigation selection is done for each of these vulnerabilities. - */ - md_clear_update_mitigation(); -} - -#undef pr_fmt #define pr_fmt(fmt) "SRBDS: " fmt enum srbds_mitigations { SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_AUTO, SRBDS_MITIGATION_UCODE_NEEDED, SRBDS_MITIGATION_FULL, SRBDS_MITIGATION_TSX_OFF, SRBDS_MITIGATION_HYPERVISOR, }; -static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; +static enum srbds_mitigations srbds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_AUTO : SRBDS_MITIGATION_OFF; static const char * const srbds_strings[] = { [SRBDS_MITIGATION_OFF] = "Vulnerable", @@ -640,7 +779,7 @@ void update_srbds_msr(void) if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); switch (srbds_mitigation) { case SRBDS_MITIGATION_OFF: @@ -654,36 +793,42 @@ void update_srbds_msr(void) break; } - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); } static void __init srbds_select_mitigation(void) { - u64 ia32_cap; - - if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + if (!boot_cpu_has_bug(X86_BUG_SRBDS) || cpu_mitigations_off()) { + srbds_mitigation = SRBDS_MITIGATION_OFF; return; + } + + if (srbds_mitigation == SRBDS_MITIGATION_AUTO) + srbds_mitigation = SRBDS_MITIGATION_FULL; /* * Check to see if this is one of the MDS_NO systems supporting TSX that * are only exposed to SRBDS when TSX is enabled or when CPU is affected * by Processor MMIO Stale Data vulnerability. */ - ia32_cap = x86_read_arch_cap_msr(); - if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && + if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; - else if (cpu_mitigations_off() || srbds_off) + else if (srbds_off) srbds_mitigation = SRBDS_MITIGATION_OFF; - update_srbds_msr(); pr_info("%s\n", srbds_strings[srbds_mitigation]); } +static void __init srbds_apply_mitigation(void) +{ + update_srbds_msr(); +} + static int __init srbds_parse_cmdline(char *str) { if (!str) @@ -730,6 +875,7 @@ early_param("l1d_flush", l1d_flush_parse_cmdline); enum gds_mitigations { GDS_MITIGATION_OFF, + GDS_MITIGATION_AUTO, GDS_MITIGATION_UCODE_NEEDED, GDS_MITIGATION_FORCE, GDS_MITIGATION_FULL, @@ -737,11 +883,8 @@ enum gds_mitigations { GDS_MITIGATION_HYPERVISOR, }; -#if IS_ENABLED(CONFIG_MITIGATION_GDS_FORCE) -static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; -#else -static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; -#endif +static enum gds_mitigations gds_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_AUTO : GDS_MITIGATION_OFF; static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", @@ -766,7 +909,7 @@ void update_gds_msr(void) switch (gds_mitigation) { case GDS_MITIGATION_OFF: - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl |= GDS_MITG_DIS; break; case GDS_MITIGATION_FULL_LOCKED: @@ -776,23 +919,24 @@ void update_gds_msr(void) * CPUs. */ case GDS_MITIGATION_FULL: - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl &= ~GDS_MITG_DIS; break; case GDS_MITIGATION_FORCE: case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: + case GDS_MITIGATION_AUTO: return; } - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); /* * Check to make sure that the WRMSR value was not ignored. Writes to * GDS_MITG_DIS will be ignored if this processor is locked but the boot * processor was not. */ - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); } @@ -805,33 +949,28 @@ static void __init gds_select_mitigation(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { gds_mitigation = GDS_MITIGATION_HYPERVISOR; - goto out; + return; } if (cpu_mitigations_off()) gds_mitigation = GDS_MITIGATION_OFF; /* Will verify below that mitigation _can_ be disabled */ + if (gds_mitigation == GDS_MITIGATION_AUTO) + gds_mitigation = GDS_MITIGATION_FULL; + /* No microcode */ - if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { - if (gds_mitigation == GDS_MITIGATION_FORCE) { - /* - * This only needs to be done on the boot CPU so do it - * here rather than in update_gds_msr() - */ - setup_clear_cpu_cap(X86_FEATURE_AVX); - pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); - } else { + if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { + if (gds_mitigation != GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; - } - goto out; + return; } /* Microcode has mitigation, use it */ if (gds_mitigation == GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_FULL; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); if (mcu_ctrl & GDS_MITG_LOCKED) { if (gds_mitigation == GDS_MITIGATION_OFF) pr_warn("Mitigation locked. Disable failed.\n"); @@ -845,9 +984,25 @@ static void __init gds_select_mitigation(void) */ gds_mitigation = GDS_MITIGATION_FULL_LOCKED; } +} + +static void __init gds_apply_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return; + + /* Microcode is present */ + if (x86_arch_cap_msr & ARCH_CAP_GDS_CTRL) + update_gds_msr(); + else if (gds_mitigation == GDS_MITIGATION_FORCE) { + /* + * This only needs to be done on the boot CPU so do it + * here rather than in update_gds_msr() + */ + setup_clear_cpu_cap(X86_FEATURE_AVX); + pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); + } - update_gds_msr(); -out: pr_info("%s\n", gds_strings[gds_mitigation]); } @@ -877,7 +1032,8 @@ enum spectre_v1_mitigation { }; static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init = - SPECTRE_V1_MITIGATION_AUTO; + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V1) ? + SPECTRE_V1_MITIGATION_AUTO : SPECTRE_V1_MITIGATION_NONE; static const char * const spectre_v1_strings[] = { [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers", @@ -907,10 +1063,14 @@ static bool smap_works_speculatively(void) static void __init spectre_v1_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; +} + +static void __init spectre_v1_apply_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) return; - } if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { /* @@ -963,8 +1123,20 @@ enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; #undef pr_fmt #define pr_fmt(fmt) "RETBleed: " fmt +enum its_mitigation { + ITS_MITIGATION_OFF, + ITS_MITIGATION_AUTO, + ITS_MITIGATION_VMEXIT_ONLY, + ITS_MITIGATION_ALIGNED_THUNKS, + ITS_MITIGATION_RETPOLINE_STUFF, +}; + +static enum its_mitigation its_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_MITIGATION_AUTO : ITS_MITIGATION_OFF; + enum retbleed_mitigation { RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_AUTO, RETBLEED_MITIGATION_UNRET, RETBLEED_MITIGATION_IBPB, RETBLEED_MITIGATION_IBRS, @@ -972,14 +1144,6 @@ enum retbleed_mitigation { RETBLEED_MITIGATION_STUFF, }; -enum retbleed_mitigation_cmd { - RETBLEED_CMD_OFF, - RETBLEED_CMD_AUTO, - RETBLEED_CMD_UNRET, - RETBLEED_CMD_IBPB, - RETBLEED_CMD_STUFF, -}; - static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_NONE] = "Vulnerable", [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", @@ -990,9 +1154,7 @@ static const char * const retbleed_strings[] = { }; static enum retbleed_mitigation retbleed_mitigation __ro_after_init = - RETBLEED_MITIGATION_NONE; -static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = - RETBLEED_CMD_AUTO; + IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_MITIGATION_AUTO : RETBLEED_MITIGATION_NONE; static int __ro_after_init retbleed_nosmt = false; @@ -1009,15 +1171,15 @@ static int __init retbleed_parse_cmdline(char *str) } if (!strcmp(str, "off")) { - retbleed_cmd = RETBLEED_CMD_OFF; + retbleed_mitigation = RETBLEED_MITIGATION_NONE; } else if (!strcmp(str, "auto")) { - retbleed_cmd = RETBLEED_CMD_AUTO; + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } else if (!strcmp(str, "unret")) { - retbleed_cmd = RETBLEED_CMD_UNRET; + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; } else if (!strcmp(str, "ibpb")) { - retbleed_cmd = RETBLEED_CMD_IBPB; + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } else if (!strcmp(str, "stuff")) { - retbleed_cmd = RETBLEED_CMD_STUFF; + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; } else if (!strcmp(str, "force")) { @@ -1038,77 +1200,122 @@ early_param("retbleed", retbleed_parse_cmdline); static void __init retbleed_select_mitigation(void) { - bool mitigate_smt = false; - - if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) - return; - - switch (retbleed_cmd) { - case RETBLEED_CMD_OFF: + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) { + retbleed_mitigation = RETBLEED_MITIGATION_NONE; return; + } - case RETBLEED_CMD_UNRET: - if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - } else { + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_UNRET: + if (!IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n"); - goto do_cmd_auto; } break; - - case RETBLEED_CMD_IBPB: + case RETBLEED_MITIGATION_IBPB: if (!boot_cpu_has(X86_FEATURE_IBPB)) { pr_err("WARNING: CPU does not support IBPB.\n"); - goto do_cmd_auto; - } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; - } else { + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } else if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); - goto do_cmd_auto; + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } + break; + case RETBLEED_MITIGATION_STUFF: + if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) { + pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } else if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("WARNING: retbleed=stuff only supported for Intel CPUs.\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } break; + default: + break; + } - case RETBLEED_CMD_STUFF: - if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && - spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { - retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO) + return; + /* Intel mitigation selected in retbleed_update_mitigation() */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && + boot_cpu_has(X86_FEATURE_IBPB)) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + else + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } +} + +static void __init retbleed_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; + + if (retbleed_mitigation == RETBLEED_MITIGATION_NONE) + goto out; + + /* + * retbleed=stuff is only allowed on Intel. If stuffing can't be used + * then a different mitigation will be selected below. + * + * its=stuff will also attempt to enable stuffing. + */ + if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF || + its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) { + if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) { + pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } else { - if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) - pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); - else - pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_info("Retbleed mitigation updated to stuffing\n"); - goto do_cmd_auto; + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } - break; - -do_cmd_auto: - case RETBLEED_CMD_AUTO: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && - boot_cpu_has(X86_FEATURE_IBPB)) - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option except for call depth based stuffing + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_err(RETBLEED_INTEL_MSG); } + /* If nothing has set the mitigation yet, default to NONE. */ + if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO) + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } +out: + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} - /* - * The Intel mitigation (IBRS or eIBRS) was already selected in - * spectre_v2_select_mitigation(). 'retbleed_mitigation' will - * be set accordingly below. - */ - break; - } +static void __init retbleed_apply_mitigation(void) +{ + bool mitigate_smt = false; switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_NONE: + return; + case RETBLEED_MITIGATION_UNRET: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); - x86_return_thunk = retbleed_return_thunk; + set_return_thunk(retbleed_return_thunk); if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -1121,13 +1328,29 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); mitigate_smt = true; + + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like SRSO has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + + /* + * There is no need for RSB filling: write_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); + break; case RETBLEED_MITIGATION_STUFF: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); - x86_return_thunk = call_depth_return_thunk; + set_return_thunk(call_depth_return_thunk); break; default: @@ -1137,28 +1360,131 @@ do_cmd_auto: if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && (retbleed_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); +} + +#undef pr_fmt +#define pr_fmt(fmt) "ITS: " fmt + +static const char * const its_strings[] = { + [ITS_MITIGATION_OFF] = "Vulnerable", + [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected", + [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks", + [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB", +}; + +static int __init its_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) { + pr_err("Mitigation disabled at compile time, ignoring option (%s)", str); + return 0; + } + + if (!strcmp(str, "off")) { + its_mitigation = ITS_MITIGATION_OFF; + } else if (!strcmp(str, "on")) { + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + } else if (!strcmp(str, "force")) { + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + setup_force_cpu_bug(X86_BUG_ITS); + } else if (!strcmp(str, "vmexit")) { + its_mitigation = ITS_MITIGATION_VMEXIT_ONLY; + } else if (!strcmp(str, "stuff")) { + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + } else { + pr_err("Ignoring unknown indirect_target_selection option (%s).", str); + } + + return 0; +} +early_param("indirect_target_selection", its_parse_cmdline); + +static void __init its_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) { + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + if (its_mitigation == ITS_MITIGATION_AUTO) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + + if (its_mitigation == ITS_MITIGATION_OFF) + return; + + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || + !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) { + pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n"); + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) { + pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n"); + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF && + !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) { + pr_err("RSB stuff mitigation not supported, using default\n"); + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + } + + if (its_mitigation == ITS_MITIGATION_VMEXIT_ONLY && + !boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY)) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; +} + +static void __init its_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) + return; + + switch (spectre_v2_enabled) { + case SPECTRE_V2_NONE: + pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n"); + its_mitigation = ITS_MITIGATION_OFF; + break; + case SPECTRE_V2_RETPOLINE: + /* Retpoline+CDT mitigates ITS */ + if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF) + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + break; + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n"); + its_mitigation = ITS_MITIGATION_OFF; + break; + default: + break; + } /* - * Let IBRS trump all on Intel without affecting the effects of the - * retbleed= cmdline option except for call depth based stuffing + * retbleed_update_mitigation() will try to do stuffing if its=stuff. + * If it can't, such as if spectre_v2!=retpoline, then fall back to + * aligned thunks. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - switch (spectre_v2_enabled) { - case SPECTRE_V2_IBRS: - retbleed_mitigation = RETBLEED_MITIGATION_IBRS; - break; - case SPECTRE_V2_EIBRS: - case SPECTRE_V2_EIBRS_RETPOLINE: - case SPECTRE_V2_EIBRS_LFENCE: - retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; - break; - default: - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) - pr_err(RETBLEED_INTEL_MSG); - } - } + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF && + retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; - pr_info("%s\n", retbleed_strings[retbleed_mitigation]); + pr_info("%s\n", its_strings[its_mitigation]); +} + +static void __init its_apply_mitigation(void) +{ + /* its=stuff forces retbleed stuffing and is enabled there. */ + if (its_mitigation != ITS_MITIGATION_ALIGNED_THUNKS) + return; + + if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) + setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); + + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + set_return_thunk(its_return_thunk); } #undef pr_fmt @@ -1238,6 +1564,8 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_IBRS, }; +static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO; + enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, @@ -1276,22 +1604,13 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } -static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; - -static enum spectre_v2_user_cmd __init -spectre_v2_parse_user_cmdline(void) +static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) { char arg[20]; int ret, i; - switch (spectre_v2_cmd) { - case SPECTRE_V2_CMD_NONE: + if (cpu_mitigations_off() || !IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2)) return SPECTRE_V2_USER_CMD_NONE; - case SPECTRE_V2_CMD_FORCE: - return SPECTRE_V2_USER_CMD_FORCE; - default: - break; - } ret = cmdline_find_option(boot_command_line, "spectre_v2_user", arg, sizeof(arg)); @@ -1306,7 +1625,7 @@ spectre_v2_parse_user_cmdline(void) } } - pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); + pr_err("Unknown user space protection option (%s). Switching to default\n", arg); return SPECTRE_V2_USER_CMD_AUTO; } @@ -1315,65 +1634,72 @@ static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; } -static void __init -spectre_v2_user_select_mitigation(void) +static void __init spectre_v2_user_select_mitigation(void) { - enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; - bool smt_possible = IS_ENABLED(CONFIG_SMP); - enum spectre_v2_user_cmd cmd; - if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || - cpu_smt_control == CPU_SMT_NOT_SUPPORTED) - smt_possible = false; - - cmd = spectre_v2_parse_user_cmdline(); - switch (cmd) { + switch (spectre_v2_parse_user_cmdline()) { case SPECTRE_V2_USER_CMD_NONE: - goto set_mode; + return; case SPECTRE_V2_USER_CMD_FORCE: - mode = SPECTRE_V2_USER_STRICT; + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_PRCTL: + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; + break; case SPECTRE_V2_USER_CMD_PRCTL_IBPB: - mode = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; break; case SPECTRE_V2_USER_CMD_SECCOMP: + if (IS_ENABLED(CONFIG_SECCOMP)) + spectre_v2_user_ibpb = SPECTRE_V2_USER_SECCOMP; + else + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = spectre_v2_user_ibpb; + break; case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; if (IS_ENABLED(CONFIG_SECCOMP)) - mode = SPECTRE_V2_USER_SECCOMP; + spectre_v2_user_stibp = SPECTRE_V2_USER_SECCOMP; else - mode = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; break; } - /* Initialize Indirect Branch Prediction Barrier */ - if (boot_cpu_has(X86_FEATURE_IBPB)) { - setup_force_cpu_cap(X86_FEATURE_USE_IBPB); + /* + * At this point, an STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. + */ + if ((spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED; - spectre_v2_user_ibpb = mode; - switch (cmd) { - case SPECTRE_V2_USER_CMD_NONE: - break; - case SPECTRE_V2_USER_CMD_FORCE: - case SPECTRE_V2_USER_CMD_PRCTL_IBPB: - case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: - static_branch_enable(&switch_mm_always_ibpb); - spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; - break; - case SPECTRE_V2_USER_CMD_PRCTL: - case SPECTRE_V2_USER_CMD_AUTO: - case SPECTRE_V2_USER_CMD_SECCOMP: - static_branch_enable(&switch_mm_cond_ibpb); - break; - } + if (!boot_cpu_has(X86_FEATURE_IBPB)) + spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; - pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", - static_key_enabled(&switch_mm_always_ibpb) ? - "always-on" : "conditional"); + if (!boot_cpu_has(X86_FEATURE_STIBP)) + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; +} + +static void __init spectre_v2_user_update_mitigation(void) +{ + if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) + return; + + /* The spectre_v2 cmd line can override spectre_v2_user options */ + if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) { + spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; + } else if (spectre_v2_cmd == SPECTRE_V2_CMD_FORCE) { + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; } /* @@ -1389,32 +1715,46 @@ spectre_v2_user_select_mitigation(void) * so allow for STIBP to be selected in those cases. */ if (!boot_cpu_has(X86_FEATURE_STIBP) || - !smt_possible || + !cpu_smt_possible() || (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && - !boot_cpu_has(X86_FEATURE_AUTOIBRS))) + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) { + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; return; + } - /* - * At this point, an STIBP mode other than "off" has been set. - * If STIBP support is not being forced, check if STIBP always-on - * is preferred. - */ - if (mode != SPECTRE_V2_USER_STRICT && - boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) - mode = SPECTRE_V2_USER_STRICT_PREFERRED; - - if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || - retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - if (mode != SPECTRE_V2_USER_STRICT && - mode != SPECTRE_V2_USER_STRICT_PREFERRED) + if (spectre_v2_user_stibp != SPECTRE_V2_USER_NONE && + (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB)) { + if (spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT && + spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT_PREFERRED) pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); - mode = SPECTRE_V2_USER_STRICT_PREFERRED; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED; } + pr_info("%s\n", spectre_v2_user_strings[spectre_v2_user_stibp]); +} - spectre_v2_user_stibp = mode; +static void __init spectre_v2_user_apply_mitigation(void) +{ + /* Initialize Indirect Branch Prediction Barrier */ + if (spectre_v2_user_ibpb != SPECTRE_V2_USER_NONE) { + static_branch_enable(&switch_vcpu_ibpb); -set_mode: - pr_info("%s\n", spectre_v2_user_strings[mode]); + switch (spectre_v2_user_ibpb) { + case SPECTRE_V2_USER_STRICT: + static_branch_enable(&switch_mm_always_ibpb); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + static_branch_enable(&switch_mm_cond_ibpb); + break; + default: + break; + } + + pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", + static_key_enabled(&switch_mm_always_ibpb) ? + "always-on" : "conditional"); + } } static const char * const spectre_v2_strings[] = { @@ -1453,17 +1793,18 @@ static void __init spec_v2_print_cond(const char *reason, bool secure) static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) { - enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; + enum spectre_v2_mitigation_cmd cmd; char arg[20]; int ret, i; + cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE; if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); if (ret < 0) - return SPECTRE_V2_CMD_AUTO; + return cmd; for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { if (!match_option(arg, ret, mitigation_options[i].option)) @@ -1473,8 +1814,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to AUTO select\n", arg); - return SPECTRE_V2_CMD_AUTO; + pr_err("unknown option (%s). Switching to default mode\n", arg); + return cmd; } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || @@ -1544,140 +1885,248 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) return SPECTRE_V2_RETPOLINE; } +static bool __ro_after_init rrsba_disabled; + /* Disable in-kernel use of non-RSB RET predictors */ static void __init spec_ctrl_disable_kernel_rrsba(void) { - u64 ia32_cap; + if (rrsba_disabled) + return; - if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) { + rrsba_disabled = true; return; + } - ia32_cap = x86_read_arch_cap_msr(); + if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) + return; - if (ia32_cap & ARCH_CAP_RRSBA) { - x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; - update_spec_ctrl(x86_spec_ctrl_base); - } + x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + rrsba_disabled = true; } -static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode) { /* - * Similar to context switches, there are two types of RSB attacks - * after VM exit: + * WARNING! There are many subtleties to consider when changing *any* + * code related to RSB-related mitigations. Before doing so, carefully + * read the following document, and update if necessary: * - * 1) RSB underflow + * Documentation/admin-guide/hw-vuln/rsb.rst * - * 2) Poisoned RSB entry + * In an overly simplified nutshell: * - * When retpoline is enabled, both are mitigated by filling/clearing - * the RSB. + * - User->user RSB attacks are conditionally mitigated during + * context switches by cond_mitigation -> write_ibpb(). * - * When IBRS is enabled, while #1 would be mitigated by the IBRS branch - * prediction isolation protections, RSB still needs to be cleared - * because of #2. Note that SMEP provides no protection here, unlike - * user-space-poisoned RSB entries. + * - User->kernel and guest->host attacks are mitigated by eIBRS or + * RSB filling. * - * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB - * bug is present then a LITE version of RSB protection is required, - * just a single call needs to retire before a RET is executed. + * Though, depending on config, note that other alternative + * mitigations may end up getting used instead, e.g., IBPB on + * entry/vmexit, call depth tracking, or return thunks. */ + switch (mode) { case SPECTRE_V2_NONE: - return; + break; - case SPECTRE_V2_EIBRS_LFENCE: case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS_RETPOLINE: if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); } - return; + break; - case SPECTRE_V2_EIBRS_RETPOLINE: case SPECTRE_V2_RETPOLINE: case SPECTRE_V2_LFENCE: case SPECTRE_V2_IBRS: + pr_info("Spectre v2 / SpectreRSB: Filling RSB on context switch and VMEXIT\n"); + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); - pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + break; + + default: + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation\n"); + dump_stack(); + break; + } +} + +/* + * Set BHI_DIS_S to prevent indirect branches in kernel to be influenced by + * branch history in userspace. Not needed if BHI_NO is set. + */ +static bool __init spec_ctrl_bhi_dis(void) +{ + if (!boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S; + update_spec_ctrl(x86_spec_ctrl_base); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW); + + return true; +} + +enum bhi_mitigations { + BHI_MITIGATION_OFF, + BHI_MITIGATION_AUTO, + BHI_MITIGATION_ON, + BHI_MITIGATION_VMEXIT_ONLY, +}; + +static enum bhi_mitigations bhi_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF; + +static int __init spectre_bhi_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + bhi_mitigation = BHI_MITIGATION_OFF; + else if (!strcmp(str, "on")) + bhi_mitigation = BHI_MITIGATION_ON; + else if (!strcmp(str, "vmexit")) + bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY; + else + pr_err("Ignoring unknown spectre_bhi option (%s)", str); + + return 0; +} +early_param("spectre_bhi", spectre_bhi_parse_cmdline); + +static void __init bhi_select_mitigation(void) +{ + if (!boot_cpu_has(X86_BUG_BHI) || cpu_mitigations_off()) + bhi_mitigation = BHI_MITIGATION_OFF; + + if (bhi_mitigation == BHI_MITIGATION_AUTO) + bhi_mitigation = BHI_MITIGATION_ON; +} + +static void __init bhi_update_mitigation(void) +{ + if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) + bhi_mitigation = BHI_MITIGATION_OFF; + + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + spectre_v2_cmd == SPECTRE_V2_CMD_AUTO) + bhi_mitigation = BHI_MITIGATION_OFF; +} + +static void __init bhi_apply_mitigation(void) +{ + if (bhi_mitigation == BHI_MITIGATION_OFF) return; + + /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */ + if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) { + spec_ctrl_disable_kernel_rrsba(); + if (rrsba_disabled) + return; } - pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); - dump_stack(); + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + /* Mitigate in hardware if supported */ + if (spec_ctrl_bhi_dis()) + return; + + if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) { + pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n"); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT); + return; + } + + pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n"); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT); } static void __init spectre_v2_select_mitigation(void) { - enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); - enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + spectre_v2_cmd = spectre_v2_parse_cmdline(); - /* - * If the CPU is not affected and the command line mode is NONE or AUTO - * then nothing to do. - */ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)) return; - switch (cmd) { + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return; case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_EIBRS; + spectre_v2_enabled = SPECTRE_V2_EIBRS; break; } - if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && - boot_cpu_has_bug(X86_BUG_RETBLEED) && - retbleed_cmd != RETBLEED_CMD_OFF && - retbleed_cmd != RETBLEED_CMD_STUFF && - boot_cpu_has(X86_FEATURE_IBRS) && - boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - mode = SPECTRE_V2_IBRS; - break; - } - - mode = spectre_v2_select_retpoline(); + spectre_v2_enabled = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_RETPOLINE_LFENCE: pr_err(SPECTRE_V2_LFENCE_MSG); - mode = SPECTRE_V2_LFENCE; + spectre_v2_enabled = SPECTRE_V2_LFENCE; break; case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - mode = SPECTRE_V2_RETPOLINE; + spectre_v2_enabled = SPECTRE_V2_RETPOLINE; break; case SPECTRE_V2_CMD_RETPOLINE: - mode = spectre_v2_select_retpoline(); + spectre_v2_enabled = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_IBRS: - mode = SPECTRE_V2_IBRS; + spectre_v2_enabled = SPECTRE_V2_IBRS; break; case SPECTRE_V2_CMD_EIBRS: - mode = SPECTRE_V2_EIBRS; + spectre_v2_enabled = SPECTRE_V2_EIBRS; break; case SPECTRE_V2_CMD_EIBRS_LFENCE: - mode = SPECTRE_V2_EIBRS_LFENCE; + spectre_v2_enabled = SPECTRE_V2_EIBRS_LFENCE; break; case SPECTRE_V2_CMD_EIBRS_RETPOLINE: - mode = SPECTRE_V2_EIBRS_RETPOLINE; + spectre_v2_enabled = SPECTRE_V2_EIBRS_RETPOLINE; break; } +} + +static void __init spectre_v2_update_mitigation(void) +{ + if (spectre_v2_cmd == SPECTRE_V2_CMD_AUTO && + !spectre_v2_in_eibrs_mode(spectre_v2_enabled)) { + if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && + boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_mitigation != RETBLEED_MITIGATION_NONE && + retbleed_mitigation != RETBLEED_MITIGATION_STUFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + spectre_v2_enabled = SPECTRE_V2_IBRS; + } + } - if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && !cpu_mitigations_off()) + pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]); +} + +static void __init spectre_v2_apply_mitigation(void) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); - if (spectre_v2_in_ibrs_mode(mode)) { + if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) { if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); } else { @@ -1686,8 +2135,10 @@ static void __init spectre_v2_select_mitigation(void) } } - switch (mode) { + switch (spectre_v2_enabled) { case SPECTRE_V2_NONE: + return; + case SPECTRE_V2_EIBRS: break; @@ -1713,56 +2164,12 @@ static void __init spectre_v2_select_mitigation(void) * JMPs gets protection against BHI and Intramode-BTI, but RET * prediction from a non-RSB predictor is still a risk. */ - if (mode == SPECTRE_V2_EIBRS_LFENCE || - mode == SPECTRE_V2_EIBRS_RETPOLINE || - mode == SPECTRE_V2_RETPOLINE) + if (spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE || + spectre_v2_enabled == SPECTRE_V2_EIBRS_RETPOLINE || + spectre_v2_enabled == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); - spectre_v2_enabled = mode; - pr_info("%s\n", spectre_v2_strings[mode]); - - /* - * If Spectre v2 protection has been enabled, fill the RSB during a - * context switch. In general there are two types of RSB attacks - * across context switches, for which the CALLs/RETs may be unbalanced. - * - * 1) RSB underflow - * - * Some Intel parts have "bottomless RSB". When the RSB is empty, - * speculated return targets may come from the branch predictor, - * which could have a user-poisoned BTB or BHB entry. - * - * AMD has it even worse: *all* returns are speculated from the BTB, - * regardless of the state of the RSB. - * - * When IBRS or eIBRS is enabled, the "user -> kernel" attack - * scenario is mitigated by the IBRS branch prediction isolation - * properties, so the RSB buffer filling wouldn't be necessary to - * protect against this type of attack. - * - * The "user -> user" attack scenario is mitigated by RSB filling. - * - * 2) Poisoned RSB entry - * - * If the 'next' in-kernel return stack is shorter than 'prev', - * 'next' could be tricked into speculating with a user-poisoned RSB - * entry. - * - * The "user -> kernel" attack scenario is mitigated by SMEP and - * eIBRS. - * - * The "user -> user" scenario, also known as SpectreBHB, requires - * RSB clearing. - * - * So to mitigate all cases, unconditionally fill RSB on context - * switches. - * - * FIXME: Is this pointless for retbleed-affected AMD? - */ - setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); - pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); - - spectre_v2_determine_rsb_fill_type_at_vmexit(mode); + spectre_v2_select_rsb_mitigation(spectre_v2_enabled); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS @@ -1770,28 +2177,26 @@ static void __init spectre_v2_select_mitigation(void) * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't * otherwise enabled. * - * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because - * the user might select retpoline on the kernel command line and if - * the CPU supports Enhanced IBRS, kernel might un-intentionally not - * enable IBRS around firmware calls. + * Use "spectre_v2_enabled" to check Enhanced IBRS instead of + * boot_cpu_has(), because the user might select retpoline on the kernel + * command line and if the CPU supports Enhanced IBRS, kernel might + * un-intentionally not enable IBRS around firmware calls. */ if (boot_cpu_has_bug(X86_BUG_RETBLEED) && boot_cpu_has(X86_FEATURE_IBPB) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { - if (retbleed_cmd != RETBLEED_CMD_IBPB) { + if (retbleed_mitigation != RETBLEED_MITIGATION_IBPB) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); pr_info("Enabling Speculation Barrier for firmware calls\n"); } - } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { + } else if (boot_cpu_has(X86_FEATURE_IBRS) && + !spectre_v2_in_ibrs_mode(spectre_v2_enabled)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } - - /* Set up IBPB and STIBP depending on the general spectre V2 command */ - spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) @@ -1832,8 +2237,6 @@ static void update_indir_branch_cond(void) /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { - u64 ia32_cap = x86_read_arch_cap_msr(); - /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. @@ -1848,7 +2251,7 @@ static void update_mds_branch_idle(void) if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); } else if (mmio_mitigation == MMIO_MITIGATION_OFF || - (ia32_cap & ARCH_CAP_FBSDP_NO)) { + (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); } } @@ -1880,6 +2283,7 @@ void cpu_bugs_smt_update(void) switch (mds_mitigation) { case MDS_MITIGATION_FULL: + case MDS_MITIGATION_AUTO: case MDS_MITIGATION_VMWERV: if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) pr_warn_once(MDS_MSG_SMT); @@ -1891,6 +2295,7 @@ void cpu_bugs_smt_update(void) switch (taa_mitigation) { case TAA_MITIGATION_VERW: + case TAA_MITIGATION_AUTO: case TAA_MITIGATION_UCODE_NEEDED: if (sched_smt_active()) pr_warn_once(TAA_MSG_SMT); @@ -1902,6 +2307,7 @@ void cpu_bugs_smt_update(void) switch (mmio_mitigation) { case MMIO_MITIGATION_VERW: + case MMIO_MITIGATION_AUTO: case MMIO_MITIGATION_UCODE_NEEDED: if (sched_smt_active()) pr_warn_once(MMIO_MSG_SMT); @@ -1947,10 +2353,12 @@ static const struct { static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) { - enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; + enum ssb_mitigation_cmd cmd; char arg[20]; int ret, i; + cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ? + SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE; if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; @@ -1958,7 +2366,7 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", arg, sizeof(arg)); if (ret < 0) - return SPEC_STORE_BYPASS_CMD_AUTO; + return cmd; for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { if (!match_option(arg, ret, ssb_mitigation_options[i].option)) @@ -1969,27 +2377,26 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) } if (i >= ARRAY_SIZE(ssb_mitigation_options)) { - pr_err("unknown option (%s). Switching to AUTO select\n", arg); - return SPEC_STORE_BYPASS_CMD_AUTO; + pr_err("unknown option (%s). Switching to default mode\n", arg); + return cmd; } } return cmd; } -static enum ssb_mitigation __init __ssb_select_mitigation(void) +static void __init ssb_select_mitigation(void) { - enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; enum ssb_mitigation_cmd cmd; if (!boot_cpu_has(X86_FEATURE_SSBD)) - return mode; + goto out; cmd = ssb_parse_cmdline(); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && (cmd == SPEC_STORE_BYPASS_CMD_NONE || cmd == SPEC_STORE_BYPASS_CMD_AUTO)) - return mode; + return; switch (cmd) { case SPEC_STORE_BYPASS_CMD_SECCOMP: @@ -1998,28 +2405,35 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) * enabled. */ if (IS_ENABLED(CONFIG_SECCOMP)) - mode = SPEC_STORE_BYPASS_SECCOMP; + ssb_mode = SPEC_STORE_BYPASS_SECCOMP; else - mode = SPEC_STORE_BYPASS_PRCTL; + ssb_mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_ON: - mode = SPEC_STORE_BYPASS_DISABLE; + ssb_mode = SPEC_STORE_BYPASS_DISABLE; break; case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_PRCTL: - mode = SPEC_STORE_BYPASS_PRCTL; + ssb_mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_NONE: break; } +out: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + pr_info("%s\n", ssb_strings[ssb_mode]); +} + +static void __init ssb_apply_mitigation(void) +{ /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation */ - if (mode == SPEC_STORE_BYPASS_DISABLE) { + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) { setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); /* * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may @@ -2033,16 +2447,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) update_spec_ctrl(x86_spec_ctrl_base); } } - - return mode; -} - -static void ssb_select_mitigation(void) -{ - ssb_mode = __ssb_select_mitigation(); - - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - pr_info("%s\n", ssb_strings[ssb_mode]); } #undef pr_fmt @@ -2297,7 +2701,8 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); #define pr_fmt(fmt) "L1TF: " fmt /* Default mitigation for L1TF-affected CPUs */ -enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH; +enum l1tf_mitigations l1tf_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF; #if IS_ENABLED(CONFIG_KVM_INTEL) EXPORT_SYMBOL_GPL(l1tf_mitigation); #endif @@ -2323,20 +2728,20 @@ static void override_cache_bits(struct cpuinfo_x86 *c) if (c->x86 != 6) return; - switch (c->x86_model) { - case INTEL_FAM6_NEHALEM: - case INTEL_FAM6_WESTMERE: - case INTEL_FAM6_SANDYBRIDGE: - case INTEL_FAM6_IVYBRIDGE: - case INTEL_FAM6_HASWELL: - case INTEL_FAM6_HASWELL_L: - case INTEL_FAM6_HASWELL_G: - case INTEL_FAM6_BROADWELL: - case INTEL_FAM6_BROADWELL_G: - case INTEL_FAM6_SKYLAKE_L: - case INTEL_FAM6_SKYLAKE: - case INTEL_FAM6_KABYLAKE_L: - case INTEL_FAM6_KABYLAKE: + switch (c->x86_vfm) { + case INTEL_NEHALEM: + case INTEL_WESTMERE: + case INTEL_SANDYBRIDGE: + case INTEL_IVYBRIDGE: + case INTEL_HASWELL: + case INTEL_HASWELL_L: + case INTEL_HASWELL_G: + case INTEL_BROADWELL: + case INTEL_BROADWELL_G: + case INTEL_SKYLAKE_L: + case INTEL_SKYLAKE: + case INTEL_KABYLAKE_L: + case INTEL_KABYLAKE: if (c->x86_cache_bits < 44) c->x86_cache_bits = 44; break; @@ -2345,22 +2750,33 @@ static void override_cache_bits(struct cpuinfo_x86 *c) static void __init l1tf_select_mitigation(void) { + if (!boot_cpu_has_bug(X86_BUG_L1TF) || cpu_mitigations_off()) { + l1tf_mitigation = L1TF_MITIGATION_OFF; + return; + } + + if (l1tf_mitigation == L1TF_MITIGATION_AUTO) { + if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + else + l1tf_mitigation = L1TF_MITIGATION_FLUSH; + } +} + +static void __init l1tf_apply_mitigation(void) +{ u64 half_pa; if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; - if (cpu_mitigations_off()) - l1tf_mitigation = L1TF_MITIGATION_OFF; - else if (cpu_mitigations_auto_nosmt()) - l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; - override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_AUTO: break; case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: @@ -2420,20 +2836,14 @@ early_param("l1tf", l1tf_cmdline); enum srso_mitigation { SRSO_MITIGATION_NONE, + SRSO_MITIGATION_AUTO, SRSO_MITIGATION_UCODE_NEEDED, SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, SRSO_MITIGATION_MICROCODE, SRSO_MITIGATION_SAFE_RET, SRSO_MITIGATION_IBPB, SRSO_MITIGATION_IBPB_ON_VMEXIT, -}; - -enum srso_mitigation_cmd { - SRSO_CMD_OFF, - SRSO_CMD_MICROCODE, - SRSO_CMD_SAFE_RET, - SRSO_CMD_IBPB, - SRSO_CMD_IBPB_ON_VMEXIT, + SRSO_MITIGATION_BP_SPEC_REDUCE, }; static const char * const srso_strings[] = { @@ -2443,11 +2853,11 @@ static const char * const srso_strings[] = { [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET", [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET", [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", - [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" + [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only", + [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation" }; -static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; -static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; +static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO; static int __init srso_parse_cmdline(char *str) { @@ -2455,15 +2865,15 @@ static int __init srso_parse_cmdline(char *str) return -EINVAL; if (!strcmp(str, "off")) - srso_cmd = SRSO_CMD_OFF; + srso_mitigation = SRSO_MITIGATION_NONE; else if (!strcmp(str, "microcode")) - srso_cmd = SRSO_CMD_MICROCODE; + srso_mitigation = SRSO_MITIGATION_MICROCODE; else if (!strcmp(str, "safe-ret")) - srso_cmd = SRSO_CMD_SAFE_RET; + srso_mitigation = SRSO_MITIGATION_SAFE_RET; else if (!strcmp(str, "ibpb")) - srso_cmd = SRSO_CMD_IBPB; + srso_mitigation = SRSO_MITIGATION_IBPB; else if (!strcmp(str, "ibpb-vmexit")) - srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT; + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; else pr_err("Ignoring unknown SRSO option (%s).", str); @@ -2475,104 +2885,138 @@ early_param("spec_rstack_overflow", srso_parse_cmdline); static void __init srso_select_mitigation(void) { - bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); + bool has_microcode; - if (cpu_mitigations_off()) - return; + if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) + srso_mitigation = SRSO_MITIGATION_NONE; - if (!boot_cpu_has_bug(X86_BUG_SRSO)) { - if (boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; + if (srso_mitigation == SRSO_MITIGATION_NONE) return; - } + if (srso_mitigation == SRSO_MITIGATION_AUTO) + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + + has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); if (has_microcode) { /* * Zen1/2 with SMT off aren't vulnerable after the right * IBPB microcode has been applied. - * - * Zen1/2 don't have SBPB, no need to try to enable it here. */ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); + srso_mitigation = SRSO_MITIGATION_NONE; return; } - - if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - srso_mitigation = SRSO_MITIGATION_IBPB; - goto out; - } } else { pr_warn("IBPB-extending microcode not applied!\n"); pr_warn(SRSO_NOTICE); - - /* may be overwritten by SRSO_CMD_SAFE_RET below */ - srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; } - switch (srso_cmd) { - case SRSO_CMD_OFF: - if (boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; - return; - - case SRSO_CMD_MICROCODE: - if (has_microcode) { - srso_mitigation = SRSO_MITIGATION_MICROCODE; - pr_warn(SRSO_NOTICE); + switch (srso_mitigation) { + case SRSO_MITIGATION_SAFE_RET: + if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) { + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; + goto ibpb_on_vmexit; } - break; - case SRSO_CMD_SAFE_RET: - if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { - /* - * Enable the return thunk for generated code - * like ftrace, static_call, etc. - */ - setup_force_cpu_cap(X86_FEATURE_RETHUNK); - setup_force_cpu_cap(X86_FEATURE_UNRET); - - if (boot_cpu_data.x86 == 0x19) { - setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); - x86_return_thunk = srso_alias_return_thunk; - } else { - setup_force_cpu_cap(X86_FEATURE_SRSO); - x86_return_thunk = srso_return_thunk; - } - if (has_microcode) - srso_mitigation = SRSO_MITIGATION_SAFE_RET; - else - srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; - } else { + if (!IS_ENABLED(CONFIG_MITIGATION_SRSO)) { pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); + srso_mitigation = SRSO_MITIGATION_NONE; } - break; - case SRSO_CMD_IBPB: - if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - if (has_microcode) { - setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); - srso_mitigation = SRSO_MITIGATION_IBPB; - } - } else { + if (!has_microcode) + srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; + break; +ibpb_on_vmexit: + case SRSO_MITIGATION_IBPB_ON_VMEXIT: + if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) { + pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n"); + srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE; + break; + } + fallthrough; + case SRSO_MITIGATION_IBPB: + if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); + srso_mitigation = SRSO_MITIGATION_NONE; } + + if (!has_microcode) + srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; + break; + default: break; + } +} - case SRSO_CMD_IBPB_ON_VMEXIT: - if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { - if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; - } +static void __init srso_update_mitigation(void) +{ + /* If retbleed is using IBPB, that works for SRSO as well */ + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB && + boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) + srso_mitigation = SRSO_MITIGATION_IBPB; + + if (boot_cpu_has_bug(X86_BUG_SRSO) && + !cpu_mitigations_off() && + !boot_cpu_has(X86_FEATURE_SRSO_NO)) + pr_info("%s\n", srso_strings[srso_mitigation]); +} + +static void __init srso_apply_mitigation(void) +{ + /* + * Clear the feature flag if this mitigation is not selected as that + * feature flag controls the BpSpecReduce MSR bit toggling in KVM. + */ + if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE) + setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE); + + if (srso_mitigation == SRSO_MITIGATION_NONE) { + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; + } + + switch (srso_mitigation) { + case SRSO_MITIGATION_SAFE_RET: + case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED: + /* + * Enable the return thunk for generated code + * like ftrace, static_call, etc. + */ + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86 == 0x19) { + setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); + set_return_thunk(srso_alias_return_thunk); } else { - pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); - } + setup_force_cpu_cap(X86_FEATURE_SRSO); + set_return_thunk(srso_return_thunk); + } + break; + case SRSO_MITIGATION_IBPB: + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like Retbleed has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + fallthrough; + case SRSO_MITIGATION_IBPB_ON_VMEXIT: + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. + */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); + break; + default: break; } - -out: - pr_info("%s\n", srso_strings[srso_mitigation]); } #undef pr_fmt @@ -2667,9 +3111,6 @@ static ssize_t tsx_async_abort_show_state(char *buf) static ssize_t mmio_stale_data_show_state(char *buf) { - if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - return sysfs_emit(buf, "Unknown: No mitigations\n"); - if (mmio_mitigation == MMIO_MITIGATION_OFF) return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); @@ -2687,6 +3128,19 @@ static ssize_t rfds_show_state(char *buf) return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); } +static ssize_t old_microcode_show_state(char *buf) +{ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return sysfs_emit(buf, "Unknown: running under hypervisor"); + + return sysfs_emit(buf, "Vulnerable\n"); +} + +static ssize_t its_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]); +} + static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && @@ -2695,15 +3149,15 @@ static char *stibp_state(void) switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: - return ", STIBP: disabled"; + return "; STIBP: disabled"; case SPECTRE_V2_USER_STRICT: - return ", STIBP: forced"; + return "; STIBP: forced"; case SPECTRE_V2_USER_STRICT_PREFERRED: - return ", STIBP: always-on"; + return "; STIBP: always-on"; case SPECTRE_V2_USER_PRCTL: case SPECTRE_V2_USER_SECCOMP: if (static_key_enabled(&switch_to_cond_stibp)) - return ", STIBP: conditional"; + return "; STIBP: conditional"; } return ""; } @@ -2712,10 +3166,10 @@ static char *ibpb_state(void) { if (boot_cpu_has(X86_FEATURE_IBPB)) { if (static_key_enabled(&switch_mm_always_ibpb)) - return ", IBPB: always-on"; + return "; IBPB: always-on"; if (static_key_enabled(&switch_mm_cond_ibpb)) - return ", IBPB: conditional"; - return ", IBPB: disabled"; + return "; IBPB: conditional"; + return "; IBPB: disabled"; } return ""; } @@ -2725,14 +3179,32 @@ static char *pbrsb_eibrs_state(void) if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) - return ", PBRSB-eIBRS: SW sequence"; + return "; PBRSB-eIBRS: SW sequence"; else - return ", PBRSB-eIBRS: Vulnerable"; + return "; PBRSB-eIBRS: Vulnerable"; } else { - return ", PBRSB-eIBRS: Not affected"; + return "; PBRSB-eIBRS: Not affected"; } } +static const char *spectre_bhi_state(void) +{ + if (!boot_cpu_has_bug(X86_BUG_BHI)) + return "; BHI: Not affected"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW)) + return "; BHI: BHI_DIS_S"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP)) + return "; BHI: SW loop, KVM: SW loop"; + else if (boot_cpu_has(X86_FEATURE_RETPOLINE) && + !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) && + rrsba_disabled) + return "; BHI: Retpoline"; + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_VMEXIT)) + return "; BHI: Vulnerable, KVM: SW loop"; + + return "; BHI: Vulnerable"; +} + static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) @@ -2745,13 +3217,15 @@ static ssize_t spectre_v2_show_state(char *buf) spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); - return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", + return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "", stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "", pbrsb_eibrs_state(), + spectre_bhi_state(), + /* this should always be at the end */ spectre_v2_module_string()); } @@ -2834,7 +3308,6 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return srbds_show_state(buf); case X86_BUG_MMIO_STALE_DATA: - case X86_BUG_MMIO_UNKNOWN: return mmio_stale_data_show_state(buf); case X86_BUG_RETBLEED: @@ -2849,6 +3322,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RFDS: return rfds_show_state(buf); + case X86_BUG_OLD_MICROCODE: + return old_microcode_show_state(buf); + + case X86_BUG_ITS: + return its_show_state(buf); + default: break; } @@ -2903,10 +3382,7 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char * ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) { - if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); - else - return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); } ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) @@ -2928,6 +3404,16 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib { return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); } + +ssize_t cpu_show_old_microcode(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_OLD_MICROCODE); +} + +ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITS); +} #endif void __warn_thunk(void) diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c new file mode 100644 index 000000000000..981f8b1f0792 --- /dev/null +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -0,0 +1,432 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "x86/split lock detection: " fmt + +#include <linux/semaphore.h> +#include <linux/workqueue.h> +#include <linux/delay.h> +#include <linux/cpuhotplug.h> +#include <asm/cpu_device_id.h> +#include <asm/cmdline.h> +#include <asm/traps.h> +#include <asm/cpu.h> +#include <asm/msr.h> + +enum split_lock_detect_state { + sld_off = 0, + sld_warn, + sld_fatal, + sld_ratelimit, +}; + +/* + * Default to sld_off because most systems do not support split lock detection. + * sld_state_setup() will switch this to sld_warn on systems that support + * split lock/bus lock detect, unless there is a command line override. + */ +static enum split_lock_detect_state sld_state __ro_after_init = sld_off; +static u64 msr_test_ctrl_cache __ro_after_init; + +/* + * With a name like MSR_TEST_CTL it should go without saying, but don't touch + * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it + * on CPUs that do not support SLD can cause fireworks, even when writing '0'. + */ +static bool cpu_model_supports_sld __ro_after_init; + +static const struct { + const char *option; + enum split_lock_detect_state state; +} sld_options[] __initconst = { + { "off", sld_off }, + { "warn", sld_warn }, + { "fatal", sld_fatal }, + { "ratelimit:", sld_ratelimit }, +}; + +static struct ratelimit_state bld_ratelimit; + +static unsigned int sysctl_sld_mitigate = 1; +static DEFINE_SEMAPHORE(buslock_sem, 1); + +#ifdef CONFIG_PROC_SYSCTL +static const struct ctl_table sld_sysctls[] = { + { + .procname = "split_lock_mitigate", + .data = &sysctl_sld_mitigate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init sld_mitigate_sysctl_init(void) +{ + register_sysctl_init("kernel", sld_sysctls); + return 0; +} + +late_initcall(sld_mitigate_sysctl_init); +#endif + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt), ratelimit; + + if (strncmp(arg, opt, len)) + return false; + + /* + * Min ratelimit is 1 bus lock/sec. + * Max ratelimit is 1000 bus locks/sec. + */ + if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && + ratelimit > 0 && ratelimit <= 1000) { + ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); + ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); + return true; + } + + return len == arglen; +} + +static bool split_lock_verify_msr(bool on) +{ + u64 ctrl, tmp; + + if (rdmsrq_safe(MSR_TEST_CTRL, &ctrl)) + return false; + if (on) + ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + else + ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + if (wrmsrq_safe(MSR_TEST_CTRL, ctrl)) + return false; + rdmsrq(MSR_TEST_CTRL, tmp); + return ctrl == tmp; +} + +static void __init sld_state_setup(void) +{ + enum split_lock_detect_state state = sld_warn; + char arg[20]; + int i, ret; + + if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + ret = cmdline_find_option(boot_command_line, "split_lock_detect", + arg, sizeof(arg)); + if (ret >= 0) { + for (i = 0; i < ARRAY_SIZE(sld_options); i++) { + if (match_option(arg, ret, sld_options[i].option)) { + state = sld_options[i].state; + break; + } + } + } + sld_state = state; +} + +static void __init __split_lock_setup(void) +{ + if (!split_lock_verify_msr(false)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + rdmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache); + + if (!split_lock_verify_msr(true)) { + pr_info("MSR access failed: Disabled\n"); + return; + } + + /* Restore the MSR to its cached value. */ + wrmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache); + + setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); +} + +/* + * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking + * is not implemented as one thread could undo the setting of the other + * thread immediately after dropping the lock anyway. + */ +static void sld_update_msr(bool on) +{ + u64 test_ctrl_val = msr_test_ctrl_cache; + + if (on) + test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; + + wrmsrq(MSR_TEST_CTRL, test_ctrl_val); +} + +void split_lock_init(void) +{ + /* + * #DB for bus lock handles ratelimit and #AC for split lock is + * disabled. + */ + if (sld_state == sld_ratelimit) { + split_lock_verify_msr(false); + return; + } + + if (cpu_model_supports_sld) + split_lock_verify_msr(sld_state != sld_off); +} + +static void __split_lock_reenable_unlock(struct work_struct *work) +{ + sld_update_msr(true); + up(&buslock_sem); +} + +static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); + +static void __split_lock_reenable(struct work_struct *work) +{ + sld_update_msr(true); +} +/* + * In order for each CPU to schedule its delayed work independently of the + * others, delayed work struct must be per-CPU. This is not required when + * sysctl_sld_mitigate is enabled because of the semaphore that limits + * the number of simultaneously scheduled delayed works to 1. + */ +static DEFINE_PER_CPU(struct delayed_work, sl_reenable); + +/* + * Per-CPU delayed_work can't be statically initialized properly because + * the struct address is unknown. Thus per-CPU delayed_work structures + * have to be initialized during kernel initialization and after calling + * setup_per_cpu_areas(). + */ +static int __init setup_split_lock_delayed_work(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + struct delayed_work *work = per_cpu_ptr(&sl_reenable, cpu); + + INIT_DELAYED_WORK(work, __split_lock_reenable); + } + + return 0; +} +pure_initcall(setup_split_lock_delayed_work); + +/* + * If a CPU goes offline with pending delayed work to re-enable split lock + * detection then the delayed work will be executed on some other CPU. That + * handles releasing the buslock_sem, but because it executes on a + * different CPU probably won't re-enable split lock detection. This is a + * problem on HT systems since the sibling CPU on the same core may then be + * left running with split lock detection disabled. + * + * Unconditionally re-enable detection here. + */ +static int splitlock_cpu_offline(unsigned int cpu) +{ + sld_update_msr(true); + + return 0; +} + +static void split_lock_warn(unsigned long ip) +{ + struct delayed_work *work; + int cpu; + unsigned int saved_sld_mitigate = READ_ONCE(sysctl_sld_mitigate); + + if (!current->reported_split_lock) + pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", + current->comm, current->pid, ip); + current->reported_split_lock = 1; + + if (saved_sld_mitigate) { + /* + * misery factor #1: + * sleep 10ms before trying to execute split lock. + */ + if (msleep_interruptible(10) > 0) + return; + /* + * Misery factor #2: + * only allow one buslocked disabled core at a time. + */ + if (down_interruptible(&buslock_sem) == -EINTR) + return; + } + + cpu = get_cpu(); + work = saved_sld_mitigate ? &sl_reenable_unlock : per_cpu_ptr(&sl_reenable, cpu); + schedule_delayed_work_on(cpu, work, 2); + + /* Disable split lock detection on this CPU to make progress */ + sld_update_msr(false); + put_cpu(); +} + +bool handle_guest_split_lock(unsigned long ip) +{ + if (sld_state == sld_warn) { + split_lock_warn(ip); + return true; + } + + pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", + current->comm, current->pid, + sld_state == sld_fatal ? "fatal" : "bogus", ip); + + current->thread.error_code = 0; + current->thread.trap_nr = X86_TRAP_AC; + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + return false; +} +EXPORT_SYMBOL_GPL(handle_guest_split_lock); + +void bus_lock_init(void) +{ + u64 val; + + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + return; + + rdmsrq(MSR_IA32_DEBUGCTLMSR, val); + + if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && + (sld_state == sld_warn || sld_state == sld_fatal)) || + sld_state == sld_off) { + /* + * Warn and fatal are handled by #AC for split lock if #AC for + * split lock is supported. + */ + val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; + } else { + val |= DEBUGCTLMSR_BUS_LOCK_DETECT; + } + + wrmsrq(MSR_IA32_DEBUGCTLMSR, val); +} + +bool handle_user_split_lock(struct pt_regs *regs, long error_code) +{ + if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) + return false; + split_lock_warn(regs->ip); + return true; +} + +void handle_bus_lock(struct pt_regs *regs) +{ + switch (sld_state) { + case sld_off: + break; + case sld_ratelimit: + /* Enforce no more than bld_ratelimit bus locks/sec. */ + while (!__ratelimit(&bld_ratelimit)) + msleep(20); + /* Warn on the bus lock. */ + fallthrough; + case sld_warn: + pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", + current->comm, current->pid, regs->ip); + break; + case sld_fatal: + force_sig_fault(SIGBUS, BUS_ADRALN, NULL); + break; + } +} + +/* + * CPU models that are known to have the per-core split-lock detection + * feature even though they do not enumerate IA32_CORE_CAPABILITIES. + */ +static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { + X86_MATCH_VFM(INTEL_ICELAKE_X, 0), + X86_MATCH_VFM(INTEL_ICELAKE_L, 0), + X86_MATCH_VFM(INTEL_ICELAKE_D, 0), + {} +}; + +static void __init split_lock_setup(struct cpuinfo_x86 *c) +{ + const struct x86_cpu_id *m; + u64 ia32_core_caps; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + /* Check for CPUs that have support but do not enumerate it: */ + m = x86_match_cpu(split_lock_cpu_ids); + if (m) + goto supported; + + if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) + return; + + /* + * Not all bits in MSR_IA32_CORE_CAPS are architectural, but + * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set + * it have split lock detection. + */ + rdmsrq(MSR_IA32_CORE_CAPS, ia32_core_caps); + if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) + goto supported; + + /* CPU is not in the model list and does not have the MSR bit: */ + return; + +supported: + cpu_model_supports_sld = true; + __split_lock_setup(); +} + +static void sld_state_show(void) +{ + if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && + !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + return; + + switch (sld_state) { + case sld_off: + pr_info("disabled\n"); + break; + case sld_warn: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/splitlock", NULL, splitlock_cpu_offline) < 0) + pr_warn("No splitlock CPU offline handler\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: warning on user-space bus_locks\n"); + } + break; + case sld_fatal: + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { + pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); + } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { + pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", + boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? + " from non-WB" : ""); + } + break; + case sld_ratelimit: + if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) + pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); + break; + } +} + +void __init sld_setup(struct cpuinfo_x86 *c) +{ + split_lock_setup(c); + sld_state_setup(); + sld_state_show(); +} diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 392d09c936d6..adfa7e8bb865 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -1,38 +1,28 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Routines to identify caches on Intel CPU. + * x86 CPU caches detection and configuration * - * Changes: - * Venkatesh Pallipadi : Adding cache identification through cpuid(4) - * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. - * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. + * Previous changes + * - Venkatesh Pallipadi: Cache identification through CPUID(0x4) + * - Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure + * - Andi Kleen / Andreas Herrmann: CPUID(0x4) emulation on AMD */ -#include <linux/slab.h> #include <linux/cacheinfo.h> #include <linux/cpu.h> #include <linux/cpuhotplug.h> -#include <linux/sched.h> -#include <linux/capability.h> -#include <linux/sysfs.h> -#include <linux/pci.h> #include <linux/stop_machine.h> -#include <asm/cpufeature.h> +#include <asm/amd/nb.h> #include <asm/cacheinfo.h> -#include <asm/amd_nb.h> -#include <asm/smp.h> +#include <asm/cpufeature.h> +#include <asm/cpuid/api.h> #include <asm/mtrr.h> +#include <asm/smp.h> #include <asm/tlbflush.h> #include "cpu.h" -#define LVL_1_INST 1 -#define LVL_1_DATA 2 -#define LVL_2 3 -#define LVL_3 4 -#define LVL_TRACE 5 - /* Shared last level cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); @@ -44,214 +34,127 @@ static cpumask_var_t cpu_cacheinfo_mask; /* Kernel controls MTRR and/or PAT MSRs. */ unsigned int memory_caching_control __ro_after_init; -struct _cache_table { - unsigned char descriptor; - char cache_type; - short size; -}; - -#define MB(x) ((x) * 1024) - -/* All the cache descriptor types we care about (no TLB or - trace cache entries) */ - -static const struct _cache_table cache_table[] = -{ - { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ - { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ - { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */ - { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ - { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ - { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ - { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ - { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ - { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ - { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ - { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */ - { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */ - { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */ - { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */ - { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ - { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ - { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ - { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ - { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ - { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ - { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ - { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ - { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ - { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ - { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ - { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ - { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ - { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ - { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ - { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ - { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ - { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ - { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ - { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ - { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ - { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ - { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ - { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ - { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ - { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ - { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ - { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ - { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ - { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ - { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ - { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ - { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ - { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ - { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ - { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ - { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ - { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ - { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */ - { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ - { 0x00, 0, 0} -}; - - enum _cache_type { - CTYPE_NULL = 0, - CTYPE_DATA = 1, - CTYPE_INST = 2, - CTYPE_UNIFIED = 3 + CTYPE_NULL = 0, + CTYPE_DATA = 1, + CTYPE_INST = 2, + CTYPE_UNIFIED = 3 }; union _cpuid4_leaf_eax { struct { - enum _cache_type type:5; - unsigned int level:3; - unsigned int is_self_initializing:1; - unsigned int is_fully_associative:1; - unsigned int reserved:4; - unsigned int num_threads_sharing:12; - unsigned int num_cores_on_die:6; + enum _cache_type type :5; + unsigned int level :3; + unsigned int is_self_initializing :1; + unsigned int is_fully_associative :1; + unsigned int reserved :4; + unsigned int num_threads_sharing :12; + unsigned int num_cores_on_die :6; } split; u32 full; }; union _cpuid4_leaf_ebx { struct { - unsigned int coherency_line_size:12; - unsigned int physical_line_partition:10; - unsigned int ways_of_associativity:10; + unsigned int coherency_line_size :12; + unsigned int physical_line_partition :10; + unsigned int ways_of_associativity :10; } split; u32 full; }; union _cpuid4_leaf_ecx { struct { - unsigned int number_of_sets:32; + unsigned int number_of_sets :32; } split; u32 full; }; -struct _cpuid4_info_regs { +struct _cpuid4_info { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned int id; unsigned long size; - struct amd_northbridge *nb; }; -static unsigned short num_cache_leaves; +/* Map CPUID(0x4) EAX.cache_type to <linux/cacheinfo.h> types */ +static const enum cache_type cache_type_map[] = { + [CTYPE_NULL] = CACHE_TYPE_NOCACHE, + [CTYPE_DATA] = CACHE_TYPE_DATA, + [CTYPE_INST] = CACHE_TYPE_INST, + [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, +}; -/* AMD doesn't have CPUID4. Emulate it here to report the same - information to the user. This makes some assumptions about the machine: - L2 not shared, no SMT etc. that is currently true on AMD CPUs. +/* + * Fallback AMD CPUID(0x4) emulation + * AMD CPUs with TOPOEXT can just use CPUID(0x8000001d) + * + * @AMD_L2_L3_INVALID_ASSOC: cache info for the respective L2/L3 cache should + * be determined from CPUID(0x8000001d) instead of CPUID(0x80000006). + */ + +#define AMD_CPUID4_FULLY_ASSOCIATIVE 0xffff +#define AMD_L2_L3_INVALID_ASSOC 0x9 - In theory the TLBs could be reported as fake type (they are in "dummy"). - Maybe later */ union l1_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:8; - unsigned assoc:8; - unsigned size_in_kb:8; + unsigned line_size :8; + unsigned lines_per_tag :8; + unsigned assoc :8; + unsigned size_in_kb :8; }; - unsigned val; + unsigned int val; }; union l2_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:4; - unsigned assoc:4; - unsigned size_in_kb:16; + unsigned line_size :8; + unsigned lines_per_tag :4; + unsigned assoc :4; + unsigned size_in_kb :16; }; - unsigned val; + unsigned int val; }; union l3_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:4; - unsigned assoc:4; - unsigned res:2; - unsigned size_encoded:14; + unsigned line_size :8; + unsigned lines_per_tag :4; + unsigned assoc :4; + unsigned res :2; + unsigned size_encoded :14; }; - unsigned val; + unsigned int val; }; +/* L2/L3 associativity mapping */ static const unsigned short assocs[] = { - [1] = 1, - [2] = 2, - [4] = 4, - [6] = 8, - [8] = 16, - [0xa] = 32, - [0xb] = 48, - [0xc] = 64, - [0xd] = 96, - [0xe] = 128, - [0xf] = 0xffff /* fully associative - no way to show this currently */ + [1] = 1, + [2] = 2, + [3] = 3, + [4] = 4, + [5] = 6, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, + [0xc] = 64, + [0xd] = 96, + [0xe] = 128, + [0xf] = AMD_CPUID4_FULLY_ASSOCIATIVE }; static const unsigned char levels[] = { 1, 1, 2, 3 }; -static const unsigned char types[] = { 1, 2, 3, 3 }; - -static const enum cache_type cache_type_map[] = { - [CTYPE_NULL] = CACHE_TYPE_NOCACHE, - [CTYPE_DATA] = CACHE_TYPE_DATA, - [CTYPE_INST] = CACHE_TYPE_INST, - [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, -}; +static const unsigned char types[] = { 1, 2, 3, 3 }; -static void -amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, - union _cpuid4_leaf_ebx *ebx, - union _cpuid4_leaf_ecx *ecx) +static void legacy_amd_cpuid4(int index, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx) { - unsigned dummy; - unsigned line_size, lines_per_tag, assoc, size_in_kb; - union l1_cache l1i, l1d; + unsigned int dummy, line_size, lines_per_tag, assoc, size_in_kb; + union l1_cache l1i, l1d, *l1; union l2_cache l2; union l3_cache l3; - union l1_cache *l1 = &l1d; eax->full = 0; ebx->full = 0; @@ -260,430 +163,155 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); - switch (leaf) { + l1 = &l1d; + switch (index) { case 1: l1 = &l1i; fallthrough; case 0: if (!l1->val) return; - assoc = assocs[l1->assoc]; - line_size = l1->line_size; - lines_per_tag = l1->lines_per_tag; - size_in_kb = l1->size_in_kb; + + assoc = (l1->assoc == 0xff) ? AMD_CPUID4_FULLY_ASSOCIATIVE : l1->assoc; + line_size = l1->line_size; + lines_per_tag = l1->lines_per_tag; + size_in_kb = l1->size_in_kb; break; case 2: - if (!l2.val) + if (!l2.assoc || l2.assoc == AMD_L2_L3_INVALID_ASSOC) return; - assoc = assocs[l2.assoc]; - line_size = l2.line_size; - lines_per_tag = l2.lines_per_tag; - /* cpu_data has errata corrections for K7 applied */ - size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); + + /* Use x86_cache_size as it might have K7 errata fixes */ + assoc = assocs[l2.assoc]; + line_size = l2.line_size; + lines_per_tag = l2.lines_per_tag; + size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); break; case 3: - if (!l3.val) + if (!l3.assoc || l3.assoc == AMD_L2_L3_INVALID_ASSOC) return; - assoc = assocs[l3.assoc]; - line_size = l3.line_size; - lines_per_tag = l3.lines_per_tag; - size_in_kb = l3.size_encoded * 512; + + assoc = assocs[l3.assoc]; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { - size_in_kb = size_in_kb >> 1; - assoc = assoc >> 1; + size_in_kb = size_in_kb >> 1; + assoc = assoc >> 1; } break; default: return; } - eax->split.is_self_initializing = 1; - eax->split.type = types[leaf]; - eax->split.level = levels[leaf]; - eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = topology_num_cores_per_package(); - + eax->split.is_self_initializing = 1; + eax->split.type = types[index]; + eax->split.level = levels[index]; + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = topology_num_cores_per_package(); - if (assoc == 0xffff) + if (assoc == AMD_CPUID4_FULLY_ASSOCIATIVE) eax->split.is_fully_associative = 1; - ebx->split.coherency_line_size = line_size - 1; - ebx->split.ways_of_associativity = assoc - 1; - ebx->split.physical_line_partition = lines_per_tag - 1; - ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / - (ebx->split.ways_of_associativity + 1) - 1; -} - -#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) - -/* - * L3 cache descriptors - */ -static void amd_calc_l3_indices(struct amd_northbridge *nb) -{ - struct amd_l3_cache *l3 = &nb->l3_cache; - unsigned int sc0, sc1, sc2, sc3; - u32 val = 0; - - pci_read_config_dword(nb->misc, 0x1C4, &val); - - /* calculate subcache sizes */ - l3->subcaches[0] = sc0 = !(val & BIT(0)); - l3->subcaches[1] = sc1 = !(val & BIT(4)); - - if (boot_cpu_data.x86 == 0x15) { - l3->subcaches[0] = sc0 += !(val & BIT(1)); - l3->subcaches[1] = sc1 += !(val & BIT(5)); - } - - l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); - l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - - l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; -} - -/* - * check whether a slot used for disabling an L3 index is occupied. - * @l3: L3 cache descriptor - * @slot: slot number (0..1) - * - * @returns: the disabled index if used or negative value if slot free. - */ -static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) -{ - unsigned int reg = 0; - - pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); - - /* check whether this slot is activated already */ - if (reg & (3UL << 30)) - return reg & 0xfff; - - return -1; -} - -static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf, - unsigned int slot) -{ - int index; - struct amd_northbridge *nb = this_leaf->priv; - - index = amd_get_l3_disable_slot(nb, slot); - if (index >= 0) - return sprintf(buf, "%d\n", index); - return sprintf(buf, "FREE\n"); -} - -#define SHOW_CACHE_DISABLE(slot) \ -static ssize_t \ -cache_disable_##slot##_show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ - return show_cache_disable(this_leaf, buf, slot); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, - unsigned slot, unsigned long idx) -{ - int i; - - idx |= BIT(30); - - /* - * disable index in all 4 subcaches - */ - for (i = 0; i < 4; i++) { - u32 reg = idx | (i << 20); - - if (!nb->l3_cache.subcaches[i]) - continue; - - pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); - - /* - * We need to WBINVD on a core on the node containing the L3 - * cache which indices we disable therefore a simple wbinvd() - * is not sufficient. - */ - wbinvd_on_cpu(cpu); - - reg |= BIT(31); - pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); - } -} - -/* - * disable a L3 cache index by using a disable-slot - * - * @l3: L3 cache descriptor - * @cpu: A CPU on the node containing the L3 cache - * @slot: slot number (0..1) - * @index: index to disable - * - * @return: 0 on success, error status on failure - */ -static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, - unsigned slot, unsigned long index) -{ - int ret = 0; - - /* check if @slot is already used or the index is already disabled */ - ret = amd_get_l3_disable_slot(nb, slot); - if (ret >= 0) - return -EEXIST; - - if (index > nb->l3_cache.indices) - return -EINVAL; - - /* check whether the other slot has disabled the same index already */ - if (index == amd_get_l3_disable_slot(nb, !slot)) - return -EEXIST; - - amd_l3_disable_index(nb, cpu, slot, index); - - return 0; -} - -static ssize_t store_cache_disable(struct cacheinfo *this_leaf, - const char *buf, size_t count, - unsigned int slot) -{ - unsigned long val = 0; - int cpu, err = 0; - struct amd_northbridge *nb = this_leaf->priv; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - cpu = cpumask_first(&this_leaf->shared_cpu_map); - - if (kstrtoul(buf, 10, &val) < 0) - return -EINVAL; - - err = amd_set_l3_disable_slot(nb, cpu, slot, val); - if (err) { - if (err == -EEXIST) - pr_warn("L3 slot %d in use/index already disabled!\n", - slot); - return err; - } - return count; -} - -#define STORE_CACHE_DISABLE(slot) \ -static ssize_t \ -cache_disable_##slot##_store(struct device *dev, \ - struct device_attribute *attr, \ - const char *buf, size_t count) \ -{ \ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ - return store_cache_disable(this_leaf, buf, count, slot); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -static ssize_t subcaches_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - int cpu = cpumask_first(&this_leaf->shared_cpu_map); - - return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); -} - -static ssize_t subcaches_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - int cpu = cpumask_first(&this_leaf->shared_cpu_map); - unsigned long val; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (kstrtoul(buf, 16, &val) < 0) - return -EINVAL; - - if (amd_set_subcaches(cpu, val)) - return -EINVAL; - - return count; + ebx->split.coherency_line_size = line_size - 1; + ebx->split.ways_of_associativity = assoc - 1; + ebx->split.physical_line_partition = lines_per_tag - 1; + ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / + (ebx->split.ways_of_associativity + 1) - 1; } -static DEVICE_ATTR_RW(cache_disable_0); -static DEVICE_ATTR_RW(cache_disable_1); -static DEVICE_ATTR_RW(subcaches); - -static umode_t -cache_private_attrs_is_visible(struct kobject *kobj, - struct attribute *attr, int unused) +static int cpuid4_info_fill_done(struct _cpuid4_info *id4, union _cpuid4_leaf_eax eax, + union _cpuid4_leaf_ebx ebx, union _cpuid4_leaf_ecx ecx) { - struct device *dev = kobj_to_dev(kobj); - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - umode_t mode = attr->mode; - - if (!this_leaf->priv) - return 0; - - if ((attr == &dev_attr_subcaches.attr) && - amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - return mode; + if (eax.split.type == CTYPE_NULL) + return -EIO; - if ((attr == &dev_attr_cache_disable_0.attr || - attr == &dev_attr_cache_disable_1.attr) && - amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - return mode; + id4->eax = eax; + id4->ebx = ebx; + id4->ecx = ecx; + id4->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); return 0; } -static struct attribute_group cache_private_group = { - .is_visible = cache_private_attrs_is_visible, -}; - -static void init_amd_l3_attrs(void) -{ - int n = 1; - static struct attribute **amd_l3_attrs; - - if (amd_l3_attrs) /* already initialized */ - return; - - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - n += 2; - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - n += 1; - - amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); - if (!amd_l3_attrs) - return; - - n = 0; - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { - amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; - amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; - } - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - amd_l3_attrs[n++] = &dev_attr_subcaches.attr; - - cache_private_group.attrs = amd_l3_attrs; -} - -const struct attribute_group * -cache_get_priv_group(struct cacheinfo *this_leaf) +static int amd_fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - struct amd_northbridge *nb = this_leaf->priv; - - if (this_leaf->level < 3 || !nb) - return NULL; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + u32 ignored; - if (nb && nb->l3_cache.indices) - init_amd_l3_attrs(); + if (boot_cpu_has(X86_FEATURE_TOPOEXT) || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &ignored); + else + legacy_amd_cpuid4(index, &eax, &ebx, &ecx); - return &cache_private_group; + return cpuid4_info_fill_done(id4, eax, ebx, ecx); } -static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +static int intel_fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - int node; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + u32 ignored; - /* only for L3, and not in virtualized environments */ - if (index < 3) - return; + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &ignored); - node = topology_amd_node_id(smp_processor_id()); - this_leaf->nb = node_to_amd_nb(node); - if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) - amd_calc_l3_indices(this_leaf->nb); + return cpuid4_info_fill_done(id4, eax, ebx, ecx); } -#else -#define amd_init_l3_cache(x, y) -#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ -static int -cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) +static int fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; - unsigned edx; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) - cpuid_count(0x8000001d, index, &eax.full, - &ebx.full, &ecx.full, &edx); - else - amd_cpuid4(index, &eax, &ebx, &ecx); - amd_init_l3_cache(this_leaf, index); - } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - cpuid_count(0x8000001d, index, &eax.full, - &ebx.full, &ecx.full, &edx); - amd_init_l3_cache(this_leaf, index); - } else { - cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); - } + u8 cpu_vendor = boot_cpu_data.x86_vendor; - if (eax.split.type == CTYPE_NULL) - return -EIO; /* better error ? */ - - this_leaf->eax = eax; - this_leaf->ebx = ebx; - this_leaf->ecx = ecx; - this_leaf->size = (ecx.split.number_of_sets + 1) * - (ebx.split.coherency_line_size + 1) * - (ebx.split.physical_line_partition + 1) * - (ebx.split.ways_of_associativity + 1); - return 0; + return (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) ? + amd_fill_cpuid4_info(index, id4) : + intel_fill_cpuid4_info(index, id4); } static int find_num_cache_leaves(struct cpuinfo_x86 *c) { - unsigned int eax, ebx, ecx, edx, op; - union _cpuid4_leaf_eax cache_eax; - int i = -1; - - if (c->x86_vendor == X86_VENDOR_AMD || - c->x86_vendor == X86_VENDOR_HYGON) - op = 0x8000001d; - else - op = 4; + unsigned int eax, ebx, ecx, edx, op; + union _cpuid4_leaf_eax cache_eax; + int i = -1; + /* Do a CPUID(op) loop to calculate num_cache_leaves */ + op = (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) ? 0x8000001d : 4; do { ++i; - /* Do cpuid(op) loop to find out num_cache_leaves */ cpuid_count(op, i, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; } while (cache_eax.split.type != CTYPE_NULL); return i; } +/* + * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist. + */ + void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) { - /* - * We may have multiple LLCs if L3 caches exist, so check if we - * have an L3 cache by looking at the L3 cache CPUID leaf. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return; if (c->x86 < 0x17) { - /* LLC is at the node level. */ + /* Pre-Zen: LLC is at the node level */ c->topo.llc_id = die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* - * LLC is at the core complex level. - * Core complex ID is ApicId[3] for these processors. + * Family 17h up to 1F models: LLC is at the core + * complex level. Core complex ID is ApicId[3]. */ c->topo.llc_id = c->topo.apicid >> 3; } else { /* - * LLC ID is calculated from the number of threads sharing the - * cache. - * */ + * Newer families: LLC ID is calculated from the number + * of threads sharing the L3 cache. + */ u32 eax, ebx, ecx, edx, num_sharing_cache = 0; u32 llc_index = find_num_cache_leaves(c) - 1; @@ -692,197 +320,167 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) num_sharing_cache = ((eax >> 14) & 0xfff) + 1; if (num_sharing_cache) { - int bits = get_count_order(num_sharing_cache); + int index_msb = get_count_order(num_sharing_cache); - c->topo.llc_id = c->topo.apicid >> bits; + c->topo.llc_id = c->topo.apicid >> index_msb; } } } void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c) { - /* - * We may have multiple LLCs if L3 caches exist, so check if we - * have an L3 cache by looking at the L3 cache CPUID leaf. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return; /* - * LLC is at the core complex level. - * Core complex ID is ApicId[3] for these processors. + * Hygons are similar to AMD Family 17h up to 1F models: LLC is + * at the core complex level. Core complex ID is ApicId[3]. */ c->topo.llc_id = c->topo.apicid >> 3; } void init_amd_cacheinfo(struct cpuinfo_x86 *c) { + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - num_cache_leaves = find_num_cache_leaves(c); - } else if (c->extended_cpuid_level >= 0x80000006) { - if (cpuid_edx(0x80000006) & 0xf000) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - } + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) + ci->num_leaves = find_num_cache_leaves(c); + else if (c->extended_cpuid_level >= 0x80000006) + ci->num_leaves = (cpuid_edx(0x80000006) & 0xf000) ? 4 : 3; } void init_hygon_cacheinfo(struct cpuinfo_x86 *c) { - num_cache_leaves = find_num_cache_leaves(c); + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); + + ci->num_leaves = find_num_cache_leaves(c); } -void init_intel_cacheinfo(struct cpuinfo_x86 *c) +static void intel_cacheinfo_done(struct cpuinfo_x86 *c, unsigned int l3, + unsigned int l2, unsigned int l1i, unsigned int l1d) { - /* Cache sizes */ - unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; - unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ - unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ - unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; + /* + * If llc_id is still unset, then cpuid_level < 4, which implies + * that the only possibility left is SMT. Since CPUID(0x2) doesn't + * specify any shared caches and SMT shares all caches, we can + * unconditionally set LLC ID to the package ID so that all + * threads share it. + */ + if (c->topo.llc_id == BAD_APICID) + c->topo.llc_id = c->topo.pkg_id; - if (c->cpuid_level > 3) { - static int is_initialized; + c->x86_cache_size = l3 ? l3 : (l2 ? l2 : l1i + l1d); - if (is_initialized == 0) { - /* Init num_cache_leaves from boot CPU */ - num_cache_leaves = find_num_cache_leaves(c); - is_initialized++; - } + if (!l2) + cpu_detect_cache_sizes(c); +} - /* - * Whenever possible use cpuid(4), deterministic cache - * parameters cpuid leaf to find the cache details - */ - for (i = 0; i < num_cache_leaves; i++) { - struct _cpuid4_info_regs this_leaf = {}; - int retval; +/* + * Legacy Intel CPUID(0x2) path if CPUID(0x4) is not available. + */ +static void intel_cacheinfo_0x2(struct cpuinfo_x86 *c) +{ + unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; + const struct leaf_0x2_table *desc; + union leaf_0x2_regs regs; + u8 *ptr; - retval = cpuid4_cache_lookup_regs(i, &this_leaf); - if (retval < 0) - continue; + if (c->cpuid_level < 2) + return; - switch (this_leaf.eax.split.level) { - case 1: - if (this_leaf.eax.split.type == CTYPE_DATA) - new_l1d = this_leaf.size/1024; - else if (this_leaf.eax.split.type == CTYPE_INST) - new_l1i = this_leaf.size/1024; - break; - case 2: - new_l2 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l2_id = c->topo.apicid & ~((1 << index_msb) - 1); - break; - case 3: - new_l3 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l3_id = c->topo.apicid & ~((1 << index_msb) - 1); - break; - default: - break; - } + cpuid_leaf_0x2(®s); + for_each_cpuid_0x2_desc(regs, ptr, desc) { + switch (desc->c_type) { + case CACHE_L1_INST: l1i += desc->c_size; break; + case CACHE_L1_DATA: l1d += desc->c_size; break; + case CACHE_L2: l2 += desc->c_size; break; + case CACHE_L3: l3 += desc->c_size; break; } } + + intel_cacheinfo_done(c, l3, l2, l1i, l1d); +} + +static unsigned int calc_cache_topo_id(struct cpuinfo_x86 *c, const struct _cpuid4_info *id4) +{ + unsigned int num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + return c->topo.apicid & ~((1 << index_msb) - 1); +} + +static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); + unsigned int l2_id = BAD_APICID, l3_id = BAD_APICID; + unsigned int l1d = 0, l1i = 0, l2 = 0, l3 = 0; + + if (c->cpuid_level < 4) + return false; + /* - * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for - * trace cache + * There should be at least one leaf. A non-zero value means + * that the number of leaves has been previously initialized. */ - if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { - /* supports eax=2 call */ - int j, n; - unsigned int regs[4]; - unsigned char *dp = (unsigned char *)regs; - int only_trace = 0; - - if (num_cache_leaves != 0 && c->x86 == 15) - only_trace = 1; - - /* Number of times to iterate */ - n = cpuid_eax(2) & 0xFF; - - for (i = 0 ; i < n ; i++) { - cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); - - /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) - if (regs[j] & (1 << 31)) - regs[j] = 0; - - /* Byte 0 is level count, not a descriptor */ - for (j = 1 ; j < 16 ; j++) { - unsigned char des = dp[j]; - unsigned char k = 0; - - /* look up this descriptor in the table */ - while (cache_table[k].descriptor != 0) { - if (cache_table[k].descriptor == des) { - if (only_trace && cache_table[k].cache_type != LVL_TRACE) - break; - switch (cache_table[k].cache_type) { - case LVL_1_INST: - l1i += cache_table[k].size; - break; - case LVL_1_DATA: - l1d += cache_table[k].size; - break; - case LVL_2: - l2 += cache_table[k].size; - break; - case LVL_3: - l3 += cache_table[k].size; - break; - } - - break; - } - - k++; - } - } - } - } + if (!ci->num_leaves) + ci->num_leaves = find_num_cache_leaves(c); - if (new_l1d) - l1d = new_l1d; + if (!ci->num_leaves) + return false; - if (new_l1i) - l1i = new_l1i; + for (int i = 0; i < ci->num_leaves; i++) { + struct _cpuid4_info id4 = {}; + int ret; - if (new_l2) { - l2 = new_l2; - c->topo.llc_id = l2_id; - c->topo.l2c_id = l2_id; - } + ret = intel_fill_cpuid4_info(i, &id4); + if (ret < 0) + continue; - if (new_l3) { - l3 = new_l3; - c->topo.llc_id = l3_id; + switch (id4.eax.split.level) { + case 1: + if (id4.eax.split.type == CTYPE_DATA) + l1d = id4.size / 1024; + else if (id4.eax.split.type == CTYPE_INST) + l1i = id4.size / 1024; + break; + case 2: + l2 = id4.size / 1024; + l2_id = calc_cache_topo_id(c, &id4); + break; + case 3: + l3 = id4.size / 1024; + l3_id = calc_cache_topo_id(c, &id4); + break; + default: + break; + } } - /* - * If llc_id is not yet set, this means cpuid_level < 4 which in - * turns means that the only possibility is SMT (as indicated in - * cpuid1). Since cpuid2 doesn't specify shared caches, and we know - * that SMT shares all caches, we can unconditionally set cpu_llc_id to - * c->topo.pkg_id. - */ - if (c->topo.llc_id == BAD_APICID) - c->topo.llc_id = c->topo.pkg_id; + c->topo.l2c_id = l2_id; + c->topo.llc_id = (l3_id == BAD_APICID) ? l2_id : l3_id; + intel_cacheinfo_done(c, l3, l2, l1i, l1d); + return true; +} - c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); +void init_intel_cacheinfo(struct cpuinfo_x86 *c) +{ + /* Don't use CPUID(0x2) if CPUID(0x4) is supported. */ + if (intel_cacheinfo_0x4(c)) + return; - if (!l2) - cpu_detect_cache_sizes(c); + intel_cacheinfo_0x2(c); } +/* + * <linux/cacheinfo.h> shared_cpu_map setup, AMD/Hygon + */ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, - struct _cpuid4_info_regs *base) + const struct _cpuid4_info *id4) { struct cpu_cacheinfo *this_cpu_ci; - struct cacheinfo *this_leaf; + struct cacheinfo *ci; int i, sibling; /* @@ -894,18 +492,18 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, this_cpu_ci = get_cpu_cacheinfo(i); if (!this_cpu_ci->info_list) continue; - this_leaf = this_cpu_ci->info_list + index; + + ci = this_cpu_ci->info_list + index; for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, &ci->shared_cpu_map); } } } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; - nshared = base->eax.split.num_threads_sharing + 1; + nshared = id4->eax.split.num_threads_sharing + 1; apicid = cpu_data(cpu).topo.apicid; first = apicid - (apicid % nshared); last = first + nshared - 1; @@ -919,14 +517,13 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, if ((apicid < first) || (apicid > last)) continue; - this_leaf = this_cpu_ci->info_list + index; + ci = this_cpu_ci->info_list + index; for_each_online_cpu(sibling) { apicid = cpu_data(sibling).topo.apicid; if ((apicid < first) || (apicid > last)) continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, &ci->shared_cpu_map); } } } else @@ -935,25 +532,27 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, return 1; } +/* + * <linux/cacheinfo.h> shared_cpu_map setup, Intel + fallback AMD/Hygon + */ static void __cache_cpumap_setup(unsigned int cpu, int index, - struct _cpuid4_info_regs *base) + const struct _cpuid4_info *id4) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf, *sibling_leaf; + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct cacheinfo *ci, *sibling_ci; unsigned long num_threads_sharing; int index_msb, i; - struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86_vendor == X86_VENDOR_AMD || - c->x86_vendor == X86_VENDOR_HYGON) { - if (__cache_amd_cpumap_setup(cpu, index, base)) + if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { + if (__cache_amd_cpumap_setup(cpu, index, id4)) return; } - this_leaf = this_cpu_ci->info_list + index; - num_threads_sharing = 1 + base->eax.split.num_threads_sharing; + ci = this_cpu_ci->info_list + index; + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; - cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); + cpumask_set_cpu(cpu, &ci->shared_cpu_map); if (num_threads_sharing == 1) return; @@ -963,78 +562,82 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) { struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); + /* Skip if itself or no cacheinfo */ if (i == cpu || !sib_cpu_ci->info_list) - continue;/* skip if itself or no cacheinfo */ - sibling_leaf = sib_cpu_ci->info_list + index; - cpumask_set_cpu(i, &this_leaf->shared_cpu_map); - cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); + continue; + + sibling_ci = sib_cpu_ci->info_list + index; + cpumask_set_cpu(i, &ci->shared_cpu_map); + cpumask_set_cpu(cpu, &sibling_ci->shared_cpu_map); } } -static void ci_leaf_init(struct cacheinfo *this_leaf, - struct _cpuid4_info_regs *base) +static void ci_info_init(struct cacheinfo *ci, const struct _cpuid4_info *id4, + struct amd_northbridge *nb) { - this_leaf->id = base->id; - this_leaf->attributes = CACHE_ID; - this_leaf->level = base->eax.split.level; - this_leaf->type = cache_type_map[base->eax.split.type]; - this_leaf->coherency_line_size = - base->ebx.split.coherency_line_size + 1; - this_leaf->ways_of_associativity = - base->ebx.split.ways_of_associativity + 1; - this_leaf->size = base->size; - this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1; - this_leaf->physical_line_partition = - base->ebx.split.physical_line_partition + 1; - this_leaf->priv = base->nb; + ci->id = id4->id; + ci->attributes = CACHE_ID; + ci->level = id4->eax.split.level; + ci->type = cache_type_map[id4->eax.split.type]; + ci->coherency_line_size = id4->ebx.split.coherency_line_size + 1; + ci->ways_of_associativity = id4->ebx.split.ways_of_associativity + 1; + ci->size = id4->size; + ci->number_of_sets = id4->ecx.split.number_of_sets + 1; + ci->physical_line_partition = id4->ebx.split.physical_line_partition + 1; + ci->priv = nb; } int init_cache_level(unsigned int cpu) { - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); - if (!num_cache_leaves) + /* There should be at least one leaf. */ + if (!ci->num_leaves) return -ENOENT; - if (!this_cpu_ci) - return -EINVAL; - this_cpu_ci->num_levels = 3; - this_cpu_ci->num_leaves = num_cache_leaves; + return 0; } /* - * The max shared threads number comes from CPUID.4:EAX[25-14] with input + * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input * ECX as cache index. Then right shift apicid by the number's order to get * cache id for this cache node. */ -static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) +static void get_cache_id(int cpu, struct _cpuid4_info *id4) { struct cpuinfo_x86 *c = &cpu_data(cpu); unsigned long num_threads_sharing; int index_msb; - num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - id4_regs->id = c->topo.apicid >> index_msb; + id4->id = c->topo.apicid >> index_msb; } int populate_cache_leaves(unsigned int cpu) { - unsigned int idx, ret; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; - struct _cpuid4_info_regs id4_regs = {}; + struct cacheinfo *ci = this_cpu_ci->info_list; + u8 cpu_vendor = boot_cpu_data.x86_vendor; + struct amd_northbridge *nb = NULL; + struct _cpuid4_info id4 = {}; + int idx, ret; for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { - ret = cpuid4_cache_lookup_regs(idx, &id4_regs); + ret = fill_cpuid4_info(idx, &id4); if (ret) return ret; - get_cache_id(cpu, &id4_regs); - ci_leaf_init(this_leaf++, &id4_regs); - __cache_cpumap_setup(cpu, idx, &id4_regs); + + get_cache_id(cpu, &id4); + + if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) + nb = amd_init_l3_cache(idx); + + ci_info_init(ci++, &id4, nb); + __cache_cpumap_setup(cpu, idx, &id4); } - this_cpu_ci->cpu_map_populated = true; + this_cpu_ci->cpu_map_populated = true; return 0; } @@ -1050,31 +653,33 @@ int populate_cache_leaves(unsigned int cpu) static unsigned long saved_cr4; static DEFINE_RAW_SPINLOCK(cache_disable_lock); +/* + * Cache flushing is the most time-consuming step when programming the + * MTRRs. On many Intel CPUs without known erratas, it can be skipped + * if the CPU declares cache self-snooping support. + */ +static void maybe_flush_caches(void) +{ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); +} + void cache_disable(void) __acquires(cache_disable_lock) { unsigned long cr0; /* - * Note that this is not ideal - * since the cache is only flushed/disabled for this CPU while the - * MTRRs are changed, but changing this requires more invasive - * changes to the way the kernel boots + * This is not ideal since the cache is only flushed/disabled + * for this CPU while the MTRRs are changed, but changing this + * requires more invasive changes to the way the kernel boots. */ - raw_spin_lock(&cache_disable_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); - /* - * Cache flushing is the most time-consuming step when programming - * the MTRRs. Fortunately, as per the Intel Software Development - * Manual, we can skip it if the processor supports cache self- - * snooping. - */ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); + maybe_flush_caches(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_feature_enabled(X86_FEATURE_PGE)) { @@ -1089,9 +694,7 @@ void cache_disable(void) __acquires(cache_disable_lock) if (cpu_feature_enabled(X86_FEATURE_MTRR)) mtrr_disable(); - /* Again, only flush caches if we have to. */ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); + maybe_flush_caches(); } void cache_enable(void) __releases(cache_disable_lock) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5c1e6d6be267..27125e009847 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -29,6 +29,7 @@ #include <asm/alternative.h> #include <asm/cmdline.h> +#include <asm/cpuid/api.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/doublefault.h> @@ -68,6 +69,8 @@ #include <asm/traps.h> #include <asm/sev.h> #include <asm/tdx.h> +#include <asm/posted_intr.h> +#include <asm/runtime-const.h> #include "cpu.h" @@ -114,17 +117,17 @@ static const struct x86_cpu_id ppin_cpuids[] = { X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]), /* Legacy models without CPUID enumeration */ - X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]), - X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_HASWELL_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]), + X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]), {} }; @@ -145,7 +148,7 @@ static void ppin_init(struct cpuinfo_x86 *c) */ info = (struct ppin_info *)id->driver_data; - if (rdmsrl_safe(info->msr_ppin_ctl, &val)) + if (rdmsrq_safe(info->msr_ppin_ctl, &val)) goto clear_ppin; if ((val & 3UL) == 1UL) { @@ -155,19 +158,19 @@ static void ppin_init(struct cpuinfo_x86 *c) /* If PPIN is disabled, try to enable */ if (!(val & 2UL)) { - wrmsrl_safe(info->msr_ppin_ctl, val | 2UL); - rdmsrl_safe(info->msr_ppin_ctl, &val); + wrmsrq_safe(info->msr_ppin_ctl, val | 2UL); + rdmsrq_safe(info->msr_ppin_ctl, &val); } /* Is the enable bit set? */ if (val & 2UL) { - c->ppin = __rdmsr(info->msr_ppin); + c->ppin = native_rdmsrq(info->msr_ppin); set_cpu_cap(c, info->feature); return; } clear_ppin: - clear_cpu_cap(c, info->feature); + setup_clear_cpu_cap(info->feature); } static void default_init(struct cpuinfo_x86 *c) @@ -239,6 +242,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +SYM_PIC_ALIAS(gdt_page); #ifdef CONFIG_X86_64 static int __init x86_nopcid_setup(char *s) @@ -274,21 +278,13 @@ static int __init x86_noinvpcid_setup(char *s) } early_param("noinvpcid", x86_noinvpcid_setup); -#ifdef CONFIG_X86_32 -static int cachesize_override = -1; -static int disable_x86_serial_nr = 1; - -static int __init cachesize_setup(char *str) -{ - get_option(&str, &cachesize_override); - return 1; -} -__setup("cachesize=", cachesize_setup); - /* Standard macro to see if a specific flag is changeable */ -static inline int flag_is_changeable_p(u32 flag) +static inline bool flag_is_changeable_p(unsigned long flag) { - u32 f1, f2; + unsigned long f1, f2; + + if (!IS_ENABLED(CONFIG_X86_32)) + return true; /* * Cyrix and IDT cpus allow disabling of CPUID @@ -311,11 +307,22 @@ static inline int flag_is_changeable_p(u32 flag) : "=&r" (f1), "=&r" (f2) : "ir" (flag)); - return ((f1^f2) & flag) != 0; + return (f1 ^ f2) & flag; } +#ifdef CONFIG_X86_32 +static int cachesize_override = -1; +static int disable_x86_serial_nr = 1; + +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + /* Probe for the CPUID instruction */ -int have_cpuid_p(void) +bool cpuid_feature(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -347,10 +354,6 @@ static int __init x86_serial_nr_setup(char *s) } __setup("serialnumber", x86_serial_nr_setup); #else -static inline int flag_is_changeable_p(u32 flag) -{ - return 1; -} static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { } @@ -560,9 +563,9 @@ __noendbr u64 ibt_save(bool disable) u64 msr = 0; if (cpu_feature_enabled(X86_FEATURE_IBT)) { - rdmsrl(MSR_IA32_S_CET, msr); + rdmsrq(MSR_IA32_S_CET, msr); if (disable) - wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); + wrmsrq(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); } return msr; @@ -573,10 +576,10 @@ __noendbr void ibt_restore(u64 save) u64 msr; if (cpu_feature_enabled(X86_FEATURE_IBT)) { - rdmsrl(MSR_IA32_S_CET, msr); + rdmsrq(MSR_IA32_S_CET, msr); msr &= ~CET_ENDBR_EN; msr |= (save & CET_ENDBR_EN); - wrmsrl(MSR_IA32_S_CET, msr); + wrmsrq(MSR_IA32_S_CET, msr); } } @@ -600,15 +603,15 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_USER_SHSTK); if (kernel_ibt) - wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN); + wrmsrq(MSR_IA32_S_CET, CET_ENDBR_EN); else - wrmsrl(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); cr4_set_bits(X86_CR4_CET); if (kernel_ibt && ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); - wrmsrl(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); } } @@ -619,8 +622,8 @@ __noendbr void cet_disable(void) cpu_feature_enabled(X86_FEATURE_SHSTK))) return; - wrmsrl(MSR_IA32_S_CET, 0); - wrmsrl(MSR_IA32_U_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_U_CET, 0); } /* @@ -635,9 +638,9 @@ struct cpuid_dependent_feature { static const struct cpuid_dependent_feature cpuid_dependent_features[] = { - { X86_FEATURE_MWAIT, 0x00000005 }, - { X86_FEATURE_DCA, 0x00000009 }, - { X86_FEATURE_XSAVE, 0x0000000d }, + { X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT }, + { X86_FEATURE_DCA, CPUID_LEAF_DCA }, + { X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE }, { 0, 0 } }; @@ -665,8 +668,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) if (!warn) continue; - pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", - x86_cap_flag(df->feature), df->level); + pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n", + x86_cap_flags[df->feature], df->level); } } @@ -749,9 +752,9 @@ void __init switch_gdt_and_percpu_base(int cpu) * No need to load %gs. It is already correct. * * Writing %gs on 64bit would zero GSBASE which would make any per - * CPU operation up to the point of the wrmsrl() fault. + * CPU operation up to the point of the wrmsrq() fault. * - * Set GSBASE to the new offset. Until the wrmsrl() happens the + * Set GSBASE to the new offset. Until the wrmsrq() happens the * early mapping is still valid. That means the GSBASE update will * lose any prior per CPU data which was not copied over in * setup_per_cpu_areas(). @@ -759,7 +762,7 @@ void __init switch_gdt_and_percpu_base(int cpu) * This works even with stackprotector enabled because the * per CPU stack canary is 0 in both per CPU areas. */ - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); + wrmsrq(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); #else /* * %fs is already set to __KERNEL_PERCPU, but after switching GDT @@ -844,13 +847,13 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) c->x86_cache_size = l2size; } -u16 __read_mostly tlb_lli_4k[NR_INFO]; -u16 __read_mostly tlb_lli_2m[NR_INFO]; -u16 __read_mostly tlb_lli_4m[NR_INFO]; -u16 __read_mostly tlb_lld_4k[NR_INFO]; -u16 __read_mostly tlb_lld_2m[NR_INFO]; -u16 __read_mostly tlb_lld_4m[NR_INFO]; -u16 __read_mostly tlb_lld_1g[NR_INFO]; +u16 __read_mostly tlb_lli_4k; +u16 __read_mostly tlb_lli_2m; +u16 __read_mostly tlb_lli_4m; +u16 __read_mostly tlb_lld_4k; +u16 __read_mostly tlb_lld_2m; +u16 __read_mostly tlb_lld_4m; +u16 __read_mostly tlb_lld_1g; static void cpu_detect_tlb(struct cpuinfo_x86 *c) { @@ -858,15 +861,13 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) this_cpu->c_detect_tlb(c); pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n", - tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], - tlb_lli_4m[ENTRIES]); + tlb_lli_4k, tlb_lli_2m, tlb_lli_4m); pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", - tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], - tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); + tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g); } -static void get_cpu_vendor(struct cpuinfo_x86 *c) +void get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; @@ -1005,17 +1006,18 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_D_1_EAX] = eax; } - /* AMD-defined flags: level 0x80000001 */ + /* + * Check if extended CPUID leaves are implemented: Max extended + * CPUID leaf must be in the 0x80000001-0x8000ffff range. + */ eax = cpuid_eax(0x80000000); - c->extended_cpuid_level = eax; + c->extended_cpuid_level = ((eax & 0xffff0000) == 0x80000000) ? eax : 0; - if ((eax & 0xffff0000) == 0x80000000) { - if (eax >= 0x80000001) { - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (c->extended_cpuid_level >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_8000_0001_ECX] = ecx; - c->x86_capability[CPUID_8000_0001_EDX] = edx; - } + c->x86_capability[CPUID_8000_0001_ECX] = ecx; + c->x86_capability[CPUID_8000_0001_EDX] = edx; } if (c->extended_cpuid_level >= 0x80000007) { @@ -1053,18 +1055,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c) void get_cpu_address_sizes(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; - bool vp_bits_from_cpuid = true; if (!cpu_has(c, X86_FEATURE_CPUID) || - (c->extended_cpuid_level < 0x80000008)) - vp_bits_from_cpuid = false; - - if (vp_bits_from_cpuid) { - cpuid(0x80000008, &eax, &ebx, &ecx, &edx); - - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } else { + (c->extended_cpuid_level < 0x80000008)) { if (IS_ENABLED(CONFIG_X86_64)) { c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -1078,14 +1071,23 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c) cpu_has(c, X86_FEATURE_PSE36)) c->x86_phys_bits = 36; } + } else { + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + + /* Provide a sane default if not enumerated: */ + if (!c->x86_clflush_size) + c->x86_clflush_size = 32; } + c->x86_cache_bits = c->x86_phys_bits; c->x86_cache_alignment = c->x86_clflush_size; } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) { -#ifdef CONFIG_X86_32 int i; /* @@ -1106,7 +1108,6 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) break; } } -#endif } #define NO_SPECULATION BIT(0) @@ -1120,12 +1121,13 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SPECTRE_V2 BIT(8) #define NO_MMIO BIT(9) #define NO_EIBRS_PBRSB BIT(10) +#define NO_BHI BIT(11) #define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) -#define VULNWL_INTEL(model, whitelist) \ - VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist) +#define VULNWL_INTEL(vfm, whitelist) \ + X86_MATCH_VFM(vfm, whitelist) #define VULNWL_AMD(family, whitelist) \ VULNWL(AMD, family, X86_MODEL_ANY, whitelist) @@ -1142,32 +1144,32 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ - VULNWL_INTEL(TIGERLAKE, NO_MMIO), - VULNWL_INTEL(TIGERLAKE_L, NO_MMIO), - VULNWL_INTEL(ALDERLAKE, NO_MMIO), - VULNWL_INTEL(ALDERLAKE_L, NO_MMIO), + VULNWL_INTEL(INTEL_TIGERLAKE, NO_MMIO), + VULNWL_INTEL(INTEL_TIGERLAKE_L, NO_MMIO), + VULNWL_INTEL(INTEL_ALDERLAKE, NO_MMIO), + VULNWL_INTEL(INTEL_ALDERLAKE_L, NO_MMIO), - VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(CORE_YONAH, NO_SSB), + VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2,NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), + VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_INTEL(INTEL_ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1177,33 +1179,34 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ - VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), - VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), - VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), + VULNWL_INTEL(INTEL_ATOM_TREMONT, NO_EIBRS_PBRSB), + VULNWL_INTEL(INTEL_ATOM_TREMONT_L, NO_EIBRS_PBRSB), + VULNWL_INTEL(INTEL_ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI), /* Zhaoxin Family 7 */ - VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), - VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO), + VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI), + VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI), {} }; #define VULNBL(vendor, family, model, blacklist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) -#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ - X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ - INTEL_FAM6_##model, steppings, \ - X86_FEATURE_ANY, issues) +#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \ + X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues) + +#define VULNBL_INTEL_TYPE(vfm, cpu_type, issues) \ + X86_MATCH_VFM_CPU_TYPE(vfm, INTEL_CPU_TYPE_##cpu_type, issues) #define VULNBL_AMD(family, blacklist) \ VULNBL(AMD, family, X86_MODEL_ANY, blacklist) @@ -1226,51 +1229,59 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define GDS BIT(6) /* CPU is affected by Register File Data Sampling */ #define RFDS BIT(7) +/* CPU is affected by Indirect Target Selection */ +#define ITS BIT(8) +/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */ +#define ITS_NATIVE_ONLY BIT(9) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { - VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), - VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), - VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), - VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), - VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS), - VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS), - VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(RAPTORLAKE_P, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(RAPTORLAKE_S, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS), - VULNBL_INTEL_STEPPINGS(ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS), + VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), + VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), + VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), + VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS), + VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), + VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS), + VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS), VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO), VULNBL_AMD(0x19, SRSO), + VULNBL_AMD(0x1a, SRSO), {} }; @@ -1283,25 +1294,25 @@ static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long whi u64 x86_read_arch_cap_msr(void) { - u64 ia32_cap = 0; + u64 x86_arch_cap_msr = 0; if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + rdmsrq(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); - return ia32_cap; + return x86_arch_cap_msr; } -static bool arch_cap_mmio_immune(u64 ia32_cap) +static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr) { - return (ia32_cap & ARCH_CAP_FBSDP_NO && - ia32_cap & ARCH_CAP_PSDP_NO && - ia32_cap & ARCH_CAP_SBDR_SSDP_NO); + return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO && + x86_arch_cap_msr & ARCH_CAP_PSDP_NO && + x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO); } -static bool __init vulnerable_to_rfds(u64 ia32_cap) +static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr) { /* The "immunity" bit trumps everything else: */ - if (ia32_cap & ARCH_CAP_RFDS_NO) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO) return false; /* @@ -1309,20 +1320,88 @@ static bool __init vulnerable_to_rfds(u64 ia32_cap) * indicate that mitigation is needed because guest is running on a * vulnerable hardware or may migrate to such hardware: */ - if (ia32_cap & ARCH_CAP_RFDS_CLEAR) + if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) return true; /* Only consult the blacklist when there is no enumeration: */ return cpu_matches(cpu_vuln_blacklist, RFDS); } +static bool __init vulnerable_to_its(u64 x86_arch_cap_msr) +{ + /* The "immunity" bit trumps everything else: */ + if (x86_arch_cap_msr & ARCH_CAP_ITS_NO) + return false; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + /* None of the affected CPUs have BHI_CTRL */ + if (boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + /* + * If a VMM did not expose ITS_NO, assume that a guest could + * be running on a vulnerable hardware or may migrate to such + * hardware. + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return true; + + if (cpu_matches(cpu_vuln_blacklist, ITS)) + return true; + + return false; +} + +static struct x86_cpu_id cpu_latest_microcode[] = { +#include "microcode/intel-ucode-defs.h" + {} +}; + +static bool __init cpu_has_old_microcode(void) +{ + const struct x86_cpu_id *m = x86_match_cpu(cpu_latest_microcode); + + /* Give unknown CPUs a pass: */ + if (!m) { + /* Intel CPUs should be in the list. Warn if not: */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + pr_info("x86/CPU: Model not found in latest microcode list\n"); + return false; + } + + /* + * Hosts usually lie to guests with a super high microcode + * version. Just ignore what hosts tell guests: + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + /* Consider all debug microcode to be old: */ + if (boot_cpu_data.microcode & BIT(31)) + return true; + + /* Give new microcode a pass: */ + if (boot_cpu_data.microcode >= m->driver_data) + return false; + + /* Uh oh, too old: */ + return true; +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { - u64 ia32_cap = x86_read_arch_cap_msr(); + u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); + + if (cpu_has_old_microcode()) { + pr_warn("x86/CPU: Running old microcode\n"); + setup_force_cpu_bug(X86_BUG_OLD_MICROCODE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && - !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO)) setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION)) @@ -1330,11 +1409,13 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) + if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2)) { setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2_USER); + } if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) && - !(ia32_cap & ARCH_CAP_SSB_NO) && + !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); @@ -1345,17 +1426,17 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Don't use AutoIBRS when SNP is enabled because it degrades host * userspace indirect branch performance. */ - if ((ia32_cap & ARCH_CAP_IBRS_ALL) || + if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) || (cpu_has(c, X86_FEATURE_AUTOIBRS) && !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) { setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && - !(ia32_cap & ARCH_CAP_PBRSB_NO)) + !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO)) setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); } if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) && - !(ia32_cap & ARCH_CAP_MDS_NO)) { + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) { setup_force_cpu_bug(X86_BUG_MDS); if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY)) setup_force_cpu_bug(X86_BUG_MSBDS_ONLY); @@ -1374,9 +1455,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * TSX_CTRL check alone is not sufficient for cases when the microcode * update is not present or running as guest that don't get TSX_CTRL. */ - if (!(ia32_cap & ARCH_CAP_TAA_NO) && + if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) && (cpu_has(c, X86_FEATURE_RTM) || - (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); /* @@ -1398,19 +1479,14 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Affected CPU list is generally enough to enumerate the vulnerability, * but for virtualization case check for ARCH_CAP MSR bits also, VMM may * not want the guest to enumerate the bug. - * - * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, - * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. */ - if (!arch_cap_mmio_immune(ia32_cap)) { + if (!arch_cap_mmio_immune(x86_arch_cap_msr)) { if (cpu_matches(cpu_vuln_blacklist, MMIO)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); - else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) - setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); } if (!cpu_has(c, X86_FEATURE_BTC_NO)) { - if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA)) + if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA)) setup_force_cpu_bug(X86_BUG_RETBLEED); } @@ -1428,18 +1504,37 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * disabling AVX2. The only way to do this in HW is to clear XCR0[2], * which means that AVX will be disabled. */ - if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) && + if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) && boot_cpu_has(X86_FEATURE_AVX)) setup_force_cpu_bug(X86_BUG_GDS); - if (vulnerable_to_rfds(ia32_cap)) + if (vulnerable_to_rfds(x86_arch_cap_msr)) setup_force_cpu_bug(X86_BUG_RFDS); + /* + * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with + * BHI_NO still need to use the BHI mitigation to prevent Intra-mode + * attacks. When virtualized, eIBRS could be hidden, assume vulnerable. + */ + if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) && + (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || + boot_cpu_has(X86_FEATURE_HYPERVISOR))) + setup_force_cpu_bug(X86_BUG_BHI); + + if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) + setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); + + if (vulnerable_to_its(x86_arch_cap_msr)) { + setup_force_cpu_bug(X86_BUG_ITS); + if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY)) + setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY); + } + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; /* Rogue Data Cache Load? No! */ - if (ia32_cap & ARCH_CAP_RDCL_NO) + if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO) return; setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); @@ -1468,66 +1563,38 @@ static void detect_nopl(void) #endif } -/* - * We parse cpu parameters early because fpu__init_system() is executed - * before parse_early_param(). - */ -static void __init cpu_parse_early_param(void) +static inline bool parse_set_clear_cpuid(char *arg, bool set) { - char arg[128]; - char *argptr = arg, *opt; - int arglen, taint = 0; + char *opt; + int taint = 0; -#ifdef CONFIG_X86_32 - if (cmdline_find_option_bool(boot_command_line, "no387")) -#ifdef CONFIG_MATH_EMULATION - setup_clear_cpu_cap(X86_FEATURE_FPU); -#else - pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); -#endif - - if (cmdline_find_option_bool(boot_command_line, "nofxsr")) - setup_clear_cpu_cap(X86_FEATURE_FXSR); -#endif - - if (cmdline_find_option_bool(boot_command_line, "noxsave")) - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - - if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - - if (cmdline_find_option_bool(boot_command_line, "noxsaves")) - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - - if (cmdline_find_option_bool(boot_command_line, "nousershstk")) - setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); - - arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); - if (arglen <= 0) - return; - - pr_info("Clearing CPUID bits:"); - - while (argptr) { + while (arg) { bool found __maybe_unused = false; unsigned int bit; - opt = strsep(&argptr, ","); + opt = strsep(&arg, ","); /* * Handle naked numbers first for feature flags which don't - * have names. + * have names. It doesn't make sense for a bug not to have a + * name so don't handle bug flags here. */ if (!kstrtouint(opt, 10, &bit)) { if (bit < NCAPINTS * 32) { + if (set) { + pr_warn("setcpuid: force-enabling CPU feature flag:"); + setup_force_cpu_cap(bit); + } else { + pr_warn("clearcpuid: force-disabling CPU feature flag:"); + setup_clear_cpu_cap(bit); + } /* empty-string, i.e., ""-defined feature flags */ if (!x86_cap_flags[bit]) - pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); + pr_cont(" %d:%d\n", bit >> 5, bit & 31); else - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + pr_cont(" %s\n", x86_cap_flags[bit]); - setup_clear_cpu_cap(bit); taint++; } /* @@ -1537,27 +1604,97 @@ static void __init cpu_parse_early_param(void) continue; } - for (bit = 0; bit < 32 * NCAPINTS; bit++) { - if (!x86_cap_flag(bit)) + for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) { + const char *flag; + const char *kind; + + if (bit < 32 * NCAPINTS) { + flag = x86_cap_flags[bit]; + kind = "feature"; + } else { + kind = "bug"; + flag = x86_bug_flags[bit - (32 * NCAPINTS)]; + } + + if (!flag) continue; - if (strcmp(x86_cap_flag(bit), opt)) + if (strcmp(flag, opt)) continue; - pr_cont(" %s", opt); - setup_clear_cpu_cap(bit); + if (set) { + pr_warn("setcpuid: force-enabling CPU %s flag: %s\n", + kind, flag); + setup_force_cpu_cap(bit); + } else { + pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n", + kind, flag); + setup_clear_cpu_cap(bit); + } taint++; found = true; break; } if (!found) - pr_cont(" (unknown: %s)", opt); + pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt); } - pr_cont("\n"); - if (taint) + return taint; +} + + +/* + * We parse cpu parameters early because fpu__init_system() is executed + * before parse_early_param(). + */ +static void __init cpu_parse_early_param(void) +{ + bool cpuid_taint = false; + char arg[128]; + int arglen; + +#ifdef CONFIG_X86_32 + if (cmdline_find_option_bool(boot_command_line, "no387")) +#ifdef CONFIG_MATH_EMULATION + setup_clear_cpu_cap(X86_FEATURE_FPU); +#else + pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); +#endif + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) + setup_clear_cpu_cap(X86_FEATURE_FXSR); +#endif + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + if (cmdline_find_option_bool(boot_command_line, "nousershstk")) + setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); + + /* Minimize the gap between FRED is available and available but disabled. */ + arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg)); + if (arglen != 2 || strncmp(arg, "on", 2)) + setup_clear_cpu_cap(X86_FEATURE_FRED); + + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); + if (arglen > 0) + cpuid_taint |= parse_set_clear_cpuid(arg, false); + + arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg)); + if (arglen > 0) + cpuid_taint |= parse_set_clear_cpuid(arg, true); + + if (cpuid_taint) { + pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n"); add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } } /* @@ -1574,13 +1711,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) memset(&c->x86_capability, 0, sizeof(c->x86_capability)); c->extended_cpuid_level = 0; - if (!have_cpuid_p()) + if (!cpuid_feature()) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (have_cpuid_p()) { + if (cpuid_feature()) { cpu_detect(c); get_cpu_vendor(c); + intel_unlock_cpuid_leafs(c); get_cpu_cap(c); setup_force_cpu_cap(X86_FEATURE_CPUID); get_cpu_address_sizes(c); @@ -1593,6 +1731,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) c->cpu_index = 0; filter_cpuid_features(c, false); + check_cpufeature_deps(c); if (this_cpu->c_bsp_init) this_cpu->c_bsp_init(c); @@ -1634,15 +1773,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) detect_nopl(); } -void __init early_cpu_init(void) +void __init init_cpu_devs(void) { const struct cpu_dev *const *cdev; int count = 0; -#ifdef CONFIG_PROCESSOR_SELECT - pr_info("KERNEL supported cpus:\n"); -#endif - for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { const struct cpu_dev *cpudev = *cdev; @@ -1650,20 +1785,30 @@ void __init early_cpu_init(void) break; cpu_devs[count] = cpudev; count++; + } +} +void __init early_cpu_init(void) +{ #ifdef CONFIG_PROCESSOR_SELECT - { - unsigned int j; - - for (j = 0; j < 2; j++) { - if (!cpudev->c_ident[j]) - continue; - pr_info(" %s %s\n", cpudev->c_vendor, - cpudev->c_ident[j]); - } - } + unsigned int i, j; + + pr_info("KERNEL supported cpus:\n"); #endif + + init_cpu_devs(); + +#ifdef CONFIG_PROCESSOR_SELECT + for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) { + for (j = 0; j < 2; j++) { + if (!cpu_devs[i]->c_ident[j]) + continue; + pr_info(" %s %s\n", cpu_devs[i]->c_vendor, + cpu_devs[i]->c_ident[j]); + } } +#endif + early_identify_cpu(&boot_cpu_data); } @@ -1685,11 +1830,11 @@ static bool detect_null_seg_behavior(void) */ unsigned long old_base, tmp; - rdmsrl(MSR_FS_BASE, old_base); - wrmsrl(MSR_FS_BASE, 1); + rdmsrq(MSR_FS_BASE, old_base); + wrmsrq(MSR_FS_BASE, 1); loadsegment(fs, 0); - rdmsrl(MSR_FS_BASE, tmp); - wrmsrl(MSR_FS_BASE, old_base); + rdmsrq(MSR_FS_BASE, tmp); + wrmsrq(MSR_FS_BASE, old_base); return tmp == 0; } @@ -1730,17 +1875,17 @@ static void generic_identify(struct cpuinfo_x86 *c) { c->extended_cpuid_level = 0; - if (!have_cpuid_p()) + if (!cpuid_feature()) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (!have_cpuid_p()) + if (!cpuid_feature()) return; cpu_detect(c); get_cpu_vendor(c); - + intel_unlock_cpuid_leafs(c); get_cpu_cap(c); get_cpu_address_sizes(c); @@ -1823,6 +1968,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_init) this_cpu->c_init(c); + bus_lock_init(); + /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); @@ -1845,6 +1992,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Filter out anything that depends on CPUID levels we don't have */ filter_cpuid_features(c, true); + /* Check for unmet dependencies based on the CPUID dependency table */ + check_cpufeature_deps(c); + /* If the model name is still unset, do table lookup. */ if (!c->x86_model_id[0]) { const char *p; @@ -1888,9 +2038,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) /* Init Machine Check Exception if available. */ mcheck_cpu_init(c); -#ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); -#endif } /* @@ -1915,9 +2063,9 @@ void enable_sep_cpu(void) */ tss->x86_tss.ss1 = __KERNEL_CS; - wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); + wrmsrq(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1); + wrmsrq(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); + wrmsrq(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32); put_cpu(); } @@ -1939,9 +2087,15 @@ static __init void identify_boot_cpu(void) lkgs_init(); } -void identify_secondary_cpu(struct cpuinfo_x86 *c) +void identify_secondary_cpu(unsigned int cpu) { - BUG_ON(c == &boot_cpu_data); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* Copy boot_cpu_data only on the first bringup */ + if (!c->initialized) + *c = boot_cpu_data; + c->cpu_index = cpu; + identify_cpu(c); #ifdef CONFIG_X86_32 enable_sep_cpu(); @@ -1952,6 +2106,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) update_gds_msr(); tsx_ap_init(); + c->initialized = true; } void print_cpu_info(struct cpuinfo_x86 *c) @@ -1982,29 +2137,42 @@ void print_cpu_info(struct cpuinfo_x86 *c) } /* - * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy - * function prevents it from becoming an environment variable for init. + * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param(). + * These dummy functions prevent them from becoming an environment variable for + * init. */ + static __init int setup_clearcpuid(char *arg) { return 1; } __setup("clearcpuid=", setup_clearcpuid); -DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { - .current_task = &init_task, - .preempt_count = INIT_PREEMPT_COUNT, - .top_of_stack = TOP_OF_INIT_STACK, -}; -EXPORT_PER_CPU_SYMBOL(pcpu_hot); -EXPORT_PER_CPU_SYMBOL(const_pcpu_hot); +static __init int setup_setcpuid(char *arg) +{ + return 1; +} +__setup("setcpuid=", setup_setcpuid); + +DEFINE_PER_CPU_CACHE_HOT(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); +EXPORT_PER_CPU_SYMBOL(const_current_task); + +DEFINE_PER_CPU_CACHE_HOT(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); + +DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK; #ifdef CONFIG_X86_64 -DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, - fixed_percpu_data) __aligned(PAGE_SIZE) __visible; -EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); +/* + * Note: Do not make this dependant on CONFIG_MITIGATION_CALL_DEPTH_TRACKING + * so that this space is reserved in the hot cache section even when the + * mitigation is disabled. + */ +DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth); +EXPORT_PER_CPU_SYMBOL(__x86_call_depth); -static void wrmsrl_cstar(unsigned long val) +static void wrmsrq_cstar(unsigned long val) { /* * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR @@ -2012,37 +2180,37 @@ static void wrmsrl_cstar(unsigned long val) * guest. Avoid the pointless write on all Intel CPUs. */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - wrmsrl(MSR_CSTAR, val); + wrmsrq(MSR_CSTAR, val); } static inline void idt_syscall_init(void) { - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrq(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); if (ia32_enabled()) { - wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); + wrmsrq_cstar((unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. * This does not cause SYSENTER to jump to the wrong location, because * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrq_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + wrmsrq_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); } else { - wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore); - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); + wrmsrq_cstar((unsigned long)entry_SYSCALL32_ignore); + wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrq_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrq_safe(MSR_IA32_SYSENTER_EIP, 0ULL); } /* * Flags to clear on syscall; clear as much as possible * to minimize user space-kernel interference. */ - wrmsrl(MSR_SYSCALL_MASK, + wrmsrq(MSR_SYSCALL_MASK, X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF| X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF| X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF| @@ -2066,30 +2234,25 @@ void syscall_init(void) if (!cpu_feature_enabled(X86_FEATURE_FRED)) idt_syscall_init(); } - -#else /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_STACKPROTECTOR -DEFINE_PER_CPU(unsigned long, __stack_chk_guard); +DEFINE_PER_CPU_CACHE_HOT(unsigned long, __stack_chk_guard); +#ifndef CONFIG_SMP EXPORT_PER_CPU_SYMBOL(__stack_chk_guard); #endif +#endif -#endif /* CONFIG_X86_64 */ - -/* - * Clear all 6 debug registers: - */ -static void clear_all_debug_regs(void) +static void initialize_debug_regs(void) { - int i; - - for (i = 0; i < 8; i++) { - /* Ignore db4, db5 */ - if ((i == 4) || (i == 5)) - continue; - - set_debugreg(0, i); - } + /* Control register first -- to make sure everything is disabled. */ + set_debugreg(DR7_FIXED_1, 7); + set_debugreg(DR6_RESERVED, 6); + /* dr5 and dr4 don't exist */ + set_debugreg(0, 3); + set_debugreg(0, 2); + set_debugreg(0, 1); + set_debugreg(0, 0); } #ifdef CONFIG_KGDB @@ -2112,7 +2275,7 @@ static inline void setup_getcpu(int cpu) struct desc_struct d = { }; if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) - wrmsr(MSR_TSC_AUX, cpudata, 0); + wrmsrq(MSR_TSC_AUX, cpudata); /* Store CPU and node number in limit. */ d.limit0 = cpudata; @@ -2162,7 +2325,7 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss) * Setup everything needed to handle exceptions from the IDT, including the IST * exceptions which use paranoid_entry(). */ -void cpu_init_exception_handling(void) +void cpu_init_exception_handling(bool boot_cpu) { struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); int cpu = raw_smp_processor_id(); @@ -2181,10 +2344,23 @@ void cpu_init_exception_handling(void) /* GHCB needs to be setup to handle #VC. */ setup_ghcb(); + if (cpu_feature_enabled(X86_FEATURE_FRED)) { + /* The boot CPU has enabled FRED during early boot */ + if (!boot_cpu) + cpu_init_fred_exceptions(); + + cpu_init_fred_rsps(); + } else { + load_current_idt(); + } +} + +void __init cpu_init_replace_early_idt(void) +{ if (cpu_feature_enabled(X86_FEATURE_FRED)) cpu_init_fred_exceptions(); else - load_current_idt(); + idt_setup_early_pf(); } /* @@ -2214,11 +2390,13 @@ void cpu_init(void) memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); syscall_init(); - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); + wrmsrq(MSR_FS_BASE, 0); + wrmsrq(MSR_KERNEL_GS_BASE, 0); barrier(); x2apic_setup(); + + intel_posted_msi_init(); } mmgrab(&init_mm); @@ -2235,7 +2413,7 @@ void cpu_init(void) load_mm_ldt(&init_mm); - clear_all_debug_regs(); + initialize_debug_regs(); dbg_restore_debug_regs(); doublefault_init_cpu_tss(); @@ -2357,6 +2535,15 @@ void __init arch_cpu_finalize_init(void) alternative_instructions(); if (IS_ENABLED(CONFIG_X86_64)) { + unsigned long USER_PTR_MAX = TASK_SIZE_MAX; + + /* + * Enable this when LAM is gated on LASS support + if (cpu_feature_enabled(X86_FEATURE_LAM)) + USER_PTR_MAX = (1ul << 63) - PAGE_SIZE; + */ + runtime_const_init(ptr, USER_PTR_MAX); + /* * Make sure the first 2MB area is not mapped by huge pages * There are typically fixed size MTRRs in there and overlapping diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index ea9e07d57c8d..bc38b2d56f26 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -33,14 +33,6 @@ struct cpu_dev { #endif }; -struct _tlb_table { - unsigned char descriptor; - char tlb_type; - unsigned int entries; - /* unsigned int ways; */ - char info[128]; -}; - #define cpu_dev_register(cpu_devX) \ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ __section(".x86_cpu_dev.init") = \ @@ -61,9 +53,11 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; extern void __init tsx_init(void); void tsx_ap_init(void); +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c); #else static inline void tsx_init(void) { } static inline void tsx_ap_init(void) { } +static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { } #endif /* CONFIG_CPU_SUP_INTEL */ extern void init_spectral_chicken(struct cpuinfo_x86 *c); @@ -81,6 +75,15 @@ extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c); +#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) +struct amd_northbridge *amd_init_l3_cache(int index); +#else +static inline struct amd_northbridge *amd_init_l3_cache(int index) +{ + return NULL; +} +#endif + unsigned int aperfmperf_get_khz(int cpu); void cpu_select_mitigations(void); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index b7174209d855..46efcbd6afa4 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -28,6 +28,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_APX, X86_FEATURE_XSAVE }, { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, { X86_FEATURE_MMX, X86_FEATURE_FXSR }, { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, @@ -44,7 +45,11 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, { X86_FEATURE_AES, X86_FEATURE_XMM2 }, { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_GFNI, X86_FEATURE_XMM2 }, + { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX }, { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_VAES, X86_FEATURE_AVX }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX }, { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, @@ -56,9 +61,6 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, - { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, - { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, @@ -81,9 +83,12 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, + { X86_FEATURE_AMX_FP16, X86_FEATURE_AMX_TILE }, + { X86_FEATURE_AMX_BF16, X86_FEATURE_AMX_TILE }, + { X86_FEATURE_AMX_INT8, X86_FEATURE_AMX_TILE }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, { X86_FEATURE_FRED, X86_FEATURE_LKGS }, - { X86_FEATURE_FRED, X86_FEATURE_WRMSRNS }, + { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL }, {} }; @@ -114,6 +119,9 @@ static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) if (WARN_ON(feature >= MAX_FEATURE_BITS)) return; + if (boot_cpu_has(feature)) + WARN_ON(alternatives_patched); + clear_feature(c, feature); /* Collect all features to disable, handling dependencies */ @@ -144,3 +152,38 @@ void setup_clear_cpu_cap(unsigned int feature) { do_clear_cpu_cap(NULL, feature); } + +/* + * Return the feature "name" if available, otherwise return + * the X86_FEATURE_* numerals to make it easier to identify + * the feature. + */ +static const char *x86_feature_name(unsigned int feature, char *buf) +{ + if (x86_cap_flags[feature]) + return x86_cap_flags[feature]; + + snprintf(buf, 16, "%d*32+%2d", feature / 32, feature % 32); + + return buf; +} + +void check_cpufeature_deps(struct cpuinfo_x86 *c) +{ + char feature_buf[16], depends_buf[16]; + const struct cpuid_dep *d; + + for (d = cpuid_deps; d->feature; d++) { + if (cpu_has(c, d->feature) && !cpu_has(c, d->depends)) { + /* + * Only warn about the first unmet dependency on the + * first CPU where it is encountered to avoid spamming + * the kernel log. + */ + pr_warn_once("x86 CPU feature dependency check failure: CPU%d has '%s' enabled but '%s' disabled. Kernel might be fine, but no guarantees.\n", + smp_processor_id(), + x86_feature_name(d->feature, feature_buf), + x86_feature_name(d->depends, depends_buf)); + } + } +} diff --git a/arch/x86/kernel/cpu/cpuid_0x2_table.c b/arch/x86/kernel/cpu/cpuid_0x2_table.c new file mode 100644 index 000000000000..89bc8db5e9c6 --- /dev/null +++ b/arch/x86/kernel/cpu/cpuid_0x2_table.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/sizes.h> + +#include <asm/cpuid/types.h> + +#include "cpu.h" + +#define CACHE_ENTRY(_desc, _type, _size) \ + [_desc] = { \ + .c_type = (_type), \ + .c_size = (_size) / SZ_1K, \ + } + +#define TLB_ENTRY(_desc, _type, _entries) \ + [_desc] = { \ + .t_type = (_type), \ + .entries = (_entries), \ + } + +const struct leaf_0x2_table cpuid_0x2_table[256] = { + CACHE_ENTRY(0x06, CACHE_L1_INST, SZ_8K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x08, CACHE_L1_INST, SZ_16K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x09, CACHE_L1_INST, SZ_32K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x0a, CACHE_L1_DATA, SZ_8K ), /* 2 way set assoc, 32 byte line size */ + CACHE_ENTRY(0x0c, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x0d, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x0e, CACHE_L1_DATA, SZ_24K ), /* 6-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x21, CACHE_L2, SZ_256K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x22, CACHE_L3, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x23, CACHE_L3, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x25, CACHE_L3, SZ_2M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x29, CACHE_L3, SZ_4M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x2c, CACHE_L1_DATA, SZ_32K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x30, CACHE_L1_INST, SZ_32K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x39, CACHE_L2, SZ_128K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3a, CACHE_L2, SZ_192K ), /* 6-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3b, CACHE_L2, SZ_128K ), /* 2-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3c, CACHE_L2, SZ_256K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3d, CACHE_L2, SZ_384K ), /* 6-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3e, CACHE_L2, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3f, CACHE_L2, SZ_256K ), /* 2-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x41, CACHE_L2, SZ_128K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x42, CACHE_L2, SZ_256K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x43, CACHE_L2, SZ_512K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x44, CACHE_L2, SZ_1M ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x45, CACHE_L2, SZ_2M ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x46, CACHE_L3, SZ_4M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x47, CACHE_L3, SZ_8M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x48, CACHE_L2, SZ_3M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x49, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4a, CACHE_L3, SZ_6M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4b, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4c, CACHE_L3, SZ_12M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4d, CACHE_L3, SZ_16M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4e, CACHE_L2, SZ_6M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x60, CACHE_L1_DATA, SZ_16K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x66, CACHE_L1_DATA, SZ_8K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x67, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x68, CACHE_L1_DATA, SZ_32K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x78, CACHE_L2, SZ_1M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x79, CACHE_L2, SZ_128K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7a, CACHE_L2, SZ_256K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7b, CACHE_L2, SZ_512K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7c, CACHE_L2, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7d, CACHE_L2, SZ_2M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x7f, CACHE_L2, SZ_512K ), /* 2-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x80, CACHE_L2, SZ_512K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x82, CACHE_L2, SZ_256K ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x83, CACHE_L2, SZ_512K ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x84, CACHE_L2, SZ_1M ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x85, CACHE_L2, SZ_2M ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x86, CACHE_L2, SZ_512K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x87, CACHE_L2, SZ_1M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd0, CACHE_L3, SZ_512K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd1, CACHE_L3, SZ_1M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd2, CACHE_L3, SZ_2M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd6, CACHE_L3, SZ_1M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd7, CACHE_L3, SZ_2M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd8, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xdc, CACHE_L3, SZ_2M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xdd, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xde, CACHE_L3, SZ_8M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe2, CACHE_L3, SZ_2M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe3, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe4, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xea, CACHE_L3, SZ_12M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xeb, CACHE_L3, SZ_18M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xec, CACHE_L3, SZ_24M ), /* 24-way set assoc, 64 byte line size */ + + TLB_ENTRY( 0x01, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0x02, TLB_INST_4M, 2 ), /* TLB_INST 4 MByte pages, full associative */ + TLB_ENTRY( 0x03, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0x04, TLB_DATA_4M, 8 ), /* TLB_DATA 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x05, TLB_DATA_4M, 32 ), /* TLB_DATA 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x0b, TLB_INST_4M, 4 ), /* TLB_INST 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x4f, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages */ + TLB_ENTRY( 0x50, TLB_INST_ALL, 64 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x51, TLB_INST_ALL, 128 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x52, TLB_INST_ALL, 256 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x55, TLB_INST_2M_4M, 7 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + TLB_ENTRY( 0x56, TLB_DATA0_4M, 16 ), /* TLB_DATA0 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x57, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0x59, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, fully associative */ + TLB_ENTRY( 0x5a, TLB_DATA0_2M_4M, 32 ), /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x5b, TLB_DATA_4K_4M, 64 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x5c, TLB_DATA_4K_4M, 128 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x5d, TLB_DATA_4K_4M, 256 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x61, TLB_INST_4K, 48 ), /* TLB_INST 4 KByte pages, full associative */ + TLB_ENTRY( 0x63, TLB_DATA_1G_2M_4M, 4 ), /* TLB_DATA 1 GByte pages, 4-way set associative + * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */ + TLB_ENTRY( 0x6b, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 8-way associative */ + TLB_ENTRY( 0x6c, TLB_DATA_2M_4M, 128 ), /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */ + TLB_ENTRY( 0x6d, TLB_DATA_1G, 16 ), /* TLB_DATA 1 GByte pages, fully associative */ + TLB_ENTRY( 0x76, TLB_INST_2M_4M, 8 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + TLB_ENTRY( 0xb0, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb1, TLB_INST_2M_4M, 4 ), /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */ + TLB_ENTRY( 0xb2, TLB_INST_4K, 64 ), /* TLB_INST 4KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb3, TLB_DATA_4K, 128 ), /* TLB_DATA 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb4, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0xb5, TLB_INST_4K, 64 ), /* TLB_INST 4 KByte pages, 8-way set associative */ + TLB_ENTRY( 0xb6, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 8-way set associative */ + TLB_ENTRY( 0xba, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0xc0, TLB_DATA_4K_4M, 8 ), /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */ + TLB_ENTRY( 0xc1, STLB_4K_2M, 1024 ), /* STLB 4 KByte and 2 MByte pages, 8-way associative */ + TLB_ENTRY( 0xc2, TLB_DATA_2M_4M, 16 ), /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */ + TLB_ENTRY( 0xca, STLB_4K, 512 ), /* STLB 4 KByte pages, 4-way associative */ +}; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 9651275aecd1..dfec2c61e354 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -153,8 +153,8 @@ static void geode_configure(void) u8 ccr3; local_irq_save(flags); - /* Suspend on halt power saving and enable #SUSP pin */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + /* Suspend on halt power saving */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index 3baf3e435834..1976fef2dfe5 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -16,14 +16,16 @@ static int cpu_debug_show(struct seq_file *m, void *p) if (!c->initialized) return 0; - seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid); - seq_printf(m, "apicid: %x\n", c->topo.apicid); + seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid); + seq_printf(m, "apicid: 0x%x\n", c->topo.apicid); seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id); seq_printf(m, "die_id: %u\n", c->topo.die_id); seq_printf(m, "cu_id: %u\n", c->topo.cu_id); seq_printf(m, "core_id: %u\n", c->topo.core_id); + seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); + seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); seq_printf(m, "llc_id: %u\n", c->topo.llc_id); seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 1640ae76548f..d69757246bde 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -4,6 +4,7 @@ #include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/msr-index.h> +#include <asm/msr.h> #include <asm/processor.h> #include <asm/vmx.h> @@ -118,7 +119,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) bool enable_vmx; u64 msr; - if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { + if (rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); clear_cpu_cap(c, X86_FEATURE_SGX); return; @@ -165,7 +166,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) msr |= FEAT_CTL_SGX_LC_ENABLED; } - wrmsrl(MSR_IA32_FEAT_CTL, msr); + wrmsrq(MSR_IA32_FEAT_CTL, msr); update_caps: set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL); @@ -188,7 +189,7 @@ update_caps: update_sgx: if (!(msr & FEAT_CTL_SGX_ENABLED)) { if (enable_sgx_kvm || enable_sgx_driver) - pr_err_once("SGX disabled by BIOS.\n"); + pr_err_once("SGX disabled or unsupported by BIOS.\n"); clear_cpu_cap(c, X86_FEATURE_SGX); return; } diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index c5191b06f9f2..2154f12766fb 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -15,6 +15,7 @@ #include <asm/cacheinfo.h> #include <asm/spec-ctrl.h> #include <asm/delay.h> +#include <asm/msr.h> #include "cpu.h" @@ -96,7 +97,7 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { u64 val; - rdmsrl(MSR_K7_HWCR, val); + rdmsrq(MSR_K7_HWCR, val); if (!(val & BIT(24))) pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } @@ -110,7 +111,7 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) * Try to cache the base value so further operations can * avoid RMW. If that faults, do not enable SSBD. */ - if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); setup_force_cpu_cap(X86_FEATURE_SSBD); x86_amd_ls_cfg_ssbd_mask = 1ULL << 10; @@ -194,7 +195,7 @@ static void init_hygon(struct cpuinfo_x86 *c) init_hygon_cacheinfo(c); if (cpu_has(c, X86_FEATURE_SVM)) { - rdmsrl(MSR_VM_CR, vm_cr); + rdmsrq(MSR_VM_CR, vm_cr); if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); clear_cpu_cap(c, X86_FEATURE_SVM); @@ -240,26 +241,26 @@ static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; - tlb_lli_4k[ENTRIES] = ebx & mask; + tlb_lld_4k = (ebx >> 16) & mask; + tlb_lli_4k = ebx & mask; /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!((eax >> 16) & mask)) - tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff; else - tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + tlb_lld_2m = (eax >> 16) & mask; /* a 4M entry uses two 2M entries */ - tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + tlb_lld_4m = tlb_lld_2m >> 1; /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!(eax & mask)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); - tlb_lli_2m[ENTRIES] = eax & 0xff; + tlb_lli_2m = eax & 0xff; } else - tlb_lli_2m[ENTRIES] = eax & mask; + tlb_lli_2m = eax & mask; - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + tlb_lli_4m = tlb_lli_2m >> 1; } static const struct cpu_dev hygon_cpu_dev = { diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index be30d7fa2e66..076eaa41b8c8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,68 +1,33 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/kernel.h> -#include <linux/pgtable.h> -#include <linux/string.h> #include <linux/bitops.h> -#include <linux/smp.h> -#include <linux/sched.h> -#include <linux/sched/clock.h> -#include <linux/semaphore.h> -#include <linux/thread_info.h> #include <linux/init.h> -#include <linux/uaccess.h> -#include <linux/workqueue.h> -#include <linux/delay.h> -#include <linux/cpuhotplug.h> +#include <linux/kernel.h> +#include <linux/minmax.h> +#include <linux/smp.h> +#include <linux/string.h> +#include <linux/types.h> + +#ifdef CONFIG_X86_64 +#include <linux/topology.h> +#endif -#include <asm/cpufeature.h> -#include <asm/msr.h> #include <asm/bugs.h> +#include <asm/cpu_device_id.h> +#include <asm/cpufeature.h> #include <asm/cpu.h> +#include <asm/cpuid/api.h> +#include <asm/hwcap2.h> #include <asm/intel-family.h> #include <asm/microcode.h> -#include <asm/hwcap2.h> -#include <asm/elf.h> -#include <asm/cpu_device_id.h> -#include <asm/cmdline.h> -#include <asm/traps.h> -#include <asm/resctrl.h> +#include <asm/msr.h> #include <asm/numa.h> +#include <asm/resctrl.h> #include <asm/thermal.h> - -#ifdef CONFIG_X86_64 -#include <linux/topology.h> -#endif +#include <asm/uaccess.h> #include "cpu.h" -#ifdef CONFIG_X86_LOCAL_APIC -#include <asm/mpspec.h> -#include <asm/apic.h> -#endif - -enum split_lock_detect_state { - sld_off = 0, - sld_warn, - sld_fatal, - sld_ratelimit, -}; - -/* - * Default to sld_off because most systems do not support split lock detection. - * sld_state_setup() will switch this to sld_warn on systems that support - * split lock/bus lock detect, unless there is a command line override. - */ -static enum split_lock_detect_state sld_state __ro_after_init = sld_off; -static u64 msr_test_ctrl_cache __ro_after_init; - -/* - * With a name like MSR_TEST_CTL it should go without saying, but don't touch - * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it - * on CPUs that do not support SLD can cause fireworks, even when writing '0'. - */ -static bool cpu_model_supports_sld __ro_after_init; - /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists @@ -72,19 +37,19 @@ static bool cpu_model_supports_sld __ro_after_init; */ static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c) { - switch (c->x86_model) { - case INTEL_FAM6_CORE_YONAH: - case INTEL_FAM6_CORE2_MEROM: - case INTEL_FAM6_CORE2_MEROM_L: - case INTEL_FAM6_CORE2_PENRYN: - case INTEL_FAM6_CORE2_DUNNINGTON: - case INTEL_FAM6_NEHALEM: - case INTEL_FAM6_NEHALEM_G: - case INTEL_FAM6_NEHALEM_EP: - case INTEL_FAM6_NEHALEM_EX: - case INTEL_FAM6_WESTMERE: - case INTEL_FAM6_WESTMERE_EP: - case INTEL_FAM6_SANDYBRIDGE: + switch (c->x86_vfm) { + case INTEL_CORE_YONAH: + case INTEL_CORE2_MEROM: + case INTEL_CORE2_MEROM_L: + case INTEL_CORE2_PENRYN: + case INTEL_CORE2_DUNNINGTON: + case INTEL_NEHALEM: + case INTEL_NEHALEM_G: + case INTEL_NEHALEM_EP: + case INTEL_NEHALEM_EX: + case INTEL_WESTMERE: + case INTEL_WESTMERE_EP: + case INTEL_SANDYBRIDGE: setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP); } } @@ -106,9 +71,9 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) */ if (c->x86 != 6) return; - switch (c->x86_model) { - case INTEL_FAM6_XEON_PHI_KNL: - case INTEL_FAM6_XEON_PHI_KNM: + switch (c->x86_vfm) { + case INTEL_XEON_PHI_KNL: + case INTEL_XEON_PHI_KNM: break; default: return; @@ -134,32 +99,32 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) * - Release note from 20180108 microcode release */ struct sku_microcode { - u8 model; + u32 vfm; u8 stepping; u32 microcode; }; static const struct sku_microcode spectre_bad_microcodes[] = { - { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 }, - { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 }, - { INTEL_FAM6_KABYLAKE, 0x09, 0x80 }, - { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 }, - { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 }, - { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, - { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, - { INTEL_FAM6_BROADWELL, 0x04, 0x28 }, - { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b }, - { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 }, - { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 }, - { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 }, - { INTEL_FAM6_HASWELL_L, 0x01, 0x21 }, - { INTEL_FAM6_HASWELL_G, 0x01, 0x18 }, - { INTEL_FAM6_HASWELL, 0x03, 0x23 }, - { INTEL_FAM6_HASWELL_X, 0x02, 0x3b }, - { INTEL_FAM6_HASWELL_X, 0x04, 0x10 }, - { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a }, + { INTEL_KABYLAKE, 0x0B, 0x80 }, + { INTEL_KABYLAKE, 0x0A, 0x80 }, + { INTEL_KABYLAKE, 0x09, 0x80 }, + { INTEL_KABYLAKE_L, 0x0A, 0x80 }, + { INTEL_KABYLAKE_L, 0x09, 0x80 }, + { INTEL_SKYLAKE_X, 0x03, 0x0100013e }, + { INTEL_SKYLAKE_X, 0x04, 0x0200003c }, + { INTEL_BROADWELL, 0x04, 0x28 }, + { INTEL_BROADWELL_G, 0x01, 0x1b }, + { INTEL_BROADWELL_D, 0x02, 0x14 }, + { INTEL_BROADWELL_D, 0x03, 0x07000011 }, + { INTEL_BROADWELL_X, 0x01, 0x0b000025 }, + { INTEL_HASWELL_L, 0x01, 0x21 }, + { INTEL_HASWELL_G, 0x01, 0x18 }, + { INTEL_HASWELL, 0x03, 0x23 }, + { INTEL_HASWELL_X, 0x02, 0x3b }, + { INTEL_HASWELL_X, 0x04, 0x10 }, + { INTEL_IVYBRIDGE_X, 0x04, 0x42a }, /* Observed in the wild */ - { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b }, - { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 }, + { INTEL_SANDYBRIDGE_X, 0x06, 0x61b }, + { INTEL_SANDYBRIDGE_X, 0x07, 0x712 }, }; static bool bad_spectre_microcode(struct cpuinfo_x86 *c) @@ -173,11 +138,8 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_HYPERVISOR)) return false; - if (c->x86 != 6) - return false; - for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { - if (c->x86_model == spectre_bad_microcodes[i].model && + if (c->x86_vfm == spectre_bad_microcodes[i].vfm && c->x86_stepping == spectre_bad_microcodes[i].stepping) return (c->microcode <= spectre_bad_microcodes[i].microcode); } @@ -190,101 +152,57 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) #define TME_ACTIVATE_LOCKED(x) (x & 0x1) #define TME_ACTIVATE_ENABLED(x) (x & 0x2) -#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */ -#define TME_ACTIVATE_POLICY_AES_XTS_128 0 - #define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */ -#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */ -#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1 - -/* Values for mktme_status (SW only construct) */ -#define MKTME_ENABLED 0 -#define MKTME_DISABLED 1 -#define MKTME_UNINITIALIZED 2 -static int mktme_status = MKTME_UNINITIALIZED; - static void detect_tme_early(struct cpuinfo_x86 *c) { - u64 tme_activate, tme_policy, tme_crypto_algs; - int keyid_bits = 0, nr_keyids = 0; - static u64 tme_activate_cpu0 = 0; - - rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + u64 tme_activate; + int keyid_bits; - if (mktme_status != MKTME_UNINITIALIZED) { - if (tme_activate != tme_activate_cpu0) { - /* Broken BIOS? */ - pr_err_once("x86/tme: configuration is inconsistent between CPUs\n"); - pr_err_once("x86/tme: MKTME is not usable\n"); - mktme_status = MKTME_DISABLED; - - /* Proceed. We may need to exclude bits from x86_phys_bits. */ - } - } else { - tme_activate_cpu0 = tme_activate; - } + rdmsrq(MSR_IA32_TME_ACTIVATE, tme_activate); if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { pr_info_once("x86/tme: not enabled by BIOS\n"); - mktme_status = MKTME_DISABLED; + clear_cpu_cap(c, X86_FEATURE_TME); return; } + pr_info_once("x86/tme: enabled by BIOS\n"); + keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); + if (!keyid_bits) + return; - if (mktme_status != MKTME_UNINITIALIZED) - goto detect_keyid_bits; - - pr_info("x86/tme: enabled by BIOS\n"); - - tme_policy = TME_ACTIVATE_POLICY(tme_activate); - if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128) - pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy); + /* + * KeyID bits are set by BIOS and can be present regardless + * of whether the kernel is using them. They effectively lower + * the number of physical address bits. + * + * Update cpuinfo_x86::x86_phys_bits accordingly. + */ + c->x86_phys_bits -= keyid_bits; + pr_info_once("x86/mktme: BIOS enabled: x86_phys_bits reduced by %d\n", + keyid_bits); +} - tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate); - if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) { - pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n", - tme_crypto_algs); - mktme_status = MKTME_DISABLED; - } -detect_keyid_bits: - keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate); - nr_keyids = (1UL << keyid_bits) - 1; - if (nr_keyids) { - pr_info_once("x86/mktme: enabled by BIOS\n"); - pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids); - } else { - pr_info_once("x86/mktme: disabled by BIOS\n"); - } +void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; - if (mktme_status == MKTME_UNINITIALIZED) { - /* MKTME is usable */ - mktme_status = MKTME_ENABLED; - } + if (c->x86_vfm < INTEL_PENTIUM_M_DOTHAN) + return; /* - * KeyID bits effectively lower the number of physical address - * bits. Update cpuinfo_x86::x86_phys_bits accordingly. + * The BIOS can have limited CPUID to leaf 2, which breaks feature + * enumeration. Unlock it and update the maximum leaf info. */ - c->x86_phys_bits -= keyid_bits; + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) + c->cpuid_level = cpuid_eax(0); } static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; - /* Unmask CPUID levels if masked: */ - if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - if (msr_clear_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { - c->cpuid_level = cpuid_eax(0); - get_cpu_cap(c); - } - } - - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) c->microcode = intel_get_microcode_revision(); @@ -312,7 +230,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * need the microcode to have already been loaded... so if it is * not, recommend a BIOS update and disable large pages. */ - if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 && + if (c->x86_vfm == INTEL_ATOM_BONNELL && c->x86_stepping <= 2 && c->microcode < 0x20e) { pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n"); clear_cpu_cap(c, X86_FEATURE_PSE); @@ -327,8 +245,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) #endif /* CPUID workaround for 0F33/0F34 CPU */ - if (c->x86 == 0xF && c->x86_model == 0x3 - && (c->x86_stepping == 0x3 || c->x86_stepping == 0x4)) + if (c->x86_vfm == INTEL_P4_PRESCOTT && + (c->x86_stepping == 0x3 || c->x86_stepping == 0x4)) c->x86_phys_bits = 36; /* @@ -337,46 +255,57 @@ static void early_init_intel(struct cpuinfo_x86 *c) * * It is also reliable across cores and sockets. (but not across * cabinets - we turn it off in that case explicitly.) + * + * Use a model-specific check for some older CPUs that have invariant + * TSC but may not report it architecturally via 8000_0007. */ if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } else if ((c->x86_vfm >= INTEL_P4_PRESCOTT && c->x86_vfm <= INTEL_P4_WILLAMETTE) || + (c->x86_vfm >= INTEL_CORE_YONAH && c->x86_vfm <= INTEL_IVYBRIDGE)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ - if (c->x86 == 6) { - switch (c->x86_model) { - case INTEL_FAM6_ATOM_SALTWELL_MID: - case INTEL_FAM6_ATOM_SALTWELL_TABLET: - case INTEL_FAM6_ATOM_SILVERMONT_MID: - case INTEL_FAM6_ATOM_AIRMONT_NP: - set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); - break; - default: - break; - } + switch (c->x86_vfm) { + case INTEL_ATOM_SALTWELL_MID: + case INTEL_ATOM_SALTWELL_TABLET: + case INTEL_ATOM_SILVERMONT_MID: + case INTEL_ATOM_AIRMONT_NP: + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); + break; } /* - * There is a known erratum on Pentium III and Core Solo - * and Core Duo CPUs. - * " Page with PAT set to WC while associated MTRR is UC - * may consolidate to UC " - * Because of this erratum, it is better to stick with - * setting WC in MTRR rather than using PAT on these CPUs. + * PAT is broken on early family 6 CPUs, the last of which + * is "Yonah" where the erratum is named "AN7": + * + * Page with PAT (Page Attribute Table) Set to USWC + * (Uncacheable Speculative Write Combine) While + * Associated MTRR (Memory Type Range Register) Is UC + * (Uncacheable) May Consolidate to UC * - * Enable PAT WC only on P4, Core 2 or later CPUs. + * Disable PAT and fall back to MTRR on these CPUs. */ - if (c->x86 == 6 && c->x86_model < 15) + if (c->x86_vfm >= INTEL_PENTIUM_PRO && + c->x86_vfm <= INTEL_CORE_YONAH) clear_cpu_cap(c, X86_FEATURE_PAT); /* - * If fast string is not enabled in IA32_MISC_ENABLE for any reason, - * clear the fast string and enhanced fast string CPU capabilities. + * Modern CPUs are generally expected to have a sane fast string + * implementation. However, BIOSes typically have a knob to tweak + * the architectural MISC_ENABLE.FAST_STRING enable bit. + * + * Adhere to the preference and program the Linux-defined fast + * string flag and enhanced fast string capabilities accordingly. */ - if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { + if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) { + rdmsrq(MSR_IA32_MISC_ENABLE, misc_enable); + if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { + /* X86_FEATURE_ERMS is set based on CPUID */ + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } else { pr_info("Disabled fast string operations\n"); setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); setup_clear_cpu_cap(X86_FEATURE_ERMS); @@ -393,7 +322,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE * to be modified. */ - if (c->x86 == 5 && c->x86_model == 9) { + if (c->x86_vfm == INTEL_QUARK_X1000) { pr_info("Disabling PGE capability bit\n"); setup_clear_cpu_cap(X86_FEATURE_PGE); } @@ -423,9 +352,7 @@ static void bsp_init_intel(struct cpuinfo_x86 *c) int ppro_with_ram_bug(void) { /* Uses data from early_cpu_detect now */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == 1 && + if (boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO && boot_cpu_data.x86_stepping < 8) { pr_info("Pentium Pro with Errata#50 detected. Taking evasive action.\n"); return 1; @@ -442,9 +369,8 @@ static void intel_smp_check(struct cpuinfo_x86 *c) /* * Mask B, Pentium, but not Pentium MMX */ - if (c->x86 == 5 && - c->x86_stepping >= 1 && c->x86_stepping <= 4 && - c->x86_model <= 3) { + if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_PENTIUM_MMX && + c->x86_stepping >= 1 && c->x86_stepping <= 4) { /* * Remember we have B step Pentia with bugs */ @@ -471,7 +397,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * The Quark is also family 5, but does not have the same bug. */ clear_cpu_bug(c, X86_BUG_F00F); - if (c->x86 == 5 && c->x86_model < 9) { + if (c->x86_vfm >= INTEL_FAM5_START && c->x86_vfm < INTEL_QUARK_X1000) { static int f00f_workaround_enabled; set_cpu_bug(c, X86_BUG_F00F); @@ -486,7 +412,8 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until * model 3 mask 3 */ - if ((c->x86<<8 | c->x86_model<<4 | c->x86_stepping) < 0x633) + if ((c->x86_vfm == INTEL_PENTIUM_II_KLAMATH && c->x86_stepping < 3) || + c->x86_vfm < INTEL_PENTIUM_II_KLAMATH) clear_cpu_cap(c, X86_FEATURE_SEP); /* @@ -504,7 +431,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * P4 Xeon erratum 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ - if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_stepping == 1)) { + if (c->x86_vfm == INTEL_P4_WILLAMETTE && c->x86_stepping == 1) { if (msr_set_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) { pr_info("CPU: C0 stepping P4 Xeon detected.\n"); @@ -518,27 +445,20 @@ static void intel_workarounds(struct cpuinfo_x86 *c) * integrated APIC (see 11AP erratum in "Pentium Processor * Specification Update"). */ - if (boot_cpu_has(X86_FEATURE_APIC) && (c->x86<<8 | c->x86_model<<4) == 0x520 && + if (boot_cpu_has(X86_FEATURE_APIC) && c->x86_vfm == INTEL_PENTIUM_75 && (c->x86_stepping < 0x6 || c->x86_stepping == 0xb)) set_cpu_bug(c, X86_BUG_11AP); - #ifdef CONFIG_X86_INTEL_USERCOPY /* - * Set up the preferred alignment for movsl bulk memory moves + * MOVSL bulk memory moves can be slow when source and dest are not + * both 8-byte aligned. PII/PIII only like MOVSL with 8-byte alignment. + * + * Set the preferred alignment for Pentium Pro and newer processors, as + * it has only been tested on these. */ - switch (c->x86) { - case 4: /* 486: untested */ - break; - case 5: /* Old Pentia: untested */ - break; - case 6: /* PII/PIII only like movsl with 8-byte alignment */ - movsl_mask.mask = 7; - break; - case 15: /* P4 is OK down to 8-byte alignment */ + if (c->x86_vfm >= INTEL_PENTIUM_PRO) movsl_mask.mask = 7; - break; - } #endif intel_smp_check(c); @@ -570,7 +490,7 @@ static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; - if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { + if (!rdmsrq_safe(MSR_PLATFORM_INFO, &msr)) { if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); } @@ -580,7 +500,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) { u64 msr; - if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + if (rdmsrq_safe(MSR_MISC_FEATURES_ENABLES, &msr)) return; /* Clear all MISC features */ @@ -591,11 +511,27 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) probe_xeon_phi_r3mwait(c); msr = this_cpu_read(msr_misc_features_shadow); - wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); + wrmsrq(MSR_MISC_FEATURES_ENABLES, msr); } -static void split_lock_init(void); -static void bus_lock_init(void); +/* + * This is a list of Intel CPUs that are known to suffer from downclocking when + * ZMM registers (512-bit vectors) are used. On these CPUs, when the kernel + * executes SIMD-optimized code such as cryptography functions or CRCs, it + * should prefer 256-bit (YMM) code to 512-bit (ZMM) code. + */ +static const struct x86_cpu_id zmm_exclusion_list[] = { + X86_MATCH_VFM(INTEL_SKYLAKE_X, 0), + X86_MATCH_VFM(INTEL_ICELAKE_X, 0), + X86_MATCH_VFM(INTEL_ICELAKE_D, 0), + X86_MATCH_VFM(INTEL_ICELAKE, 0), + X86_MATCH_VFM(INTEL_ICELAKE_L, 0), + X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0), + X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0), + X86_MATCH_VFM(INTEL_TIGERLAKE, 0), + /* Allow Rocket Lake and later, and Sapphire Rapids and later. */ + {}, +}; static void init_intel(struct cpuinfo_x86 *c) { @@ -625,19 +561,20 @@ static void init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_PEBS); } - if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) && - (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) + if (boot_cpu_has(X86_FEATURE_CLFLUSH) && + (c->x86_vfm == INTEL_CORE2_DUNNINGTON || + c->x86_vfm == INTEL_NEHALEM_EX || + c->x86_vfm == INTEL_WESTMERE_EX)) set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); - if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) && - ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT))) + if (boot_cpu_has(X86_FEATURE_MWAIT) && + (c->x86_vfm == INTEL_ATOM_GOLDMONT || + c->x86_vfm == INTEL_LUNARLAKE_M)) set_cpu_bug(c, X86_BUG_MONITOR); #ifdef CONFIG_X86_64 if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); #else /* * Names for the Pentium II/Celeron processors @@ -672,13 +609,11 @@ static void init_intel(struct cpuinfo_x86 *c) if (p) strcpy(c->x86_model_id, p); } - - if (c->x86 == 15) - set_cpu_cap(c, X86_FEATURE_P4); - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_P3); #endif + if (x86_match_cpu(zmm_exclusion_list)) + set_cpu_cap(c, X86_FEATURE_PREFER_YMM); + /* Work around errata */ srat_detect_node(c); @@ -687,7 +622,6 @@ static void init_intel(struct cpuinfo_x86 *c) init_intel_misc_features(c); split_lock_init(); - bus_lock_init(); intel_init_thermal(c); } @@ -701,191 +635,90 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) * to determine which, so we use a boottime override * for the 512kb model, and assume 256 otherwise. */ - if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0)) + if (c->x86_vfm == INTEL_PENTIUM_III_TUALATIN && size == 0) size = 256; /* * Intel Quark SoC X1000 contains a 4-way set associative * 16K cache with a 16 byte cache line and 256 lines per tag */ - if ((c->x86 == 5) && (c->x86_model == 9)) + if (c->x86_vfm == INTEL_QUARK_X1000) size = 16; return size; } #endif -#define TLB_INST_4K 0x01 -#define TLB_INST_4M 0x02 -#define TLB_INST_2M_4M 0x03 - -#define TLB_INST_ALL 0x05 -#define TLB_INST_1G 0x06 - -#define TLB_DATA_4K 0x11 -#define TLB_DATA_4M 0x12 -#define TLB_DATA_2M_4M 0x13 -#define TLB_DATA_4K_4M 0x14 - -#define TLB_DATA_1G 0x16 - -#define TLB_DATA0_4K 0x21 -#define TLB_DATA0_4M 0x22 -#define TLB_DATA0_2M_4M 0x23 - -#define STLB_4K 0x41 -#define STLB_4K_2M 0x42 - -static const struct _tlb_table intel_tlb_table[] = { - { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, - { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, - { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, - { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, - { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, - { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, - { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, - { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, - { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" }, - { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" }, - { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" }, - { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, - { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, - { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, - { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, - { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, - { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, - { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, - { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, - { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, - { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, - { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, - { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" }, - { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" }, - { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, - { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, - { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, - { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, - { 0x00, 0, 0 } -}; - -static void intel_tlb_lookup(const unsigned char desc) +static void intel_tlb_lookup(const struct leaf_0x2_table *desc) { - unsigned char k; - if (desc == 0) - return; + short entries = desc->entries; - /* look up this descriptor in the table */ - for (k = 0; intel_tlb_table[k].descriptor != desc && - intel_tlb_table[k].descriptor != 0; k++) - ; - - if (intel_tlb_table[k].tlb_type == 0) - return; - - switch (intel_tlb_table[k].tlb_type) { + switch (desc->t_type) { case STLB_4K: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lld_4k = max(tlb_lld_4k, entries); break; case STLB_4K_2M: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lld_4k = max(tlb_lld_4k, entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lld_2m = max(tlb_lld_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_INST_ALL: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_INST_4K: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k = max(tlb_lli_4k, entries); break; case TLB_INST_4M: - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_INST_2M_4M: - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_DATA_4K: case TLB_DATA0_4K: - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4k = max(tlb_lld_4k, entries); break; case TLB_DATA_4M: case TLB_DATA0_4M: - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_2M_4M: case TLB_DATA0_2M_4M: - if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_2m = max(tlb_lld_2m, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_4K_4M: - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4k = max(tlb_lld_4k, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; + case TLB_DATA_1G_2M_4M: + tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES); + tlb_lld_4m = max(tlb_lld_4m, TLB_0x63_2M_4M_ENTRIES); + fallthrough; case TLB_DATA_1G: - if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_1g = max(tlb_lld_1g, entries); break; } } static void intel_detect_tlb(struct cpuinfo_x86 *c) { - int i, j, n; - unsigned int regs[4]; - unsigned char *desc = (unsigned char *)regs; + const struct leaf_0x2_table *desc; + union leaf_0x2_regs regs; + u8 *ptr; if (c->cpuid_level < 2) return; - /* Number of times to iterate */ - n = cpuid_eax(2) & 0xFF; - - for (i = 0 ; i < n ; i++) { - cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); - - /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) - if (regs[j] & (1 << 31)) - regs[j] = 0; - - /* Byte 0 is level count, not a descriptor */ - for (j = 1 ; j < 16 ; j++) - intel_tlb_lookup(desc[j]); - } + cpuid_leaf_0x2(®s); + for_each_cpuid_0x2_desc(regs, ptr, desc) + intel_tlb_lookup(desc); } static const struct cpu_dev intel_cpu_dev = { @@ -952,394 +785,3 @@ static const struct cpu_dev intel_cpu_dev = { }; cpu_dev_register(intel_cpu_dev); - -#undef pr_fmt -#define pr_fmt(fmt) "x86/split lock detection: " fmt - -static const struct { - const char *option; - enum split_lock_detect_state state; -} sld_options[] __initconst = { - { "off", sld_off }, - { "warn", sld_warn }, - { "fatal", sld_fatal }, - { "ratelimit:", sld_ratelimit }, -}; - -static struct ratelimit_state bld_ratelimit; - -static unsigned int sysctl_sld_mitigate = 1; -static DEFINE_SEMAPHORE(buslock_sem, 1); - -#ifdef CONFIG_PROC_SYSCTL -static struct ctl_table sld_sysctls[] = { - { - .procname = "split_lock_mitigate", - .data = &sysctl_sld_mitigate, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_douintvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -}; - -static int __init sld_mitigate_sysctl_init(void) -{ - register_sysctl_init("kernel", sld_sysctls); - return 0; -} - -late_initcall(sld_mitigate_sysctl_init); -#endif - -static inline bool match_option(const char *arg, int arglen, const char *opt) -{ - int len = strlen(opt), ratelimit; - - if (strncmp(arg, opt, len)) - return false; - - /* - * Min ratelimit is 1 bus lock/sec. - * Max ratelimit is 1000 bus locks/sec. - */ - if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 && - ratelimit > 0 && ratelimit <= 1000) { - ratelimit_state_init(&bld_ratelimit, HZ, ratelimit); - ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE); - return true; - } - - return len == arglen; -} - -static bool split_lock_verify_msr(bool on) -{ - u64 ctrl, tmp; - - if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) - return false; - if (on) - ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - else - ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) - return false; - rdmsrl(MSR_TEST_CTRL, tmp); - return ctrl == tmp; -} - -static void __init sld_state_setup(void) -{ - enum split_lock_detect_state state = sld_warn; - char arg[20]; - int i, ret; - - if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - return; - - ret = cmdline_find_option(boot_command_line, "split_lock_detect", - arg, sizeof(arg)); - if (ret >= 0) { - for (i = 0; i < ARRAY_SIZE(sld_options); i++) { - if (match_option(arg, ret, sld_options[i].option)) { - state = sld_options[i].state; - break; - } - } - } - sld_state = state; -} - -static void __init __split_lock_setup(void) -{ - if (!split_lock_verify_msr(false)) { - pr_info("MSR access failed: Disabled\n"); - return; - } - - rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); - - if (!split_lock_verify_msr(true)) { - pr_info("MSR access failed: Disabled\n"); - return; - } - - /* Restore the MSR to its cached value. */ - wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); - - setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); -} - -/* - * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking - * is not implemented as one thread could undo the setting of the other - * thread immediately after dropping the lock anyway. - */ -static void sld_update_msr(bool on) -{ - u64 test_ctrl_val = msr_test_ctrl_cache; - - if (on) - test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - - wrmsrl(MSR_TEST_CTRL, test_ctrl_val); -} - -static void split_lock_init(void) -{ - /* - * #DB for bus lock handles ratelimit and #AC for split lock is - * disabled. - */ - if (sld_state == sld_ratelimit) { - split_lock_verify_msr(false); - return; - } - - if (cpu_model_supports_sld) - split_lock_verify_msr(sld_state != sld_off); -} - -static void __split_lock_reenable_unlock(struct work_struct *work) -{ - sld_update_msr(true); - up(&buslock_sem); -} - -static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock); - -static void __split_lock_reenable(struct work_struct *work) -{ - sld_update_msr(true); -} -static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable); - -/* - * If a CPU goes offline with pending delayed work to re-enable split lock - * detection then the delayed work will be executed on some other CPU. That - * handles releasing the buslock_sem, but because it executes on a - * different CPU probably won't re-enable split lock detection. This is a - * problem on HT systems since the sibling CPU on the same core may then be - * left running with split lock detection disabled. - * - * Unconditionally re-enable detection here. - */ -static int splitlock_cpu_offline(unsigned int cpu) -{ - sld_update_msr(true); - - return 0; -} - -static void split_lock_warn(unsigned long ip) -{ - struct delayed_work *work; - int cpu; - - if (!current->reported_split_lock) - pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n", - current->comm, current->pid, ip); - current->reported_split_lock = 1; - - if (sysctl_sld_mitigate) { - /* - * misery factor #1: - * sleep 10ms before trying to execute split lock. - */ - if (msleep_interruptible(10) > 0) - return; - /* - * Misery factor #2: - * only allow one buslocked disabled core at a time. - */ - if (down_interruptible(&buslock_sem) == -EINTR) - return; - work = &sl_reenable_unlock; - } else { - work = &sl_reenable; - } - - cpu = get_cpu(); - schedule_delayed_work_on(cpu, work, 2); - - /* Disable split lock detection on this CPU to make progress */ - sld_update_msr(false); - put_cpu(); -} - -bool handle_guest_split_lock(unsigned long ip) -{ - if (sld_state == sld_warn) { - split_lock_warn(ip); - return true; - } - - pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n", - current->comm, current->pid, - sld_state == sld_fatal ? "fatal" : "bogus", ip); - - current->thread.error_code = 0; - current->thread.trap_nr = X86_TRAP_AC; - force_sig_fault(SIGBUS, BUS_ADRALN, NULL); - return false; -} -EXPORT_SYMBOL_GPL(handle_guest_split_lock); - -static void bus_lock_init(void) -{ - u64 val; - - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - return; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, val); - - if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && - (sld_state == sld_warn || sld_state == sld_fatal)) || - sld_state == sld_off) { - /* - * Warn and fatal are handled by #AC for split lock if #AC for - * split lock is supported. - */ - val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; - } else { - val |= DEBUGCTLMSR_BUS_LOCK_DETECT; - } - - wrmsrl(MSR_IA32_DEBUGCTLMSR, val); -} - -bool handle_user_split_lock(struct pt_regs *regs, long error_code) -{ - if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal) - return false; - split_lock_warn(regs->ip); - return true; -} - -void handle_bus_lock(struct pt_regs *regs) -{ - switch (sld_state) { - case sld_off: - break; - case sld_ratelimit: - /* Enforce no more than bld_ratelimit bus locks/sec. */ - while (!__ratelimit(&bld_ratelimit)) - msleep(20); - /* Warn on the bus lock. */ - fallthrough; - case sld_warn: - pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n", - current->comm, current->pid, regs->ip); - break; - case sld_fatal: - force_sig_fault(SIGBUS, BUS_ADRALN, NULL); - break; - } -} - -/* - * CPU models that are known to have the per-core split-lock detection - * feature even though they do not enumerate IA32_CORE_CAPABILITIES. - */ -static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0), - X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0), - {} -}; - -static void __init split_lock_setup(struct cpuinfo_x86 *c) -{ - const struct x86_cpu_id *m; - u64 ia32_core_caps; - - if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return; - - /* Check for CPUs that have support but do not enumerate it: */ - m = x86_match_cpu(split_lock_cpu_ids); - if (m) - goto supported; - - if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) - return; - - /* - * Not all bits in MSR_IA32_CORE_CAPS are architectural, but - * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set - * it have split lock detection. - */ - rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); - if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) - goto supported; - - /* CPU is not in the model list and does not have the MSR bit: */ - return; - -supported: - cpu_model_supports_sld = true; - __split_lock_setup(); -} - -static void sld_state_show(void) -{ - if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && - !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) - return; - - switch (sld_state) { - case sld_off: - pr_info("disabled\n"); - break; - case sld_warn: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { - pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n"); - if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, - "x86/splitlock", NULL, splitlock_cpu_offline) < 0) - pr_warn("No splitlock CPU offline handler\n"); - } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { - pr_info("#DB: warning on user-space bus_locks\n"); - } - break; - case sld_fatal: - if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) { - pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n"); - } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) { - pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n", - boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ? - " from non-WB" : ""); - } - break; - case sld_ratelimit: - if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) - pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst); - break; - } -} - -void __init sld_setup(struct cpuinfo_x86 *c) -{ - split_lock_setup(c); - sld_state_setup(); - sld_state_show(); -} - -#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 - -/** - * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU - * - * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in - * a hybrid processor. If the processor is not hybrid, returns 0. - */ -u8 get_this_hybrid_cpu_type(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return 0; - - return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; -} diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index f18d35fe27a9..bc7671f920a7 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -79,7 +79,7 @@ static int intel_epb_save(void) { u64 epb; - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb); /* * Ensure that saved_epb will always be nonzero after this write even if * the EPB value read from the MSR is 0. @@ -94,7 +94,7 @@ static void intel_epb_restore(void) u64 val = this_cpu_read(saved_epb); u64 epb; - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb); if (val) { val &= EPB_MASK; } else { @@ -111,7 +111,7 @@ static void intel_epb_restore(void) pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); } } - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); + wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); } static struct syscore_ops intel_epb_syscore_ops = { @@ -135,7 +135,7 @@ static ssize_t energy_perf_bias_show(struct device *dev, u64 epb; int ret; - ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); if (ret < 0) return ret; @@ -157,11 +157,11 @@ static ssize_t energy_perf_bias_store(struct device *dev, else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) return -EINVAL; - ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); if (ret < 0) return ret; - ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + ret = wrmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); if (ret < 0) return ret; @@ -204,12 +204,12 @@ static int intel_epb_offline(unsigned int cpu) } static const struct x86_cpu_id intel_epb_normal[] = { - X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, - ENERGY_PERF_BIAS_NORMAL_POWERSAVE), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT, - ENERGY_PERF_BIAS_NORMAL_POWERSAVE), - X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, - ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_VFM(INTEL_ALDERLAKE_L, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), + X86_MATCH_VFM(INTEL_RAPTORLAKE_P, + ENERGY_PERF_BIAS_NORMAL_POWERSAVE), {} }; diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c deleted file mode 100644 index 5be2b1790282..000000000000 --- a/arch/x86/kernel/cpu/intel_pconfig.c +++ /dev/null @@ -1,84 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Intel PCONFIG instruction support. - * - * Copyright (C) 2017 Intel Corporation - * - * Author: - * Kirill A. Shutemov <kirill.shutemov@linux.intel.com> - */ -#include <linux/bug.h> -#include <linux/limits.h> - -#include <asm/cpufeature.h> -#include <asm/intel_pconfig.h> - -#define PCONFIG_CPUID 0x1b - -#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1) - -/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */ -enum { - PCONFIG_CPUID_SUBLEAF_INVALID = 0, - PCONFIG_CPUID_SUBLEAF_TARGETID = 1, -}; - -/* Bitmask of supported targets */ -static u64 targets_supported __read_mostly; - -int pconfig_target_supported(enum pconfig_target target) -{ - /* - * We would need to re-think the implementation once we get > 64 - * PCONFIG targets. Spec allows up to 2^32 targets. - */ - BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64); - - if (WARN_ON_ONCE(target >= 64)) - return 0; - return targets_supported & (1ULL << target); -} - -static int __init intel_pconfig_init(void) -{ - int subleaf; - - if (!boot_cpu_has(X86_FEATURE_PCONFIG)) - return 0; - - /* - * Scan subleafs of PCONFIG CPUID leaf. - * - * Subleafs of the same type need not to be consecutive. - * - * Stop on the first invalid subleaf type. All subleafs after the first - * invalid are invalid too. - */ - for (subleaf = 0; subleaf < INT_MAX; subleaf++) { - struct cpuid_regs regs; - - cpuid_count(PCONFIG_CPUID, subleaf, - ®s.eax, ®s.ebx, ®s.ecx, ®s.edx); - - switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) { - case PCONFIG_CPUID_SUBLEAF_INVALID: - /* Stop on the first invalid subleaf */ - goto out; - case PCONFIG_CPUID_SUBLEAF_TARGETID: - /* Mark supported PCONFIG targets */ - if (regs.ebx < 64) - targets_supported |= (1ULL << regs.ebx); - if (regs.ecx < 64) - targets_supported |= (1ULL << regs.ecx); - if (regs.edx < 64) - targets_supported |= (1ULL << regs.edx); - break; - default: - /* Unknown CPUID.PCONFIG subleaf: ignore */ - break; - } - } -out: - return 0; -} -arch_initcall(intel_pconfig_init); diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index ad6776081e60..6af1e8baeb0f 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -6,7 +6,35 @@ #include <linux/slab.h> /** - * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * x86_match_vendor_cpu_type - helper function to match the hardware defined + * cpu-type for a single entry in the x86_cpu_id + * table. Note, this function does not match the + * generic cpu-types TOPO_CPU_TYPE_EFFICIENCY and + * TOPO_CPU_TYPE_PERFORMANCE. + * @c: Pointer to the cpuinfo_x86 structure of the CPU to match. + * @m: Pointer to the x86_cpu_id entry to match against. + * + * Return: true if the cpu-type matches, false otherwise. + */ +static bool x86_match_vendor_cpu_type(struct cpuinfo_x86 *c, const struct x86_cpu_id *m) +{ + if (m->type == X86_CPU_TYPE_ANY) + return true; + + /* Hybrid CPUs are special, they are assumed to match all cpu-types */ + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) + return true; + + if (c->x86_vendor == X86_VENDOR_INTEL) + return m->type == c->topo.intel_type; + if (c->x86_vendor == X86_VENDOR_AMD) + return m->type == c->topo.amd_type; + + return false; +} + +/** + * x86_match_cpu - match current CPU against an array of x86_cpu_ids * @match: Pointer to array of x86_cpu_ids. Last entry terminated with * {}. * @@ -17,8 +45,7 @@ * * A typical table entry would be to match a specific CPU * - * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL, - * X86_FEATURE_ANY, NULL); + * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL); * * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor) @@ -26,7 +53,7 @@ * asm/cpu_device_id.h contains a set of useful macros which are shortcuts * for various common selections. The above can be shortened to: * - * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL); + * X86_MATCH_VFM(INTEL_BROADWELL, NULL); * * Arrays used to match for this should also be declared using * MODULE_DEVICE_TABLE(x86cpu, ...) @@ -39,9 +66,7 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) const struct x86_cpu_id *m; struct cpuinfo_x86 *c = &boot_cpu_data; - for (m = match; - m->vendor | m->family | m->model | m->steppings | m->feature; - m++) { + for (m = match; m->flags & X86_CPU_ID_FLAG_ENTRY_VALID; m++) { if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) continue; if (m->family != X86_FAMILY_ANY && c->x86 != m->family) @@ -53,39 +78,21 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; - return m; - } - return NULL; -} -EXPORT_SYMBOL(x86_match_cpu); - -static const struct x86_cpu_desc * -x86_match_cpu_with_stepping(const struct x86_cpu_desc *match) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - const struct x86_cpu_desc *m; - - for (m = match; m->x86_family | m->x86_model; m++) { - if (c->x86_vendor != m->x86_vendor) - continue; - if (c->x86 != m->x86_family) - continue; - if (c->x86_model != m->x86_model) - continue; - if (c->x86_stepping != m->x86_stepping) + if (!x86_match_vendor_cpu_type(c, m)) continue; return m; } return NULL; } +EXPORT_SYMBOL(x86_match_cpu); -bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table) +bool x86_match_min_microcode_rev(const struct x86_cpu_id *table) { - const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table); + const struct x86_cpu_id *res = x86_match_cpu(table); - if (!res || res->x86_microcode_rev > boot_cpu_data.microcode) + if (!res || res->driver_data > boot_cpu_data.microcode) return false; return true; } -EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev); +EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 9a0133ef7e20..9d852c3b2cb5 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -4,8 +4,6 @@ * * Written by Jacob Shin - AMD, Inc. * Maintained by: Borislav Petkov <bp@alien8.de> - * - * All MC4_MISCi registers are shared between cores on a node. */ #include <linux/interrupt.h> #include <linux/notifier.h> @@ -20,7 +18,6 @@ #include <linux/smp.h> #include <linux/string.h> -#include <asm/amd_nb.h> #include <asm/traps.h> #include <asm/apic.h> #include <asm/mce.h> @@ -221,6 +218,32 @@ static const struct smca_hwid smca_hwid_mcatypes[] = { #define MAX_MCATYPE_NAME_LEN 30 static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; +struct threshold_block { + /* This block's number within its bank. */ + unsigned int block; + /* MCA bank number that contains this block. */ + unsigned int bank; + /* CPU which controls this block's MCA bank. */ + unsigned int cpu; + /* MCA_MISC MSR address for this block. */ + u32 address; + /* Enable/Disable APIC interrupt. */ + bool interrupt_enable; + /* Bank can generate an interrupt. */ + bool interrupt_capable; + /* Value upon which threshold interrupt is generated. */ + u16 threshold_limit; + /* sysfs object */ + struct kobject kobj; + /* List of threshold blocks within this block's MCA bank. */ + struct list_head miscj; +}; + +struct threshold_bank { + struct kobject *kobj; + struct threshold_block *blocks; +}; + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); /* @@ -333,19 +356,6 @@ struct thresh_restart { u16 old_limit; }; -static inline bool is_shared_bank(int bank) -{ - /* - * Scalable MCA provides for only one core to have access to the MSRs of - * a shared bank. - */ - if (mce_flags.smca) - return false; - - /* Bank 4 is for northbridge reporting and is thus shared */ - return (bank == 4); -} - static const char *bank4_names(const struct threshold_block *b) { switch (b->address) { @@ -381,7 +391,7 @@ static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) return msr_high_bits & BIT(28); } -static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) { int msr = (hi & MASK_LVTOFF_HI) >> 20; @@ -389,7 +399,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, b->bank, b->block, b->address, hi, lo); - return 0; + return false; } if (apic != msr) { @@ -399,15 +409,15 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) * was set is reserved. Return early here: */ if (mce_flags.smca) - return 0; + return false; pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, apic, b->bank, b->block, b->address, hi, lo); - return 0; + return false; } - return 1; + return true; }; /* Reprogram MCx_MISC MSR behind this threshold bank. */ @@ -652,12 +662,12 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) return; } - rdmsrl(MSR_K7_HWCR, hwcr); + rdmsrq(MSR_K7_HWCR, hwcr); /* McStatusWrEn has to be set */ need_toggle = !(hwcr & BIT(18)); if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + wrmsrq(MSR_K7_HWCR, hwcr | BIT(18)); /* Clear CntP bit safely */ for (i = 0; i < num_msrs; i++) @@ -665,7 +675,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) /* restore old settings */ if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr); + wrmsrq(MSR_K7_HWCR, hwcr); } /* cpu init entry point, called from mce.c with preempt off */ @@ -778,29 +788,33 @@ bool amd_mce_usable_address(struct mce *m) static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) { - struct mce m; + struct mce_hw_err err; + struct mce *m = &err.m; - mce_setup(&m); + mce_prep_record(&err); - m.status = status; - m.misc = misc; - m.bank = bank; - m.tsc = rdtsc(); + m->status = status; + m->misc = misc; + m->bank = bank; + m->tsc = rdtsc(); - if (m.status & MCI_STATUS_ADDRV) { - m.addr = addr; + if (m->status & MCI_STATUS_ADDRV) { + m->addr = addr; - smca_extract_err_addr(&m); + smca_extract_err_addr(m); } if (mce_flags.smca) { - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); - if (m.status & MCI_STATUS_SYNDV) - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + if (m->status & MCI_STATUS_SYNDV) { + rdmsrq(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); + rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); + rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); + } } - mce_log(&m); + mce_log(&err); } DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error) @@ -820,16 +834,16 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) { u64 status, addr = 0; - rdmsrl(msr_stat, status); + rdmsrq(msr_stat, status); if (!(status & MCI_STATUS_VAL)) return false; if (status & MCI_STATUS_ADDRV) - rdmsrl(msr_addr, addr); + rdmsrq(msr_addr, addr); __log_error(bank, status, addr, misc); - wrmsrl(msr_stat, 0); + wrmsrq(msr_stat, 0); return status & MCI_STATUS_DEFERRED; } @@ -848,7 +862,7 @@ static bool _log_error_deferred(unsigned int bank, u32 misc) return true; /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */ - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); return true; } @@ -1194,35 +1208,10 @@ out_free: return err; } -static int __threshold_add_blocks(struct threshold_bank *b) -{ - struct list_head *head = &b->blocks->miscj; - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - int err = 0; - - err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); - if (err) - return err; - - list_for_each_entry_safe(pos, tmp, head, miscj) { - - err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); - if (err) { - list_for_each_entry_safe_reverse(pos, tmp, head, miscj) - kobject_del(&pos->kobj); - - return err; - } - } - return err; -} - static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, unsigned int bank) { struct device *dev = this_cpu_read(mce_device); - struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; const char *name = get_name(cpu, bank, NULL); int err = 0; @@ -1230,26 +1219,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, if (!dev) return -ENODEV; - if (is_shared_bank(bank)) { - nb = node_to_amd_nb(topology_amd_node_id(cpu)); - - /* threshold descriptor already initialized on this node? */ - if (nb && nb->bank4) { - /* yes, use it */ - b = nb->bank4; - err = kobject_add(b->kobj, &dev->kobj, name); - if (err) - goto out; - - bp[bank] = b; - refcount_inc(&b->cpus); - - err = __threshold_add_blocks(b); - - goto out; - } - } - b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; @@ -1263,17 +1232,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu, goto out_free; } - if (is_shared_bank(bank)) { - b->shared = 1; - refcount_set(&b->cpus, 1); - - /* nb is already initialized, see above */ - if (nb) { - WARN_ON(nb->bank4); - nb->bank4 = b; - } - } - err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC)); if (err) goto out_kobj; @@ -1306,40 +1264,11 @@ static void deallocate_threshold_blocks(struct threshold_bank *bank) kobject_put(&bank->blocks->kobj); } -static void __threshold_remove_blocks(struct threshold_bank *b) -{ - struct threshold_block *pos = NULL; - struct threshold_block *tmp = NULL; - - kobject_put(b->kobj); - - list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) - kobject_put(b->kobj); -} - static void threshold_remove_bank(struct threshold_bank *bank) { - struct amd_northbridge *nb; - if (!bank->blocks) goto out_free; - if (!bank->shared) - goto out_dealloc; - - if (!refcount_dec_and_test(&bank->cpus)) { - __threshold_remove_blocks(bank); - return; - } else { - /* - * The last CPU on this node using the shared bank is going - * away, remove that bank now. - */ - nb = node_to_amd_nb(topology_amd_node_id(smp_processor_id())); - nb->bank4 = NULL; - } - -out_dealloc: deallocate_threshold_blocks(bank); out_free: diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 7f7309ff67d0..0a89947e47bc 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -28,7 +28,8 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { - struct mce m; + struct mce_hw_err err; + struct mce *m; int lsb; if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) @@ -44,30 +45,33 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) else lsb = PAGE_SHIFT; - mce_setup(&m); - m.bank = -1; + mce_prep_record(&err); + m = &err.m; + m->bank = -1; /* Fake a memory read error with unknown channel */ - m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; - m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; + m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; + m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; if (severity >= GHES_SEV_RECOVERABLE) - m.status |= MCI_STATUS_UC; + m->status |= MCI_STATUS_UC; if (severity >= GHES_SEV_PANIC) { - m.status |= MCI_STATUS_PCC; - m.tsc = rdtsc(); + m->status |= MCI_STATUS_PCC; + m->tsc = rdtsc(); } - m.addr = mem_err->physical_addr; - mce_log(&m); + m->addr = mem_err->physical_addr; + mce_log(&err); } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) { const u64 *i_mce = ((const u64 *) (ctx_info + 1)); - unsigned int cpu; - struct mce m; + unsigned int cpu, num_regs; + bool apicid_found = false; + struct mce_hw_err err; + struct mce *m; if (!boot_cpu_has(X86_FEATURE_SMCA)) return -EINVAL; @@ -85,41 +89,86 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) return -EINVAL; /* - * The register array size must be large enough to include all the - * SMCA registers which need to be extracted. - * * The number of registers in the register array is determined by * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2. - * The register layout is fixed and currently the raw data in the - * register array includes 6 SMCA registers which the kernel can - * extract. + * Sanity-check registers array size. */ - if (ctx_info->reg_arr_size < 48) + num_regs = ctx_info->reg_arr_size >> 3; + if (!num_regs) return -EINVAL; - mce_setup(&m); - - m.extcpu = -1; - m.socketid = -1; - for_each_possible_cpu(cpu) { if (cpu_data(cpu).topo.initial_apicid == lapic_id) { - m.extcpu = cpu; - m.socketid = cpu_data(m.extcpu).topo.pkg_id; + apicid_found = true; break; } } - m.apicid = lapic_id; - m.bank = (ctx_info->msr_addr >> 4) & 0xFF; - m.status = *i_mce; - m.addr = *(i_mce + 1); - m.misc = *(i_mce + 2); - /* Skipping MCA_CONFIG */ - m.ipid = *(i_mce + 4); - m.synd = *(i_mce + 5); + if (!apicid_found) + return -EINVAL; + + m = &err.m; + memset(&err, 0, sizeof(struct mce_hw_err)); + mce_prep_record_common(m); + mce_prep_record_per_cpu(cpu, m); + + m->bank = (ctx_info->msr_addr >> 4) & 0xFF; + + /* + * The SMCA register layout is fixed and includes 16 registers. + * The end of the array may be variable, but the beginning is known. + * Cap the number of registers to expected max (15). + */ + if (num_regs > 15) + num_regs = 15; + + switch (num_regs) { + /* MCA_SYND2 */ + case 15: + err.vendor.amd.synd2 = *(i_mce + 14); + fallthrough; + /* MCA_SYND1 */ + case 14: + err.vendor.amd.synd1 = *(i_mce + 13); + fallthrough; + /* MCA_MISC4 */ + case 13: + /* MCA_MISC3 */ + case 12: + /* MCA_MISC2 */ + case 11: + /* MCA_MISC1 */ + case 10: + /* MCA_DEADDR */ + case 9: + /* MCA_DESTAT */ + case 8: + /* reserved */ + case 7: + /* MCA_SYND */ + case 6: + m->synd = *(i_mce + 5); + fallthrough; + /* MCA_IPID */ + case 5: + m->ipid = *(i_mce + 4); + fallthrough; + /* MCA_CONFIG */ + case 4: + /* MCA_MISC0 */ + case 3: + m->misc = *(i_mce + 2); + fallthrough; + /* MCA_ADDR */ + case 2: + m->addr = *(i_mce + 1); + fallthrough; + /* MCA_STATUS */ + case 1: + m->status = *i_mce; + } - mce_log(&m); + mce_log(&err); return 0; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b5cc557cfc37..e9b3c5d4a52e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -47,7 +47,7 @@ #include <linux/kexec.h> #include <asm/fred.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/processor.h> #include <asm/traps.h> #include <asm/tlbflush.h> @@ -88,7 +88,7 @@ struct mca_config mca_cfg __read_mostly = { .monarch_timeout = -1 }; -static DEFINE_PER_CPU(struct mce, mces_seen); +static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen); static unsigned long mce_need_notify; /* @@ -117,28 +117,41 @@ static struct irq_work mce_irq_work; */ BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain); -/* Do initial initialization of a struct mce */ -void mce_setup(struct mce *m) +void mce_prep_record_common(struct mce *m) { - memset(m, 0, sizeof(struct mce)); - m->cpu = m->extcpu = smp_processor_id(); + m->cpuid = cpuid_eax(1); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->mcgcap = native_rdmsrq(MSR_IA32_MCG_CAP); /* need the internal __ version to avoid deadlocks */ - m->time = __ktime_get_real_seconds(); - m->cpuvendor = boot_cpu_data.x86_vendor; - m->cpuid = cpuid_eax(1); - m->socketid = cpu_data(m->extcpu).topo.pkg_id; - m->apicid = cpu_data(m->extcpu).topo.initial_apicid; - m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); - m->ppin = cpu_data(m->extcpu).ppin; - m->microcode = boot_cpu_data.microcode; + m->time = __ktime_get_real_seconds(); +} + +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m) +{ + m->cpu = cpu; + m->extcpu = cpu; + m->apicid = cpu_data(cpu).topo.initial_apicid; + m->microcode = cpu_data(cpu).microcode; + m->ppin = topology_ppin(cpu); + m->socketid = topology_physical_package_id(cpu); +} + +/* Do initial initialization of struct mce_hw_err */ +void mce_prep_record(struct mce_hw_err *err) +{ + struct mce *m = &err->m; + + memset(err, 0, sizeof(struct mce_hw_err)); + mce_prep_record_common(m); + mce_prep_record_per_cpu(smp_processor_id(), m); } DEFINE_PER_CPU(struct mce, injectm); EXPORT_PER_CPU_SYMBOL_GPL(injectm); -void mce_log(struct mce *m) +void mce_log(struct mce_hw_err *err) { - if (!mce_gen_pool_add(m)) + if (mce_gen_pool_add(err)) irq_work_queue(&mce_irq_work); } EXPORT_SYMBOL_GPL(mce_log); @@ -159,8 +172,10 @@ void mce_unregister_decode_chain(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); -static void __print_mce(struct mce *m) +static void __print_mce(struct mce_hw_err *err) { + struct mce *m = &err->m; + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", m->extcpu, (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), @@ -187,6 +202,10 @@ static void __print_mce(struct mce *m) if (mce_flags.smca) { if (m->synd) pr_cont("SYND %llx ", m->synd); + if (err->vendor.amd.synd1) + pr_cont("SYND1 %llx ", err->vendor.amd.synd1); + if (err->vendor.amd.synd2) + pr_cont("SYND2 %llx ", err->vendor.amd.synd2); if (m->ipid) pr_cont("IPID %llx ", m->ipid); } @@ -202,9 +221,11 @@ static void __print_mce(struct mce *m) m->microcode); } -static void print_mce(struct mce *m) +static void print_mce(struct mce_hw_err *err) { - __print_mce(m); + struct mce *m = &err->m; + + __print_mce(err); if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON) pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); @@ -239,7 +260,7 @@ static const char *mce_dump_aux_info(struct mce *m) return NULL; } -static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) +static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp) { struct llist_node *pending; struct mce_evt_llist *l; @@ -270,20 +291,22 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) pending = mce_gen_pool_prepare_records(); /* First print corrected ones that are still unlogged */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) { - print_mce(m); + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } } /* Now print uncorrected but with the final one last */ llist_for_each_entry(l, pending, llnode) { - struct mce *m = &l->mce; + struct mce_hw_err *err = &l->err; + struct mce *m = &err->m; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || mce_cmp(m, final)) { - print_mce(m); + if (!final || mce_cmp(m, &final->m)) { + print_mce(err); if (!apei_err) apei_err = apei_write_mce(m); } @@ -291,12 +314,12 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) if (final) { print_mce(final); if (!apei_err) - apei_err = apei_write_mce(final); + apei_err = apei_write_mce(&final->m); } if (exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); - memmsg = mce_dump_aux_info(final); + memmsg = mce_dump_aux_info(&final->m); if (memmsg) pr_emerg(HW_ERR "Machine check: %s\n", memmsg); @@ -311,9 +334,9 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp) * panic. */ if (kexec_crash_loaded()) { - if (final && (final->status & MCI_STATUS_ADDRV)) { + if (final && (final->m.status & MCI_STATUS_ADDRV)) { struct page *p; - p = pfn_to_online_page(final->addr >> PAGE_SHIFT); + p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT); if (p) SetPageHWPoison(p); } @@ -365,9 +388,9 @@ void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) } /* MSR access wrappers used for error injection */ -noinstr u64 mce_rdmsrl(u32 msr) +noinstr u64 mce_rdmsrq(u32 msr) { - DECLARE_ARGS(val, low, high); + EAX_EDX_DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { int offset; @@ -400,7 +423,7 @@ noinstr u64 mce_rdmsrl(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrl(u32 msr, u64 v) +static noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -421,7 +444,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) low = (u32)v; high = (u32)(v >> 32); - /* See comment in mce_rdmsrl() */ + /* See comment in mce_rdmsrq() */ asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) @@ -433,17 +456,19 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) * check into our "mce" struct so that we can use it later to assess * the severity of the problem as we read per-bank specific details. */ -static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) +static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs) { + struct mce *m; /* - * Enable instrumentation around mce_setup() which calls external + * Enable instrumentation around mce_prep_record() which calls external * facilities. */ instrumentation_begin(); - mce_setup(m); + mce_prep_record(err); instrumentation_end(); - m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + m = &err->m; + m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS); if (regs) { /* * Get the address of the instruction at the time of @@ -463,14 +488,14 @@ static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs) } /* Use accurate RIP reporting if available. */ if (mca_cfg.rip_msr) - m->ip = mce_rdmsrl(mca_cfg.rip_msr); + m->ip = mce_rdmsrq(mca_cfg.rip_msr); } } -int mce_available(struct cpuinfo_x86 *c) +bool mce_available(struct cpuinfo_x86 *c) { if (mca_cfg.disabled) - return 0; + return false; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } @@ -559,16 +584,38 @@ bool mce_is_correctable(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_correctable); +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. + */ +static bool mce_notify_irq(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + if (test_and_clear_bit(0, &mce_need_notify)) { + mce_work_trigger(); + + if (__ratelimit(&ratelimit)) + pr_info(HW_ERR "Machine check events logged\n"); + + return true; + } + + return false; +} + static int mce_early_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; /* Emit the trace record: */ - trace_mce_record(m); + trace_mce_record(err); set_bit(0, &mce_need_notify); @@ -612,13 +659,13 @@ static struct notifier_block mce_uc_nb = { static int mce_default_notifier(struct notifier_block *nb, unsigned long val, void *data) { - struct mce *m = (struct mce *)data; + struct mce_hw_err *err = to_mce_hw_err(data); - if (!m) + if (!err) return NOTIFY_DONE; - if (mca_cfg.print_all || !m->kflags) - __print_mce(m); + if (mca_cfg.print_all || !(err->m.kflags)) + __print_mce(err); return NOTIFY_DONE; } @@ -632,13 +679,15 @@ static struct notifier_block mce_default_nb = { /* * Read ADDR and MISC registers. */ -static noinstr void mce_read_aux(struct mce *m, int i) +static noinstr void mce_read_aux(struct mce_hw_err *err, int i) { + struct mce *m = &err->m; + if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); + m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR)); + m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR)); /* * Mask the reported address by the reported granularity. @@ -653,10 +702,13 @@ static noinstr void mce_read_aux(struct mce *m, int i) } if (mce_flags.smca) { - m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i)); - if (m->status & MCI_STATUS_SYNDV) - m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); + if (m->status & MCI_STATUS_SYNDV) { + m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i)); + err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i)); + err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i)); + } } } @@ -677,30 +729,31 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); * is already totally * confused. In this case it's likely it will * not fully execute the machine check handler either. */ -bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - bool error_seen = false; - struct mce m; + struct mce_hw_err err; + struct mce *m; int i; this_cpu_inc(mce_poll_count); - mce_gather_info(&m, NULL); + mce_gather_info(&err, NULL); + m = &err.m; if (flags & MCP_TIMESTAMP) - m.tsc = rdtsc(); + m->tsc = rdtsc(); for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; - m.misc = 0; - m.addr = 0; - m.bank = i; + m->misc = 0; + m->addr = 0; + m->bank = i; barrier(); - m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); /* * Update storm tracking here, before checking for the @@ -710,17 +763,17 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * storm status. */ if (!mca_cfg.cmci_disabled) - mce_track_storm(&m); + mce_track_storm(m); /* If this entry is not valid, ignore it */ - if (!(m.status & MCI_STATUS_VAL)) + if (!(m->status & MCI_STATUS_VAL)) continue; /* * If we are logging everything (at CPU online) or this * is a corrected error, then we must log it. */ - if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC)) goto log_it; /* @@ -730,20 +783,20 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * everything else. */ if (!mca_cfg.ser) { - if (m.status & MCI_STATUS_UC) + if (m->status & MCI_STATUS_UC) continue; goto log_it; } /* Log "not enabled" (speculative) errors */ - if (!(m.status & MCI_STATUS_EN)) + if (!(m->status & MCI_STATUS_EN)) goto log_it; /* * Log UCNA (SDM: 15.6.3 "UCR Error Classification") * UC == 1 && PCC == 0 && S == 0 */ - if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S)) goto log_it; /* @@ -754,31 +807,29 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; log_it: - error_seen = true; - if (flags & MCP_DONTLOG) goto clear_it; - mce_read_aux(&m, i); - m.severity = mce_severity(&m, NULL, NULL, false); + mce_read_aux(&err, i); + m->severity = mce_severity(m, NULL, NULL, false); /* * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (mca_cfg.dont_log_ce && !mce_usable_address(&m)) + if (mca_cfg.dont_log_ce && !mce_usable_address(m)) goto clear_it; if (flags & MCP_QUEUE_LOG) - mce_gen_pool_add(&m); + mce_gen_pool_add(&err); else - mce_log(&m); + mce_log(&err); clear_it: /* * Clear state for this bank. */ - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } /* @@ -787,8 +838,6 @@ clear_it: */ sync_core(); - - return error_seen; } EXPORT_SYMBOL_GPL(machine_check_poll); @@ -838,8 +887,8 @@ quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) */ static noinstr bool quirk_skylake_repmov(void) { - u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE); + u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS); + u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE); u64 mc1_status; /* @@ -850,7 +899,7 @@ static noinstr bool quirk_skylake_repmov(void) !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) return false; - mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1)); + mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(1)); /* Check for a software-recoverable data fetch error. */ if ((mc1_status & @@ -861,8 +910,8 @@ static noinstr bool quirk_skylake_repmov(void) MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_AR | MCI_STATUS_S)) { misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; - mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0); + mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable); + mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0); instrumentation_begin(); pr_err_once("Erratum detected, disable fast string copy instructions.\n"); @@ -898,14 +947,15 @@ static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_reg * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, +static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp, struct pt_regs *regs) { + struct mce *m = &err->m; char *tmp = *msg; int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -918,7 +968,7 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo m->bank = i; if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) { - mce_read_aux(m, i); + mce_read_aux(err, i); *msg = tmp; return 1; } @@ -1009,10 +1059,11 @@ out: */ static void mce_reign(void) { - int cpu; + struct mce_hw_err *err = NULL; struct mce *m = NULL; int global_worst = 0; char *msg = NULL; + int cpu; /* * This CPU is the Monarch and the other CPUs have run @@ -1020,11 +1071,13 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - struct mce *mtmp = &per_cpu(mces_seen, cpu); + struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu); + struct mce *mtmp = &etmp->m; if (mtmp->severity > global_worst) { global_worst = mtmp->severity; - m = &per_cpu(mces_seen, cpu); + err = &per_cpu(hw_errs_seen, cpu); + m = &err->m; } } @@ -1036,7 +1089,7 @@ static void mce_reign(void) if (m && global_worst >= MCE_PANIC_SEVERITY) { /* call mce_severity() to get "msg" for panic */ mce_severity(m, NULL, &msg, true); - mce_panic("Fatal machine check", m, msg); + mce_panic("Fatal machine check", err, msg); } /* @@ -1053,11 +1106,11 @@ static void mce_reign(void) mce_panic("Fatal machine check from unknown source", NULL, NULL); /* - * Now clear all the mces_seen so that they don't reappear on + * Now clear all the hw_errs_seen so that they don't reappear on * the next mce. */ for_each_possible_cpu(cpu) - memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce)); + memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err)); } static atomic_t global_nwo; @@ -1221,7 +1274,7 @@ static __always_inline void mce_clear_state(unsigned long *toclear) for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (arch_test_bit(i, toclear)) - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1245,7 +1298,7 @@ static noinstr bool mce_check_crashing_cpu(void) (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; - mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS); + mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS); if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { if (mcgstatus & MCG_STATUS_LMCES) @@ -1253,7 +1306,7 @@ static noinstr bool mce_check_crashing_cpu(void) } if (mcgstatus & MCG_STATUS_RIPV) { - __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); + native_wrmsrq(MSR_IA32_MCG_STATUS, 0); return true; } } @@ -1261,13 +1314,14 @@ static noinstr bool mce_check_crashing_cpu(void) } static __always_inline int -__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, - unsigned long *toclear, unsigned long *valid_banks, int no_way_out, - int *worst) +__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, + struct mce_hw_err *final, unsigned long *toclear, + unsigned long *valid_banks, int no_way_out, int *worst) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; int severity, i, taint = 0; + struct mce *m = &err->m; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { arch___clear_bit(i, toclear); @@ -1281,7 +1335,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, m->addr = 0; m->bank = i; - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1312,7 +1366,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, if (severity == MCE_NO_SEVERITY) continue; - mce_read_aux(m, i); + mce_read_aux(err, i); /* assuming valid severity level != 0 */ m->severity = severity; @@ -1322,17 +1376,17 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final, * done in #MC context, where instrumentation is disabled. */ instrumentation_begin(); - mce_log(m); + mce_log(err); instrumentation_end(); if (severity > *worst) { - *final = *m; + *final = *err; *worst = severity; } } /* mce_clear_state will clear *final, save locally for use later */ - *m = *final; + *err = *final; return taint; } @@ -1392,9 +1446,10 @@ static void kill_me_never(struct callback_head *cb) set_mce_nospec(pfn); } -static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *)) +static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *)) { int count = ++current->mce_count; + struct mce *m = &err->m; /* First call, save all the details */ if (count == 1) { @@ -1407,11 +1462,12 @@ static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callba /* Ten is likely overkill. Don't expect more than two faults before task_work() */ if (count > 10) - mce_panic("Too many consecutive machine checks while accessing user data", m, msg); + mce_panic("Too many consecutive machine checks while accessing user data", + err, msg); /* Second or later call, make sure page address matches the one from first call */ if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT)) - mce_panic("Consecutive machine checks to different user pages", m, msg); + mce_panic("Consecutive machine checks to different user pages", err, msg); /* Do not call task_work_add() more than once */ if (count > 1) @@ -1460,8 +1516,10 @@ noinstr void do_machine_check(struct pt_regs *regs) int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0; DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 }; DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 }; - struct mce m, *final; + struct mce_hw_err *final; + struct mce_hw_err err; char *msg = NULL; + struct mce *m; if (unlikely(mce_flags.p5)) return pentium_machine_check(regs); @@ -1499,13 +1557,14 @@ noinstr void do_machine_check(struct pt_regs *regs) this_cpu_inc(mce_exception_count); - mce_gather_info(&m, regs); - m.tsc = rdtsc(); + mce_gather_info(&err, regs); + m = &err.m; + m->tsc = rdtsc(); - final = this_cpu_ptr(&mces_seen); - *final = m; + final = this_cpu_ptr(&hw_errs_seen); + *final = err; - no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); + no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs); barrier(); @@ -1514,15 +1573,15 @@ noinstr void do_machine_check(struct pt_regs *regs) * Assume the worst for now, but if we find the * severity is MCE_AR_SEVERITY we have other options. */ - if (!(m.mcgstatus & MCG_STATUS_RIPV)) + if (!(m->mcgstatus & MCG_STATUS_RIPV)) kill_current_task = 1; /* * Check if this MCE is signaled to only this logical processor, * on Intel, Zhaoxin only. */ - if (m.cpuvendor == X86_VENDOR_INTEL || - m.cpuvendor == X86_VENDOR_ZHAOXIN) - lmce = m.mcgstatus & MCG_STATUS_LMCES; + if (m->cpuvendor == X86_VENDOR_INTEL || + m->cpuvendor == X86_VENDOR_ZHAOXIN) + lmce = m->mcgstatus & MCG_STATUS_LMCES; /* * Local machine check may already know that we have to panic. @@ -1533,12 +1592,12 @@ noinstr void do_machine_check(struct pt_regs *regs) */ if (lmce) { if (no_way_out) - mce_panic("Fatal local machine check", &m, msg); + mce_panic("Fatal local machine check", &err, msg); } else { order = mce_start(&no_way_out); } - taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst); + taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst); if (!no_way_out) mce_clear_state(toclear); @@ -1553,7 +1612,7 @@ noinstr void do_machine_check(struct pt_regs *regs) no_way_out = worst >= MCE_PANIC_SEVERITY; if (no_way_out) - mce_panic("Fatal machine check on current CPU", &m, msg); + mce_panic("Fatal machine check on current CPU", &err, msg); } } else { /* @@ -1565,8 +1624,8 @@ noinstr void do_machine_check(struct pt_regs *regs) * make sure we have the right "msg". */ if (worst >= MCE_PANIC_SEVERITY) { - mce_severity(&m, regs, &msg, true); - mce_panic("Local fatal machine check!", &m, msg); + mce_severity(m, regs, &msg, true); + mce_panic("Local fatal machine check!", &err, msg); } } @@ -1584,15 +1643,33 @@ noinstr void do_machine_check(struct pt_regs *regs) goto out; /* Fault was in user mode and we need to take some action */ - if ((m.cs & 3) == 3) { + if ((m->cs & 3) == 3) { /* If this triggers there is no way to recover. Die hard. */ BUG_ON(!on_thread_stack() || !user_mode(regs)); - if (!mce_usable_address(&m)) - queue_task_work(&m, msg, kill_me_now); + if (!mce_usable_address(m)) + queue_task_work(&err, msg, kill_me_now); else - queue_task_work(&m, msg, kill_me_maybe); + queue_task_work(&err, msg, kill_me_maybe); + + } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) { + /* + * Saved RIP on stack makes it look like the machine check + * was taken in the kernel on the instruction following + * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates + * that the machine check was taken inside SEAM non-root + * mode. CPU core has already marked that guest as dead. + * It is OK for the kernel to resume execution at the + * apparent point of the machine check as the fault did + * not occur there. Mark the page as poisoned so it won't + * be added to free list when the guest is terminated. + */ + if (mce_usable_address(m)) { + struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT); + if (p) + SetPageHWPoison(p); + } } else { /* * Handle an MCE which has happened in kernel space but from @@ -1603,20 +1680,20 @@ noinstr void do_machine_check(struct pt_regs *regs) * corresponding exception handler which would do that is the * proper one. */ - if (m.kflags & MCE_IN_KERNEL_RECOV) { + if (m->kflags & MCE_IN_KERNEL_RECOV) { if (!fixup_exception(regs, X86_TRAP_MC, 0, 0)) - mce_panic("Failed kernel mode recovery", &m, msg); + mce_panic("Failed kernel mode recovery", &err, msg); } - if (m.kflags & MCE_IN_KERNEL_COPYIN) - queue_task_work(&m, msg, kill_me_never); + if (m->kflags & MCE_IN_KERNEL_COPYIN) + queue_task_work(&err, msg, kill_me_never); } out: instrumentation_end(); clear: - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + mce_wrmsrq(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1709,36 +1786,14 @@ void mce_timer_kick(bool storm) __this_cpu_write(mce_next_interval, check_interval * HZ); } -/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +/* Must not be called in IRQ context where timer_delete_sync() can deadlock */ static void mce_timer_delete_all(void) { int cpu; for_each_online_cpu(cpu) - del_timer_sync(&per_cpu(mce_timer, cpu)); -} - -/* - * Notify the user(s) about new machine check events. - * Can be called from interrupt context, but not from machine check/NMI - * context. - */ -int mce_notify_irq(void) -{ - /* Not more than two messages every minute */ - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - - if (test_and_clear_bit(0, &mce_need_notify)) { - mce_work_trigger(); - - if (__ratelimit(&ratelimit)) - pr_info(HW_ERR "Machine check events logged\n"); - - return 1; - } - return 0; + timer_delete_sync(&per_cpu(mce_timer, cpu)); } -EXPORT_SYMBOL_GPL(mce_notify_irq); static void __mcheck_cpu_mce_banks_init(void) { @@ -1767,7 +1822,7 @@ static void __mcheck_cpu_cap_init(void) u64 cap; u8 b; - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; @@ -1808,7 +1863,7 @@ static void __mcheck_cpu_init_generic(void) cr4_set_bits(X86_CR4_MCE); - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); } @@ -1823,8 +1878,8 @@ static void __mcheck_cpu_init_clear_banks(void) if (!b->init) continue; - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1850,106 +1905,125 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; - rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); + rdmsrq(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } -/* Add per CPU specific workarounds here */ -static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static void apply_quirks_amd(struct cpuinfo_x86 *c) { struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - struct mca_config *cfg = &mca_cfg; - - if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - pr_info("unknown CPU type - not enabling MCE support\n"); - return -EOPNOTSUPP; - } /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { - /* - * disable GART TBL walk error reporting, which - * trips off incorrectly with the IOMMU & 3ware - * & Cerberus: - */ - clear_bit(10, (unsigned long *)&mce_banks[4].ctl); - } - if (c->x86 < 0x11 && cfg->bootlog < 0) { - /* - * Lots of broken BIOS around that don't clear them - * by default and leave crap in there. Don't log: - */ - cfg->bootlog = 0; - } + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { /* - * Various K7s with broken bank 0 around. Always disable - * by default. + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: */ - if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].ctl = 0; + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + if (c->x86 < 0x11 && mca_cfg.bootlog < 0) { /* - * overflow_recov is supported for F15h Models 00h-0fh - * even though we don't have a CPUID bit for it. + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: */ - if (c->x86 == 0x15 && c->x86_model <= 0xf) - mce_flags.overflow_recov = 1; + mca_cfg.bootlog = 0; + } - if (c->x86 >= 0x17 && c->x86 <= 0x1A) - mce_flags.zen_ifu_quirk = 1; + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && this_cpu_read(mce_num_banks)) + mce_banks[0].ctl = 0; - } + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. + */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; - if (c->x86_vendor == X86_VENDOR_INTEL) { - /* - * SDM documents that on family 6 bank 0 should not be written - * because it aliases to another special BIOS controlled - * register. - * But it's not aliased anymore on model 0x1a+ - * Don't ignore bank 0 completely because there could be a - * valid event later, merely don't write CTL0. - */ + if (c->x86 >= 0x17 && c->x86 <= 0x1A) + mce_flags.zen_ifu_quirk = 1; +} - if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) - mce_banks[0].init = false; +static void apply_quirks_intel(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); - /* - * All newer Intel systems support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; + /* Older CPUs (prior to family 6) don't need quirks. */ + if (c->x86_vfm < INTEL_PENTIUM_PRO) + return; - /* - * There are also broken BIOSes on some Pentium M and - * earlier systems: - */ - if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) - cfg->bootlog = 0; + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + */ + if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks)) + mce_banks[0].init = false; - if (c->x86 == 6 && c->x86_model == 45) - mce_flags.snb_ifu_quirk = 1; + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; - /* - * Skylake, Cascacde Lake and Cooper Lake require a quirk on - * rep movs. - */ - if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X) - mce_flags.skx_repmov_quirk = 1; + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0) + mca_cfg.bootlog = 0; + + if (c->x86_vfm == INTEL_SANDYBRIDGE_X) + mce_flags.snb_ifu_quirk = 1; + + /* + * Skylake, Cascacde Lake and Cooper Lake require a quirk on + * rep movs. + */ + if (c->x86_vfm == INTEL_SKYLAKE_X) + mce_flags.skx_repmov_quirk = 1; +} + +static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c) +{ + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (mca_cfg.monarch_timeout < 0) + mca_cfg.monarch_timeout = USEC_PER_SEC; } +} - if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { - /* - * All newer Zhaoxin CPUs support MCE broadcasting. Enable - * synchronization with a one second timeout. - */ - if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { - if (cfg->monarch_timeout < 0) - cfg->monarch_timeout = USEC_PER_SEC; - } +/* Add per CPU specific workarounds here */ +static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +{ + struct mca_config *cfg = &mca_cfg; + + switch (c->x86_vendor) { + case X86_VENDOR_UNKNOWN: + pr_info("unknown CPU type - not enabling MCE support\n"); + return false; + case X86_VENDOR_AMD: + apply_quirks_amd(c); + break; + case X86_VENDOR_INTEL: + apply_quirks_intel(c); + break; + case X86_VENDOR_ZHAOXIN: + apply_quirks_zhaoxin(c); + break; } if (cfg->monarch_timeout < 0) @@ -1957,28 +2031,28 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (cfg->bootlog != 0) cfg->panic_timeout = 30; - return 0; + return true; } -static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) - return 0; + return false; switch (c->x86_vendor) { case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); mce_flags.p5 = 1; - return 1; + return true; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); mce_flags.winchip = 1; - return 1; + return true; default: - return 0; + return false; } - return 0; + return false; } /* @@ -2044,13 +2118,9 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_intel_feature_init(c); break; - case X86_VENDOR_AMD: { - mce_amd_feature_init(c); - break; - } - + case X86_VENDOR_AMD: case X86_VENDOR_HYGON: - mce_hygon_feature_init(c); + mce_amd_feature_init(c); break; case X86_VENDOR_CENTAUR: @@ -2224,12 +2294,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_cap_init(); - if (__mcheck_cpu_apply_quirks(c) < 0) { + if (!__mcheck_cpu_apply_quirks(c)) { mca_cfg.disabled = 1; return; } - if (mce_gen_pool_init()) { + if (!mce_gen_pool_init()) { mca_cfg.disabled = 1; pr_emerg("Couldn't allocate MCE records pool!\n"); return; @@ -2366,7 +2436,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(mca_msr_reg(i, MCA_CTL), 0); + wrmsrq(mca_msr_reg(i, MCA_CTL), 0); } return; } @@ -2500,12 +2570,14 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr, return -EINVAL; b = &per_cpu(mce_banks_array, s->id)[bank]; - if (!b->init) return -ENODEV; b->ctl = new; + + mutex_lock(&mce_sysfs_mutex); mce_restart(); + mutex_unlock(&mce_sysfs_mutex); return size; } @@ -2714,7 +2786,7 @@ static void mce_reenable_cpu(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); } } @@ -2748,7 +2820,7 @@ static int mce_cpu_pre_down(unsigned int cpu) struct timer_list *t = this_cpu_ptr(&mce_timer); mce_disable_cpu(); - del_timer_sync(t); + timer_delete_sync(t); mce_threshold_remove_device(cpu); mce_device_remove(cpu); return 0; diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index a05ac0716ecf..8d023239ce18 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -264,15 +264,8 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, return put_user(sizeof(struct mce), p); case MCE_GET_LOG_LEN: return put_user(mcelog->len, p); - case MCE_GETCLEAR_FLAGS: { - unsigned flags; - - do { - flags = mcelog->flags; - } while (cmpxchg(&mcelog->flags, flags, 0) != flags); - - return put_user(flags, p); - } + case MCE_GETCLEAR_FLAGS: + return put_user(xchg(&mcelog->flags, 0), p); default: return -ENOTTY; } @@ -314,7 +307,7 @@ static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, /* * Need to give user space some time to set everything up, - * so do it a jiffie or two later everywhere. + * so do it a jiffy or two later everywhere. */ schedule_timeout(2); @@ -331,7 +324,6 @@ static const struct file_operations mce_chrdev_ops = { .poll = mce_chrdev_poll, .unlocked_ioctl = mce_chrdev_ioctl, .compat_ioctl = compat_ptr_ioctl, - .llseek = no_llseek, }; static struct miscdevice mce_chrdev_device = { diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index fbe8b61c3413..3ca9c007a666 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -16,14 +16,14 @@ * used to save error information organized in a lock-less list. * * This memory pool is only to be used to save MCE records in MCE context. - * MCE events are rare, so a fixed size memory pool should be enough. Use - * 2 pages to save MCE events for now (~80 MCE records at most). + * MCE events are rare, so a fixed size memory pool should be enough. + * Allocate on a sliding scale based on number of CPUs. */ -#define MCE_POOLSZ (2 * PAGE_SIZE) +#define MCE_MIN_ENTRIES 80 +#define MCE_PER_CPU 2 static struct gen_pool *mce_evt_pool; static LLIST_HEAD(mce_event_llist); -static char gen_pool_buf[MCE_POOLSZ]; /* * Compare the record "t" with each of the records on list "l" to see if @@ -31,15 +31,15 @@ static char gen_pool_buf[MCE_POOLSZ]; */ static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l) { + struct mce_hw_err *err1, *err2; struct mce_evt_llist *node; - struct mce *m1, *m2; - m1 = &t->mce; + err1 = &t->err; llist_for_each_entry(node, &l->llnode, llnode) { - m2 = &node->mce; + err2 = &node->err; - if (!mce_cmp(m1, m2)) + if (!mce_cmp(&err1->m, &err2->m)) return true; } return false; @@ -73,8 +73,8 @@ struct llist_node *mce_gen_pool_prepare_records(void) void mce_gen_pool_process(struct work_struct *__unused) { - struct llist_node *head; struct mce_evt_llist *node, *tmp; + struct llist_node *head; struct mce *mce; head = llist_del_all(&mce_event_llist); @@ -83,7 +83,7 @@ void mce_gen_pool_process(struct work_struct *__unused) head = llist_reverse_order(head); llist_for_each_entry_safe(node, tmp, head, llnode) { - mce = &node->mce; + mce = &node->err.m; blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); } @@ -94,54 +94,63 @@ bool mce_gen_pool_empty(void) return llist_empty(&mce_event_llist); } -int mce_gen_pool_add(struct mce *mce) +bool mce_gen_pool_add(struct mce_hw_err *err) { struct mce_evt_llist *node; - if (filter_mce(mce)) - return -EINVAL; + if (filter_mce(&err->m)) + return false; if (!mce_evt_pool) - return -EINVAL; + return false; node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); if (!node) { pr_warn_ratelimited("MCE records pool full!\n"); - return -ENOMEM; + return false; } - memcpy(&node->mce, mce, sizeof(*mce)); + memcpy(&node->err, err, sizeof(*err)); llist_add(&node->llnode, &mce_event_llist); - return 0; + return true; } -static int mce_gen_pool_create(void) +static bool mce_gen_pool_create(void) { - struct gen_pool *tmpp; - int ret = -ENOMEM; - - tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); - if (!tmpp) - goto out; + int mce_numrecords, mce_poolsz, order; + struct gen_pool *gpool; + void *mce_pool; + + order = order_base_2(sizeof(struct mce_evt_llist)); + gpool = gen_pool_create(order, -1); + if (!gpool) + return false; + + mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU); + mce_poolsz = mce_numrecords * (1 << order); + mce_pool = kmalloc(mce_poolsz, GFP_KERNEL); + if (!mce_pool) { + gen_pool_destroy(gpool); + return false; + } - ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); - if (ret) { - gen_pool_destroy(tmpp); - goto out; + if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) { + gen_pool_destroy(gpool); + kfree(mce_pool); + return false; } - mce_evt_pool = tmpp; + mce_evt_pool = gpool; -out: - return ret; + return true; } -int mce_gen_pool_init(void) +bool mce_gen_pool_init(void) { /* Just init mce_gen_pool once. */ if (mce_evt_pool) - return 0; + return true; return mce_gen_pool_create(); } diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 94953d749475..d02c4f556cd0 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -24,10 +24,11 @@ #include <linux/pci.h> #include <linux/uaccess.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/apic.h> #include <asm/irq_vectors.h> #include <asm/mce.h> +#include <asm/msr.h> #include <asm/nmi.h> #include <asm/smp.h> @@ -229,7 +230,6 @@ static int raise_local(void) } else if (m->status) { pr_info("Starting machine check poll CPU %d\n", cpu); raise_poll(m); - mce_notify_irq(); pr_info("Machine check poll done on CPU %d\n", cpu); } else m->finished = 0; @@ -476,30 +476,35 @@ static void prepare_msrs(void *info) struct mce m = *(struct mce *)info; u8 b = m.bank; - wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + wrmsrq(MSR_IA32_MCG_STATUS, m.mcgstatus); if (boot_cpu_has(X86_FEATURE_SMCA)) { if (m.inject_flags == DFR_INT_INJ) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); + wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrq(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); } else { - wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); + wrmsrq(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrq(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); } - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); - wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + wrmsrq(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + + if (m.misc) + wrmsrq(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); } else { - wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); - wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); - wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + wrmsrq(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrq(MSR_IA32_MCx_ADDR(b), m.addr); + + if (m.misc) + wrmsrq(MSR_IA32_MCx_MISC(b), m.misc); } } static void do_inject(void) { - u64 mcg_status = 0; unsigned int cpu = i_mce.extcpu; + struct mce_hw_err err; + u64 mcg_status = 0; u8 b = i_mce.bank; i_mce.tsc = rdtsc_ordered(); @@ -513,7 +518,8 @@ static void do_inject(void) i_mce.status |= MCI_STATUS_SYNDV; if (inj_type == SW_INJ) { - mce_log(&i_mce); + err.m = i_mce; + mce_log(&err); return; } @@ -584,7 +590,7 @@ static int inj_bank_set(void *data, u64 val) u64 cap; /* Get bank count on target CPU so we can handle non-uniform values. */ - rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); + rdmsrq_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); n_banks = cap & MCG_BANKCNT_MASK; if (val >= n_banks) { @@ -608,7 +614,7 @@ static int inj_bank_set(void *data, u64 val) if (cpu_feature_enabled(X86_FEATURE_SMCA)) { u64 ipid; - if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { + if (rdmsrq_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { pr_err("Error reading IPID on CPU%d\n", m->extcpu); return -EINVAL; } @@ -736,15 +742,15 @@ static void check_hw_inj_possible(void) u64 status = MCI_STATUS_VAL, ipid; /* Check whether bank is populated */ - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); + rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); if (!ipid) continue; toggle_hw_mce_inject(cpu, true); - wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); - rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); - wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0); + wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), status); + rdmsrq_safe(mca_msr_reg(bank, MCA_STATUS), &status); + wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), 0); if (!status) { hw_injection_possible = false; @@ -795,4 +801,5 @@ static void __exit inject_exit(void) module_init(inject_init); module_exit(inject_exit); +MODULE_DESCRIPTION("Machine check injection support"); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 399b62e223d2..efcf21e9552e 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -13,7 +13,7 @@ #include <linux/cpumask.h> #include <asm/apic.h> #include <asm/cpufeature.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -75,12 +75,12 @@ static u16 cmci_threshold[MAX_NR_BANKS]; */ #define CMCI_STORM_THRESHOLD 32749 -static int cmci_supported(int *banks) +static bool cmci_supported(int *banks) { u64 cap; if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) - return 0; + return false; /* * Vendor check is not strictly needed, but the initial @@ -89,12 +89,13 @@ static int cmci_supported(int *banks) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) - return 0; + return false; if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) - return 0; - rdmsrl(MSR_IA32_MCG_CAP, cap); - *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return false; + + rdmsrq(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK); return !!(cap & MCG_CMCI_P); } @@ -105,7 +106,7 @@ static bool lmce_supported(void) if (mca_cfg.lmce_disabled) return false; - rdmsrl(MSR_IA32_MCG_CAP, tmp); + rdmsrq(MSR_IA32_MCG_CAP, tmp); /* * LMCE depends on recovery support in the processor. Hence both @@ -122,7 +123,7 @@ static bool lmce_supported(void) * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally * locks the MSR in the event that it wasn't already locked by BIOS. */ - rdmsrl(MSR_IA32_FEAT_CTL, tmp); + rdmsrq(MSR_IA32_FEAT_CTL, tmp); if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED))) return false; @@ -140,9 +141,9 @@ static void cmci_set_threshold(int bank, int thresh) u64 val; raw_spin_lock_irqsave(&cmci_discover_lock, flags); - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val | thresh); raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } @@ -183,7 +184,7 @@ static bool cmci_skip_bank(int bank, u64 *val) if (test_bit(bank, mce_banks_ce_disabled)) return true; - rdmsrl(MSR_IA32_MCx_CTL2(bank), *val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), *val); /* Already owned by someone else? */ if (*val & MCI_CTL2_CMCI_EN) { @@ -231,8 +232,8 @@ static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_w struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); val |= MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); /* If the enable bit did not stick, this bank should be polled. */ if (!(val & MCI_CTL2_CMCI_EN)) { @@ -323,9 +324,9 @@ static void __cmci_disable_bank(int bank) if (!test_bit(bank, this_cpu_ptr(mce_banks_owned))) return; - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val); __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) @@ -429,10 +430,10 @@ void intel_init_lmce(void) if (!lmce_supported()) return; - rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + rdmsrq(MSR_IA32_MCG_EXT_CTL, val); if (!(val & MCG_EXT_CTL_LMCE_EN)) - wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); + wrmsrq(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); } void intel_clear_lmce(void) @@ -442,9 +443,9 @@ void intel_clear_lmce(void) if (!lmce_supported()) return; - rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + rdmsrq(MSR_IA32_MCG_EXT_CTL, val); val &= ~MCG_EXT_CTL_LMCE_EN; - wrmsrl(MSR_IA32_MCG_EXT_CTL, val); + wrmsrq(MSR_IA32_MCG_EXT_CTL, val); } /* @@ -455,14 +456,14 @@ static void intel_imc_init(struct cpuinfo_x86 *c) { u64 error_control; - switch (c->x86_model) { - case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE_X: - case INTEL_FAM6_HASWELL_X: - if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) + switch (c->x86_vfm) { + case INTEL_SANDYBRIDGE_X: + case INTEL_IVYBRIDGE_X: + case INTEL_HASWELL_X: + if (rdmsrq_safe(MSR_ERROR_CONTROL, &error_control)) return; error_control |= 2; - wrmsrl_safe(MSR_ERROR_CONTROL, error_control); + wrmsrq_safe(MSR_ERROR_CONTROL, error_control); break; } } @@ -484,12 +485,11 @@ bool intel_filter_mce(struct mce *m) struct cpuinfo_x86 *c = &boot_cpu_data; /* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */ - if ((c->x86 == 6) && - ((c->x86_model == INTEL_FAM6_HASWELL) || - (c->x86_model == INTEL_FAM6_HASWELL_L) || - (c->x86_model == INTEL_FAM6_BROADWELL) || - (c->x86_model == INTEL_FAM6_HASWELL_G) || - (c->x86_model == INTEL_FAM6_SKYLAKE_X)) && + if ((c->x86_vfm == INTEL_HASWELL || + c->x86_vfm == INTEL_HASWELL_L || + c->x86_vfm == INTEL_BROADWELL || + c->x86_vfm == INTEL_HASWELL_G || + c->x86_vfm == INTEL_SKYLAKE_X) && (m->bank == 0) && ((m->status & 0xa0000000ffffffff) == 0x80000000000f0005)) return true; diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 01f8f03969e6..b5ba598e54cb 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -26,13 +26,13 @@ extern struct blocking_notifier_head x86_mce_decoder_chain; struct mce_evt_llist { struct llist_node llnode; - struct mce mce; + struct mce_hw_err err; }; void mce_gen_pool_process(struct work_struct *__unused); bool mce_gen_pool_empty(void); -int mce_gen_pool_add(struct mce *mce); -int mce_gen_pool_init(void); +bool mce_gen_pool_add(struct mce_hw_err *err); +bool mce_gen_pool_init(void); struct llist_node *mce_gen_pool_prepare_records(void); int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp); @@ -261,6 +261,8 @@ enum mca_msr { /* Decide whether to add MCE record to MCE event pool or filter it out. */ extern bool filter_mce(struct mce *m); +void mce_prep_record_common(struct mce *m); +void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m); #ifdef CONFIG_X86_MCE_AMD extern bool amd_filter_mce(struct mce *m); @@ -310,7 +312,7 @@ static __always_inline void pentium_machine_check(struct pt_regs *regs) {} static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif -noinstr u64 mce_rdmsrl(u32 msr); +noinstr u64 mce_rdmsrq(u32 msr); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index c4477162c07d..2235a7477436 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -12,7 +12,7 @@ #include <linux/uaccess.h> #include <asm/mce.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/traps.h> #include <asm/insn.h> #include <asm/insn-eval.h> @@ -39,20 +39,20 @@ static struct severity { u64 mask; u64 result; unsigned char sev; - unsigned char mcgmask; - unsigned char mcgres; + unsigned short mcgmask; + unsigned short mcgres; unsigned char ser; unsigned char context; unsigned char excp; unsigned char covered; - unsigned char cpu_model; + unsigned int cpu_vfm; unsigned char cpu_minstepping; unsigned char bank_lo, bank_hi; char *msg; } severities[] = { #define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } #define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h -#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s +#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s #define KERNEL .context = IN_KERNEL #define USER .context = IN_USER #define KERNEL_RECOV .context = IN_KERNEL_RECOV @@ -128,7 +128,7 @@ static struct severity { MCESEV( AO, "Uncorrected Patrol Scrub Error", SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0), - MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18) + VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18) ), /* ignore OVER for UCNA */ @@ -174,6 +174,18 @@ static struct severity { USER ), MCESEV( + AR, "Data load error in SEAM non-root mode", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR), + KERNEL + ), + MCESEV( + AR, "Instruction fetch error in SEAM non-root mode", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR), + KERNEL + ), + MCESEV( PANIC, "Data load in unrecoverable area of kernel", SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), KERNEL @@ -288,14 +300,12 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs) copy_user = is_copy_from_user(regs); instrumentation_end(); - switch (fixup_type) { - case EX_TYPE_UACCESS: - case EX_TYPE_COPY: - if (!copy_user) - return IN_KERNEL; - m->kflags |= MCE_IN_KERNEL_COPYIN; - fallthrough; + if (copy_user) { + m->kflags |= MCE_IN_KERNEL_COPYIN | MCE_IN_KERNEL_RECOV; + return IN_KERNEL_RECOV; + } + switch (fixup_type) { case EX_TYPE_FAULT_MCE_SAFE: case EX_TYPE_DEFAULT_MCE_SAFE: m->kflags |= MCE_IN_KERNEL_RECOV; @@ -386,7 +396,7 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char continue; if (s->excp && excp != s->excp) continue; - if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model) + if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm) continue; if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping) continue; diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c index 89e31e1e5c9c..f4a007616468 100644 --- a/arch/x86/kernel/cpu/mce/threshold.c +++ b/arch/x86/kernel/cpu/mce/threshold.c @@ -90,7 +90,7 @@ void cmci_storm_end(unsigned int bank) storm->banks[bank].in_storm_mode = false; /* If no banks left in storm mode, stop polling. */ - if (!this_cpu_dec_return(storm_desc.stormy_bank_count)) + if (!--storm->stormy_bank_count) mce_timer_kick(false); } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 13b45b9c806d..097e39327942 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -23,17 +23,22 @@ #include <linux/earlycpio.h> #include <linux/firmware.h> +#include <linux/bsearch.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/initrd.h> #include <linux/kernel.h> #include <linux/pci.h> +#include <crypto/sha2.h> + #include <asm/microcode.h> #include <asm/processor.h> +#include <asm/cmdline.h> #include <asm/setup.h> #include <asm/cpu.h> #include <asm/msr.h> +#include <asm/tlb.h> #include "internal.h" @@ -84,13 +89,36 @@ struct microcode_amd { unsigned int mpb[]; }; -#define PATCH_MAX_SIZE (3 * PAGE_SIZE) - static struct equiv_cpu_table { unsigned int num_entries; struct equiv_cpu_entry *entry; } equiv_table; +union zen_patch_rev { + struct { + __u32 rev : 8, + stepping : 4, + model : 4, + __reserved : 4, + ext_model : 4, + ext_fam : 8; + }; + __u32 ucode_rev; +}; + +union cpuid_1_eax { + struct { + __u32 stepping : 4, + model : 4, + family : 4, + __reserved0 : 4, + ext_model : 4, + ext_fam : 8, + __reserved1 : 4; + }; + __u32 full; +}; + /* * This points to the current valid container of microcode patches which we will * save from the initrd/builtin before jettisoning its contents. @mc is the @@ -98,7 +126,6 @@ static struct equiv_cpu_table { */ struct cont_desc { struct microcode_amd *mc; - u32 cpuid_1_eax; u32 psize; u8 *data; size_t size; @@ -111,10 +138,151 @@ struct cont_desc { static const char ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; +/* + * This is CPUID(1).EAX on the BSP. It is used in two ways: + * + * 1. To ignore the equivalence table on Zen1 and newer. + * + * 2. To match which patches to load because the patch revision ID + * already contains the f/m/s for which the microcode is destined + * for. + */ +static u32 bsp_cpuid_1_eax __ro_after_init; + +static bool sha_check = true; + +struct patch_digest { + u32 patch_id; + u8 sha256[SHA256_DIGEST_SIZE]; +}; + +#include "amd_shas.c" + +static int cmp_id(const void *key, const void *elem) +{ + struct patch_digest *pd = (struct patch_digest *)elem; + u32 patch_id = *(u32 *)key; + + if (patch_id == pd->patch_id) + return 0; + else if (patch_id < pd->patch_id) + return -1; + else + return 1; +} + +static bool need_sha_check(u32 cur_rev) +{ + switch (cur_rev >> 8) { + case 0x80012: return cur_rev <= 0x800126f; break; + case 0x80082: return cur_rev <= 0x800820f; break; + case 0x83010: return cur_rev <= 0x830107c; break; + case 0x86001: return cur_rev <= 0x860010e; break; + case 0x86081: return cur_rev <= 0x8608108; break; + case 0x87010: return cur_rev <= 0x8701034; break; + case 0x8a000: return cur_rev <= 0x8a0000a; break; + case 0xa0010: return cur_rev <= 0xa00107a; break; + case 0xa0011: return cur_rev <= 0xa0011da; break; + case 0xa0012: return cur_rev <= 0xa001243; break; + case 0xa0082: return cur_rev <= 0xa00820e; break; + case 0xa1011: return cur_rev <= 0xa101153; break; + case 0xa1012: return cur_rev <= 0xa10124e; break; + case 0xa1081: return cur_rev <= 0xa108109; break; + case 0xa2010: return cur_rev <= 0xa20102f; break; + case 0xa2012: return cur_rev <= 0xa201212; break; + case 0xa4041: return cur_rev <= 0xa404109; break; + case 0xa5000: return cur_rev <= 0xa500013; break; + case 0xa6012: return cur_rev <= 0xa60120a; break; + case 0xa7041: return cur_rev <= 0xa704109; break; + case 0xa7052: return cur_rev <= 0xa705208; break; + case 0xa7080: return cur_rev <= 0xa708009; break; + case 0xa70c0: return cur_rev <= 0xa70C009; break; + case 0xaa001: return cur_rev <= 0xaa00116; break; + case 0xaa002: return cur_rev <= 0xaa00218; break; + case 0xb0021: return cur_rev <= 0xb002146; break; + case 0xb1010: return cur_rev <= 0xb101046; break; + case 0xb2040: return cur_rev <= 0xb204031; break; + case 0xb4040: return cur_rev <= 0xb404031; break; + case 0xb6000: return cur_rev <= 0xb600031; break; + case 0xb7000: return cur_rev <= 0xb700031; break; + default: break; + } + + pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n"); + pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev); + return true; +} + +static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len) +{ + struct patch_digest *pd = NULL; + u8 digest[SHA256_DIGEST_SIZE]; + int i; + + if (x86_family(bsp_cpuid_1_eax) < 0x17) + return true; + + if (!need_sha_check(cur_rev)) + return true; + + if (!sha_check) + return true; + + pd = bsearch(&patch_id, phashes, ARRAY_SIZE(phashes), sizeof(struct patch_digest), cmp_id); + if (!pd) { + pr_err("No sha256 digest for patch ID: 0x%x found\n", patch_id); + return false; + } + + sha256(data, len, digest); + + if (memcmp(digest, pd->sha256, sizeof(digest))) { + pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id); + + for (i = 0; i < SHA256_DIGEST_SIZE; i++) + pr_cont("0x%x ", digest[i]); + pr_info("\n"); + + return false; + } + + return true; +} + +static u32 get_patch_level(void) +{ + u32 rev, dummy __always_unused; + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + return rev; +} + +static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val) +{ + union zen_patch_rev p; + union cpuid_1_eax c; + + p.ucode_rev = val; + c.full = 0; + + c.stepping = p.stepping; + c.model = p.model; + c.ext_model = p.ext_model; + c.family = 0xf; + c.ext_fam = p.ext_fam; + + return c; +} + static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig) { unsigned int i; + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return 0; + if (!et || !et->num_entries) return 0; @@ -161,6 +329,10 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) if (!verify_container(buf, buf_size)) return false; + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return true; + cont_type = hdr[1]; if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) { pr_debug("Wrong microcode container equivalence table type: %u.\n", @@ -187,8 +359,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size) * On success, @sh_psize returns the patch size according to the section header, * to the caller. */ -static bool -__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) +static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) { u32 p_type, p_size; const u32 *hdr; @@ -224,12 +395,13 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize) * exceed the per-family maximum). @sh_psize is the size read from the section * header. */ -static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size) +static bool __verify_patch_size(u32 sh_psize, size_t buf_size) { + u8 family = x86_family(bsp_cpuid_1_eax); u32 max_size; if (family >= 0x15) - return min_t(u32, sh_psize, buf_size); + goto ret; #define F1XH_MPB_MAX_SIZE 2048 #define F14H_MPB_MAX_SIZE 1824 @@ -243,13 +415,15 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size break; default: WARN(1, "%s: WTF family: 0x%x\n", __func__, family); - return 0; + return false; } - if (sh_psize > min_t(u32, buf_size, max_size)) - return 0; + if (sh_psize > max_size) + return false; - return sh_psize; +ret: + /* Working with the whole buffer so < is ok. */ + return sh_psize <= buf_size; } /* @@ -260,11 +434,10 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size * positive: patch is not for this family, skip it * 0: success */ -static int -verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) +static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size) { + u8 family = x86_family(bsp_cpuid_1_eax); struct microcode_header_amd *mc_hdr; - unsigned int ret; u32 sh_psize; u16 proc_id; u8 patch_fam; @@ -288,8 +461,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) return -1; } - ret = __verify_patch_size(family, sh_psize, buf_size); - if (!ret) { + if (!__verify_patch_size(sh_psize, buf_size)) { pr_debug("Per-family patch size mismatch.\n"); return -1; } @@ -310,10 +482,19 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size) return 0; } +static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id) +{ + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return ucode_rev_to_cpuid(mc->hdr.patch_id).full == bsp_cpuid_1_eax; + else + return eq_id == mc->hdr.processor_rev_id; +} + /* * This scans the ucode blob for the proper container as we can have multiple - * containers glued together. Returns the equivalence ID from the equivalence - * table or 0 if none found. + * containers glued together. + * * Returns the amount of bytes consumed while scanning. @desc contains all the * data we're going to use in later stages of the application. */ @@ -338,7 +519,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) * doesn't contain a patch for the CPU, scan through the whole container * so that it can be skipped in case there are other containers appended. */ - eq_id = find_equiv_id(&table, desc->cpuid_1_eax); + eq_id = find_equiv_id(&table, bsp_cpuid_1_eax); buf += hdr[2] + CONTAINER_HDR_SZ; size -= hdr[2] + CONTAINER_HDR_SZ; @@ -352,7 +533,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) u32 patch_size; int ret; - ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size); + ret = verify_patch(buf, size, &patch_size); if (ret < 0) { /* * Patch verification failed, skip to the next container, if @@ -365,7 +546,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc) } mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE); - if (eq_id == mc->hdr.processor_rev_id) { + if (mc_patch_matches(mc, eq_id)) { desc->psize = patch_size; desc->mc = mc; } @@ -415,59 +596,41 @@ static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc) } } -static int __apply_microcode_amd(struct microcode_amd *mc) +static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, + unsigned int psize) { - u32 rev, dummy; + unsigned long p_addr = (unsigned long)&mc->hdr.data_code; - native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code); + if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize)) + return false; - /* verify patch application was successful */ - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev != mc->hdr.patch_id) - return -1; + native_wrmsrq(MSR_AMD64_PATCH_LOADER, p_addr); - return 0; -} + if (x86_family(bsp_cpuid_1_eax) == 0x17) { + unsigned long p_addr_end = p_addr + psize - 1; -/* - * Early load occurs before we can vmalloc(). So we look for the microcode - * patch container file in initrd, traverse equivalent cpu table, look for a - * matching microcode patch, and update, all in initrd memory in place. - * When vmalloc() is available for use later -- on 64-bit during first AP load, - * and on 32-bit during save_microcode_in_initrd_amd() -- we can call - * load_microcode_amd() to save equivalent cpu table and microcode patches in - * kernel heap memory. - * - * Returns true if container found (sets @desc), false otherwise. - */ -static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size) -{ - struct cont_desc desc = { 0 }; - struct microcode_amd *mc; - bool ret = false; - - desc.cpuid_1_eax = cpuid_1_eax; - - scan_containers(ucode, size, &desc); + invlpg(p_addr); - mc = desc.mc; - if (!mc) - return ret; + /* + * Flush next page too if patch image is crossing a page + * boundary. + */ + if (p_addr >> PAGE_SHIFT != p_addr_end >> PAGE_SHIFT) + invlpg(p_addr_end); + } - /* - * Allow application of the same revision to pick up SMT-specific - * changes even if the revision of the other SMT thread is already - * up-to-date. - */ - if (old_rev > mc->hdr.patch_id) - return ret; + /* verify patch application was successful */ + *cur_rev = get_patch_level(); + if (*cur_rev != mc->hdr.patch_id) + return false; - return !__apply_microcode_amd(mc); + return true; } -static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) +static bool get_builtin_microcode(struct cpio_data *cp) { char fw_name[36] = "amd-ucode/microcode_amd.bin"; + u8 family = x86_family(bsp_cpuid_1_eax); struct firmware fw; if (IS_ENABLED(CONFIG_X86_32)) @@ -486,85 +649,144 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) return false; } -static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret) +static bool __init find_blobs_in_containers(struct cpio_data *ret) { struct cpio_data cp; + bool found; - if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax))) + if (!get_builtin_microcode(&cp)) cp = find_microcode_in_initrd(ucode_path); - *ret = cp; + found = cp.data && cp.size; + if (found) + *ret = cp; + + return found; } +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax) { + struct cont_desc desc = { }; + struct microcode_amd *mc; struct cpio_data cp = { }; - u32 dummy; + char buf[4]; + u32 rev; + + if (cmdline_find_option(boot_command_line, "microcode.amd_sha_check", buf, 4)) { + if (!strncmp(buf, "off", 3)) { + sha_check = false; + pr_warn_once("It is a very very bad idea to disable the blobs SHA check!\n"); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } + } - native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy); + bsp_cpuid_1_eax = cpuid_1_eax; + + rev = get_patch_level(); + ed->old_rev = rev; /* Needed in load_microcode_amd() */ ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax; - find_blobs_in_containers(cpuid_1_eax, &cp); - if (!(cp.data && cp.size)) + if (!find_blobs_in_containers(&cp)) return; - if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size)) - native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy); -} - -static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size); - -static int __init save_microcode_in_initrd(void) -{ - unsigned int cpuid_1_eax = native_cpuid_eax(1); - struct cpuinfo_x86 *c = &boot_cpu_data; - struct cont_desc desc = { 0 }; - enum ucode_state ret; - struct cpio_data cp; - - if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) - return 0; + scan_containers(cp.data, cp.size, &desc); - find_blobs_in_containers(cpuid_1_eax, &cp); - if (!(cp.data && cp.size)) - return -EINVAL; + mc = desc.mc; + if (!mc) + return; - desc.cpuid_1_eax = cpuid_1_eax; + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (ed->old_rev > mc->hdr.patch_id) + return; - scan_containers(cp.data, cp.size, &desc); - if (!desc.mc) - return -EINVAL; + if (__apply_microcode_amd(mc, &rev, desc.psize)) + ed->new_rev = rev; +} - ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); - if (ret > UCODE_UPDATED) - return -EINVAL; +static inline bool patch_cpus_equivalent(struct ucode_patch *p, + struct ucode_patch *n, + bool ignore_stepping) +{ + /* Zen and newer hardcode the f/m/s in the patch ID */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) { + union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id); + union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id); + + if (ignore_stepping) { + p_cid.stepping = 0; + n_cid.stepping = 0; + } - return 0; + return p_cid.full == n_cid.full; + } else { + return p->equiv_cpu == n->equiv_cpu; + } } -early_initcall(save_microcode_in_initrd); /* * a small, trivial cache of per-family ucode patches */ -static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equiv_cpu) { struct ucode_patch *p; + struct ucode_patch n; + + n.equiv_cpu = equiv_cpu; + n.patch_id = uci->cpu_sig.rev; + + WARN_ON_ONCE(!n.patch_id); list_for_each_entry(p, µcode_cache, plist) - if (p->equiv_cpu == equiv_cpu) + if (patch_cpus_equivalent(p, &n, false)) return p; + return NULL; } +static inline int patch_newer(struct ucode_patch *p, struct ucode_patch *n) +{ + /* Zen and newer hardcode the f/m/s in the patch ID */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) { + union zen_patch_rev zp, zn; + + zp.ucode_rev = p->patch_id; + zn.ucode_rev = n->patch_id; + + if (zn.stepping != zp.stepping) + return -1; + + return zn.rev > zp.rev; + } else { + return n->patch_id > p->patch_id; + } +} + static void update_cache(struct ucode_patch *new_patch) { struct ucode_patch *p; + int ret; list_for_each_entry(p, µcode_cache, plist) { - if (p->equiv_cpu == new_patch->equiv_cpu) { - if (p->patch_id >= new_patch->patch_id) { + if (patch_cpus_equivalent(p, new_patch, true)) { + ret = patch_newer(p, new_patch); + if (ret < 0) + continue; + else if (!ret) { /* we already have the latest patch */ kfree(new_patch->data); kfree(new_patch); @@ -595,13 +817,17 @@ static void free_cache(void) static struct ucode_patch *find_patch(unsigned int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u16 equiv_id; + u16 equiv_id = 0; - equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); - if (!equiv_id) - return NULL; + uci->cpu_sig.rev = get_patch_level(); - return cache_find_patch(equiv_id); + if (x86_family(bsp_cpuid_1_eax) < 0x17) { + equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig); + if (!equiv_id) + return NULL; + } + + return cache_find_patch(uci, equiv_id); } void reload_ucode_amd(unsigned int cpu) @@ -616,22 +842,20 @@ void reload_ucode_amd(unsigned int cpu) mc = p->data; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - + rev = get_patch_level(); if (rev < mc->hdr.patch_id) { - if (!__apply_microcode_amd(mc)) - pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id); + if (__apply_microcode_amd(mc, &rev, p->size)) + pr_info_once("reload revision: 0x%08x\n", rev); } } static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { - struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct ucode_patch *p; csig->sig = cpuid_eax(0x00000001); - csig->rev = c->microcode; + csig->rev = get_patch_level(); /* * a patch could have been loaded early, set uci->mc so that @@ -651,7 +875,7 @@ static enum ucode_state apply_microcode_amd(int cpu) struct ucode_cpu_info *uci; struct ucode_patch *p; enum ucode_state ret; - u32 rev, dummy __always_unused; + u32 rev; BUG_ON(raw_smp_processor_id() != cpu); @@ -661,18 +885,18 @@ static enum ucode_state apply_microcode_amd(int cpu) if (!p) return UCODE_NFOUND; + rev = uci->cpu_sig.rev; + mc_amd = p->data; uci->mc = p->data; - rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* need to apply patch? */ if (rev > mc_amd->hdr.patch_id) { ret = UCODE_OK; goto out; } - if (__apply_microcode_amd(mc_amd)) { + if (!__apply_microcode_amd(mc_amd, &rev, p->size)) { pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); return UCODE_ERROR; @@ -711,6 +935,10 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size) hdr = (const u32 *)buf; equiv_tbl_len = hdr[2]; + /* Zen and newer do not need an equivalence table. */ + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + goto out; + equiv_table.entry = vmalloc(equiv_tbl_len); if (!equiv_table.entry) { pr_err("failed to allocate equivalent CPU table\n"); @@ -720,12 +948,16 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size) memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len); equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry); +out: /* add header length */ return equiv_tbl_len + CONTAINER_HDR_SZ; } static void free_equiv_cpu_table(void) { + if (x86_family(bsp_cpuid_1_eax) >= 0x17) + return; + vfree(equiv_table.entry); memset(&equiv_table, 0, sizeof(equiv_table)); } @@ -751,7 +983,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, u16 proc_id; int ret; - ret = verify_patch(family, fw, leftover, patch_size); + ret = verify_patch(fw, leftover, patch_size); if (ret) return ret; @@ -776,7 +1008,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, patch->patch_id = mc_hdr->patch_id; patch->equiv_cpu = proc_id; - pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", + pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n", __func__, patch->patch_id, proc_id); /* ... and add to cache. */ @@ -786,8 +1018,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover, } /* Scan the blob in @data and add microcode patches to the cache. */ -static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, - size_t size) +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size) { u8 *fw = (u8 *)data; size_t offset; @@ -820,23 +1051,32 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size) { - struct cpuinfo_x86 *c; - unsigned int nid, cpu; - struct ucode_patch *p; enum ucode_state ret; /* free old equiv table */ free_equiv_cpu_table(); ret = __load_microcode_amd(family, data, size); - if (ret != UCODE_OK) { + if (ret != UCODE_OK) cleanup(); + + return ret; +} + +static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +{ + struct cpuinfo_x86 *c; + unsigned int nid, cpu; + struct ucode_patch *p; + enum ucode_state ret; + + ret = _load_microcode_amd(family, data, size); + if (ret != UCODE_OK) return ret; - } - for_each_node(nid) { + for_each_node_with_cpus(nid) { cpu = cpumask_first(cpumask_of_node(nid)); c = &cpu_data(cpu); @@ -853,6 +1093,34 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz return ret; } +static int __init save_microcode_in_initrd(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + struct cont_desc desc = { 0 }; + unsigned int cpuid_1_eax; + enum ucode_state ret; + struct cpio_data cp; + + if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) + return 0; + + cpuid_1_eax = native_cpuid_eax(1); + + if (!find_blobs_in_containers(&cp)) + return -EINVAL; + + scan_containers(cp.data, cp.size, &desc); + if (!desc.mc) + return -EINVAL; + + ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size); + if (ret > UCODE_UPDATED) + return -EINVAL; + + return 0; +} +early_initcall(save_microcode_in_initrd); + /* * AMD microcode firmware naming convention, up to family 15h they are in * the legacy file: @@ -907,11 +1175,18 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } +static void finalize_late_load_amd(int result) +{ + if (result) + cleanup(); +} + static struct microcode_ops microcode_amd_ops = { .request_microcode_fw = request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, .microcode_fini_cpu = microcode_fini_cpu_amd, + .finalize_late_load = finalize_late_load_amd, .nmi_safe = true, }; diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c new file mode 100644 index 000000000000..2a1655b1fdd8 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd_shas.c @@ -0,0 +1,444 @@ +/* Keep 'em sorted. */ +static const struct patch_digest phashes[] = { + { 0x8001227, { + 0x99,0xc0,0x9b,0x2b,0xcc,0x9f,0x52,0x1b, + 0x1a,0x5f,0x1d,0x83,0xa1,0x6c,0xc4,0x46, + 0xe2,0x6c,0xda,0x73,0xfb,0x2d,0x23,0xa8, + 0x77,0xdc,0x15,0x31,0x33,0x4a,0x46,0x18, + } + }, + { 0x8001250, { + 0xc0,0x0b,0x6b,0x19,0xfd,0x5c,0x39,0x60, + 0xd5,0xc3,0x57,0x46,0x54,0xe4,0xd1,0xaa, + 0xa8,0xf7,0x1f,0xa8,0x6a,0x60,0x3e,0xe3, + 0x27,0x39,0x8e,0x53,0x30,0xf8,0x49,0x19, + } + }, + { 0x800126e, { + 0xf3,0x8b,0x2b,0xb6,0x34,0xe3,0xc8,0x2c, + 0xef,0xec,0x63,0x6d,0xc8,0x76,0x77,0xb3, + 0x25,0x5a,0xb7,0x52,0x8c,0x83,0x26,0xe6, + 0x4c,0xbe,0xbf,0xe9,0x7d,0x22,0x6a,0x43, + } + }, + { 0x800126f, { + 0x2b,0x5a,0xf2,0x9c,0xdd,0xd2,0x7f,0xec, + 0xec,0x96,0x09,0x57,0xb0,0x96,0x29,0x8b, + 0x2e,0x26,0x91,0xf0,0x49,0x33,0x42,0x18, + 0xdd,0x4b,0x65,0x5a,0xd4,0x15,0x3d,0x33, + } + }, + { 0x800820d, { + 0x68,0x98,0x83,0xcd,0x22,0x0d,0xdd,0x59, + 0x73,0x2c,0x5b,0x37,0x1f,0x84,0x0e,0x67, + 0x96,0x43,0x83,0x0c,0x46,0x44,0xab,0x7c, + 0x7b,0x65,0x9e,0x57,0xb5,0x90,0x4b,0x0e, + } + }, + { 0x8301025, { + 0xe4,0x7d,0xdb,0x1e,0x14,0xb4,0x5e,0x36, + 0x8f,0x3e,0x48,0x88,0x3c,0x6d,0x76,0xa1, + 0x59,0xc6,0xc0,0x72,0x42,0xdf,0x6c,0x30, + 0x6f,0x0b,0x28,0x16,0x61,0xfc,0x79,0x77, + } + }, + { 0x8301055, { + 0x81,0x7b,0x99,0x1b,0xae,0x2d,0x4f,0x9a, + 0xef,0x13,0xce,0xb5,0x10,0xaf,0x6a,0xea, + 0xe5,0xb0,0x64,0x98,0x10,0x68,0x34,0x3b, + 0x9d,0x7a,0xd6,0x22,0x77,0x5f,0xb3,0x5b, + } + }, + { 0x8301072, { + 0xcf,0x76,0xa7,0x1a,0x49,0xdf,0x2a,0x5e, + 0x9e,0x40,0x70,0xe5,0xdd,0x8a,0xa8,0x28, + 0x20,0xdc,0x91,0xd8,0x2c,0xa6,0xa0,0xb1, + 0x2d,0x22,0x26,0x94,0x4b,0x40,0x85,0x30, + } + }, + { 0x830107a, { + 0x2a,0x65,0x8c,0x1a,0x5e,0x07,0x21,0x72, + 0xdf,0x90,0xa6,0x51,0x37,0xd3,0x4b,0x34, + 0xc4,0xda,0x03,0xe1,0x8a,0x6c,0xfb,0x20, + 0x04,0xb2,0x81,0x05,0xd4,0x87,0xf4,0x0a, + } + }, + { 0x830107b, { + 0xb3,0x43,0x13,0x63,0x56,0xc1,0x39,0xad, + 0x10,0xa6,0x2b,0xcc,0x02,0xe6,0x76,0x2a, + 0x1e,0x39,0x58,0x3e,0x23,0x6e,0xa4,0x04, + 0x95,0xea,0xf9,0x6d,0xc2,0x8a,0x13,0x19, + } + }, + { 0x830107c, { + 0x21,0x64,0xde,0xfb,0x9f,0x68,0x96,0x47, + 0x70,0x5c,0xe2,0x8f,0x18,0x52,0x6a,0xac, + 0xa4,0xd2,0x2e,0xe0,0xde,0x68,0x66,0xc3, + 0xeb,0x1e,0xd3,0x3f,0xbc,0x51,0x1d,0x38, + } + }, + { 0x860010d, { + 0x86,0xb6,0x15,0x83,0xbc,0x3b,0x9c,0xe0, + 0xb3,0xef,0x1d,0x99,0x84,0x35,0x15,0xf7, + 0x7c,0x2a,0xc6,0x42,0xdb,0x73,0x07,0x5c, + 0x7d,0xc3,0x02,0xb5,0x43,0x06,0x5e,0xf8, + } + }, + { 0x8608108, { + 0x14,0xfe,0x57,0x86,0x49,0xc8,0x68,0xe2, + 0x11,0xa3,0xcb,0x6e,0xff,0x6e,0xd5,0x38, + 0xfe,0x89,0x1a,0xe0,0x67,0xbf,0xc4,0xcc, + 0x1b,0x9f,0x84,0x77,0x2b,0x9f,0xaa,0xbd, + } + }, + { 0x8701034, { + 0xc3,0x14,0x09,0xa8,0x9c,0x3f,0x8d,0x83, + 0x9b,0x4c,0xa5,0xb7,0x64,0x8b,0x91,0x5d, + 0x85,0x6a,0x39,0x26,0x1e,0x14,0x41,0xa8, + 0x75,0xea,0xa6,0xf9,0xc9,0xd1,0xea,0x2b, + } + }, + { 0x8a00008, { + 0xd7,0x2a,0x93,0xdc,0x05,0x2f,0xa5,0x6e, + 0x0c,0x61,0x2c,0x07,0x9f,0x38,0xe9,0x8e, + 0xef,0x7d,0x2a,0x05,0x4d,0x56,0xaf,0x72, + 0xe7,0x56,0x47,0x6e,0x60,0x27,0xd5,0x8c, + } + }, + { 0x8a0000a, { + 0x73,0x31,0x26,0x22,0xd4,0xf9,0xee,0x3c, + 0x07,0x06,0xe7,0xb9,0xad,0xd8,0x72,0x44, + 0x33,0x31,0xaa,0x7d,0xc3,0x67,0x0e,0xdb, + 0x47,0xb5,0xaa,0xbc,0xf5,0xbb,0xd9,0x20, + } + }, + { 0xa00104c, { + 0x3c,0x8a,0xfe,0x04,0x62,0xd8,0x6d,0xbe, + 0xa7,0x14,0x28,0x64,0x75,0xc0,0xa3,0x76, + 0xb7,0x92,0x0b,0x97,0x0a,0x8e,0x9c,0x5b, + 0x1b,0xc8,0x9d,0x3a,0x1e,0x81,0x3d,0x3b, + } + }, + { 0xa00104e, { + 0xc4,0x35,0x82,0x67,0xd2,0x86,0xe5,0xb2, + 0xfd,0x69,0x12,0x38,0xc8,0x77,0xba,0xe0, + 0x70,0xf9,0x77,0x89,0x10,0xa6,0x74,0x4e, + 0x56,0x58,0x13,0xf5,0x84,0x70,0x28,0x0b, + } + }, + { 0xa001053, { + 0x92,0x0e,0xf4,0x69,0x10,0x3b,0xf9,0x9d, + 0x31,0x1b,0xa6,0x99,0x08,0x7d,0xd7,0x25, + 0x7e,0x1e,0x89,0xba,0x35,0x8d,0xac,0xcb, + 0x3a,0xb4,0xdf,0x58,0x12,0xcf,0xc0,0xc3, + } + }, + { 0xa001058, { + 0x33,0x7d,0xa9,0xb5,0x4e,0x62,0x13,0x36, + 0xef,0x66,0xc9,0xbd,0x0a,0xa6,0x3b,0x19, + 0xcb,0xf5,0xc2,0xc3,0x55,0x47,0x20,0xec, + 0x1f,0x7b,0xa1,0x44,0x0e,0x8e,0xa4,0xb2, + } + }, + { 0xa001075, { + 0x39,0x02,0x82,0xd0,0x7c,0x26,0x43,0xe9, + 0x26,0xa3,0xd9,0x96,0xf7,0x30,0x13,0x0a, + 0x8a,0x0e,0xac,0xe7,0x1d,0xdc,0xe2,0x0f, + 0xcb,0x9e,0x8d,0xbc,0xd2,0xa2,0x44,0xe0, + } + }, + { 0xa001078, { + 0x2d,0x67,0xc7,0x35,0xca,0xef,0x2f,0x25, + 0x4c,0x45,0x93,0x3f,0x36,0x01,0x8c,0xce, + 0xa8,0x5b,0x07,0xd3,0xc1,0x35,0x3c,0x04, + 0x20,0xa2,0xfc,0xdc,0xe6,0xce,0x26,0x3e, + } + }, + { 0xa001079, { + 0x43,0xe2,0x05,0x9c,0xfd,0xb7,0x5b,0xeb, + 0x5b,0xe9,0xeb,0x3b,0x96,0xf4,0xe4,0x93, + 0x73,0x45,0x3e,0xac,0x8d,0x3b,0xe4,0xdb, + 0x10,0x31,0xc1,0xe4,0xa2,0xd0,0x5a,0x8a, + } + }, + { 0xa00107a, { + 0x5f,0x92,0xca,0xff,0xc3,0x59,0x22,0x5f, + 0x02,0xa0,0x91,0x3b,0x4a,0x45,0x10,0xfd, + 0x19,0xe1,0x8a,0x6d,0x9a,0x92,0xc1,0x3f, + 0x75,0x78,0xac,0x78,0x03,0x1d,0xdb,0x18, + } + }, + { 0xa001143, { + 0x56,0xca,0xf7,0x43,0x8a,0x4c,0x46,0x80, + 0xec,0xde,0xe5,0x9c,0x50,0x84,0x9a,0x42, + 0x27,0xe5,0x51,0x84,0x8f,0x19,0xc0,0x8d, + 0x0c,0x25,0xb4,0xb0,0x8f,0x10,0xf3,0xf8, + } + }, + { 0xa001144, { + 0x42,0xd5,0x9b,0xa7,0xd6,0x15,0x29,0x41, + 0x61,0xc4,0x72,0x3f,0xf3,0x06,0x78,0x4b, + 0x65,0xf3,0x0e,0xfa,0x9c,0x87,0xde,0x25, + 0xbd,0xb3,0x9a,0xf4,0x75,0x13,0x53,0xdc, + } + }, + { 0xa00115d, { + 0xd4,0xc4,0x49,0x36,0x89,0x0b,0x47,0xdd, + 0xfb,0x2f,0x88,0x3b,0x5f,0xf2,0x8e,0x75, + 0xc6,0x6c,0x37,0x5a,0x90,0x25,0x94,0x3e, + 0x36,0x9c,0xae,0x02,0x38,0x6c,0xf5,0x05, + } + }, + { 0xa001173, { + 0x28,0xbb,0x9b,0xd1,0xa0,0xa0,0x7e,0x3a, + 0x59,0x20,0xc0,0xa9,0xb2,0x5c,0xc3,0x35, + 0x53,0x89,0xe1,0x4c,0x93,0x2f,0x1d,0xc3, + 0xe5,0xf7,0xf3,0xc8,0x9b,0x61,0xaa,0x9e, + } + }, + { 0xa0011a8, { + 0x97,0xc6,0x16,0x65,0x99,0xa4,0x85,0x3b, + 0xf6,0xce,0xaa,0x49,0x4a,0x3a,0xc5,0xb6, + 0x78,0x25,0xbc,0x53,0xaf,0x5d,0xcf,0xf4, + 0x23,0x12,0xbb,0xb1,0xbc,0x8a,0x02,0x2e, + } + }, + { 0xa0011ce, { + 0xcf,0x1c,0x90,0xa3,0x85,0x0a,0xbf,0x71, + 0x94,0x0e,0x80,0x86,0x85,0x4f,0xd7,0x86, + 0xae,0x38,0x23,0x28,0x2b,0x35,0x9b,0x4e, + 0xfe,0xb8,0xcd,0x3d,0x3d,0x39,0xc9,0x6a, + } + }, + { 0xa0011d1, { + 0xdf,0x0e,0xca,0xde,0xf6,0xce,0x5c,0x1e, + 0x4c,0xec,0xd7,0x71,0x83,0xcc,0xa8,0x09, + 0xc7,0xc5,0xfe,0xb2,0xf7,0x05,0xd2,0xc5, + 0x12,0xdd,0xe4,0xf3,0x92,0x1c,0x3d,0xb8, + } + }, + { 0xa0011d3, { + 0x91,0xe6,0x10,0xd7,0x57,0xb0,0x95,0x0b, + 0x9a,0x24,0xee,0xf7,0xcf,0x56,0xc1,0xa6, + 0x4a,0x52,0x7d,0x5f,0x9f,0xdf,0xf6,0x00, + 0x65,0xf7,0xea,0xe8,0x2a,0x88,0xe2,0x26, + } + }, + { 0xa0011d5, { + 0xed,0x69,0x89,0xf4,0xeb,0x64,0xc2,0x13, + 0xe0,0x51,0x1f,0x03,0x26,0x52,0x7d,0xb7, + 0x93,0x5d,0x65,0xca,0xb8,0x12,0x1d,0x62, + 0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21, + } + }, + { 0xa001223, { + 0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8, + 0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4, + 0x83,0x75,0x94,0xdd,0xeb,0x7e,0xb7,0x15, + 0x8e,0x3b,0x50,0x29,0x8a,0x9c,0xcc,0x45, + } + }, + { 0xa001224, { + 0x0e,0x0c,0xdf,0xb4,0x89,0xee,0x35,0x25, + 0xdd,0x9e,0xdb,0xc0,0x69,0x83,0x0a,0xad, + 0x26,0xa9,0xaa,0x9d,0xfc,0x3c,0xea,0xf9, + 0x6c,0xdc,0xd5,0x6d,0x8b,0x6e,0x85,0x4a, + } + }, + { 0xa001227, { + 0xab,0xc6,0x00,0x69,0x4b,0x50,0x87,0xad, + 0x5f,0x0e,0x8b,0xea,0x57,0x38,0xce,0x1d, + 0x0f,0x75,0x26,0x02,0xf6,0xd6,0x96,0xe9, + 0x87,0xb9,0xd6,0x20,0x27,0x7c,0xd2,0xe0, + } + }, + { 0xa001229, { + 0x7f,0x49,0x49,0x48,0x46,0xa5,0x50,0xa6, + 0x28,0x89,0x98,0xe2,0x9e,0xb4,0x7f,0x75, + 0x33,0xa7,0x04,0x02,0xe4,0x82,0xbf,0xb4, + 0xa5,0x3a,0xba,0x24,0x8d,0x31,0x10,0x1d, + } + }, + { 0xa00122e, { + 0x56,0x94,0xa9,0x5d,0x06,0x68,0xfe,0xaf, + 0xdf,0x7a,0xff,0x2d,0xdf,0x74,0x0f,0x15, + 0x66,0xfb,0x00,0xb5,0x51,0x97,0x9b,0xfa, + 0xcb,0x79,0x85,0x46,0x25,0xb4,0xd2,0x10, + } + }, + { 0xa001231, { + 0x0b,0x46,0xa5,0xfc,0x18,0x15,0xa0,0x9e, + 0xa6,0xdc,0xb7,0xff,0x17,0xf7,0x30,0x64, + 0xd4,0xda,0x9e,0x1b,0xc3,0xfc,0x02,0x3b, + 0xe2,0xc6,0x0e,0x41,0x54,0xb5,0x18,0xdd, + } + }, + { 0xa001234, { + 0x88,0x8d,0xed,0xab,0xb5,0xbd,0x4e,0xf7, + 0x7f,0xd4,0x0e,0x95,0x34,0x91,0xff,0xcc, + 0xfb,0x2a,0xcd,0xf7,0xd5,0xdb,0x4c,0x9b, + 0xd6,0x2e,0x73,0x50,0x8f,0x83,0x79,0x1a, + } + }, + { 0xa001236, { + 0x3d,0x30,0x00,0xb9,0x71,0xba,0x87,0x78, + 0xa8,0x43,0x55,0xc4,0x26,0x59,0xcf,0x9d, + 0x93,0xce,0x64,0x0e,0x8b,0x72,0x11,0x8b, + 0xa3,0x8f,0x51,0xe9,0xca,0x98,0xaa,0x25, + } + }, + { 0xa001238, { + 0x72,0xf7,0x4b,0x0c,0x7d,0x58,0x65,0xcc, + 0x00,0xcc,0x57,0x16,0x68,0x16,0xf8,0x2a, + 0x1b,0xb3,0x8b,0xe1,0xb6,0x83,0x8c,0x7e, + 0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59, + } + }, + { 0xa00820c, { + 0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3, + 0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63, + 0xf1,0x8c,0x88,0x45,0xd7,0x82,0x80,0xd1, + 0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2, + } + }, + { 0xa10113e, { + 0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10, + 0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0, + 0x15,0xe3,0x3f,0x4b,0x1d,0x0d,0x0a,0xd5, + 0xfa,0x90,0xc4,0xed,0x9d,0x90,0xaf,0x53, + } + }, + { 0xa101144, { + 0xb3,0x0b,0x26,0x9a,0xf8,0x7c,0x02,0x26, + 0x35,0x84,0x53,0xa4,0xd3,0x2c,0x7c,0x09, + 0x68,0x7b,0x96,0xb6,0x93,0xef,0xde,0xbc, + 0xfd,0x4b,0x15,0xd2,0x81,0xd3,0x51,0x47, + } + }, + { 0xa101148, { + 0x20,0xd5,0x6f,0x40,0x4a,0xf6,0x48,0x90, + 0xc2,0x93,0x9a,0xc2,0xfd,0xac,0xef,0x4f, + 0xfa,0xc0,0x3d,0x92,0x3c,0x6d,0x01,0x08, + 0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4, + } + }, + { 0xa10123e, { + 0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18, + 0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d, + 0x1d,0x13,0x53,0x63,0xfe,0x42,0x6f,0xfc, + 0x19,0x0f,0xf1,0xfc,0xa7,0xdd,0x89,0x1b, + } + }, + { 0xa101244, { + 0x71,0x56,0xb5,0x9f,0x21,0xbf,0xb3,0x3c, + 0x8c,0xd7,0x36,0xd0,0x34,0x52,0x1b,0xb1, + 0x46,0x2f,0x04,0xf0,0x37,0xd8,0x1e,0x72, + 0x24,0xa2,0x80,0x84,0x83,0x65,0x84,0xc0, + } + }, + { 0xa101248, { + 0xed,0x3b,0x95,0xa6,0x68,0xa7,0x77,0x3e, + 0xfc,0x17,0x26,0xe2,0x7b,0xd5,0x56,0x22, + 0x2c,0x1d,0xef,0xeb,0x56,0xdd,0xba,0x6e, + 0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75, + } + }, + { 0xa108108, { + 0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9, + 0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6, + 0xf5,0xd4,0x3f,0x7b,0x14,0xd5,0x60,0x2c, + 0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16, + } + }, + { 0xa20102d, { + 0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11, + 0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89, + 0x8b,0x89,0x2f,0xb5,0xbb,0x82,0xef,0x23, + 0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4, + } + }, + { 0xa201210, { + 0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe, + 0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9, + 0x6d,0x3d,0x0e,0x6b,0xa7,0xac,0xe3,0x68, + 0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41, + } + }, + { 0xa404107, { + 0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45, + 0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0, + 0x8b,0x0d,0x9f,0xf9,0x3a,0xdf,0xc6,0x81, + 0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99, + } + }, + { 0xa500011, { + 0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4, + 0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1, + 0xd7,0x5b,0x65,0x3a,0x7d,0xab,0xdf,0xa2, + 0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74, + } + }, + { 0xa601209, { + 0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32, + 0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30, + 0x15,0x86,0xcc,0x5d,0x97,0x0f,0xc0,0x46, + 0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d, + } + }, + { 0xa704107, { + 0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6, + 0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93, + 0x2a,0xad,0x8e,0x6b,0xea,0x9b,0xb7,0xc2, + 0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39, + } + }, + { 0xa705206, { + 0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4, + 0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7, + 0x67,0x6f,0x04,0x18,0xae,0x20,0x87,0x4b, + 0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc, + } + }, + { 0xa708007, { + 0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3, + 0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2, + 0x07,0xaa,0x3a,0xe0,0x57,0x13,0x72,0x80, + 0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93, + } + }, + { 0xa70c005, { + 0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b, + 0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f, + 0x1f,0x1f,0xf1,0x97,0xeb,0xfe,0x56,0x55, + 0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13, + } + }, + { 0xaa00116, { + 0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63, + 0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5, + 0xfe,0x1d,0x5e,0x65,0xc7,0xaa,0x92,0x4d, + 0x91,0xee,0x76,0xbb,0x4c,0x66,0x78,0xc9, + } + }, + { 0xaa00212, { + 0xbd,0x57,0x5d,0x0a,0x0a,0x30,0xc1,0x75, + 0x95,0x58,0x5e,0x93,0x02,0x28,0x43,0x71, + 0xed,0x42,0x29,0xc8,0xec,0x34,0x2b,0xb2, + 0x1a,0x65,0x4b,0xfe,0x07,0x0f,0x34,0xa1, + } + }, + { 0xaa00213, { + 0xed,0x58,0xb7,0x76,0x81,0x7f,0xd9,0x3a, + 0x1a,0xff,0x8b,0x34,0xb8,0x4a,0x99,0x0f, + 0x28,0x49,0x6c,0x56,0x2b,0xdc,0xb7,0xed, + 0x96,0xd5,0x9d,0xc1,0x7a,0xd4,0x51,0x9b, + } + }, + { 0xaa00215, { + 0x55,0xd3,0x28,0xcb,0x87,0xa9,0x32,0xe9, + 0x4e,0x85,0x4b,0x7c,0x6b,0xd5,0x7c,0xd4, + 0x1b,0x51,0x71,0x3a,0x0e,0x0b,0xdc,0x9b, + 0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef, + } + }, +}; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 232026a239a6..fe50eb5b7c4a 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -37,12 +37,13 @@ #include <asm/perf_event.h> #include <asm/processor.h> #include <asm/cmdline.h> +#include <asm/msr.h> #include <asm/setup.h> #include "internal.h" -static struct microcode_ops *microcode_ops; -bool dis_ucode_ldr = true; +static struct microcode_ops *microcode_ops; +static bool dis_ucode_ldr = false; bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); module_param(force_minrev, bool, S_IRUSR | S_IWUSR); @@ -60,11 +61,6 @@ module_param(force_minrev, bool, S_IRUSR | S_IWUSR); */ struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -struct cpu_info_ctx { - struct cpu_signature *cpu_sig; - int err; -}; - /* * Those patch levels cannot be updated to newer ones and thus should be final. */ @@ -89,6 +85,9 @@ static bool amd_check_current_patch_level(void) u32 lvl, dummy, i; u32 *levels; + if (x86_cpuid_vendor() != X86_VENDOR_AMD) + return false; + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); levels = final_levels; @@ -100,27 +99,29 @@ static bool amd_check_current_patch_level(void) return false; } -static bool __init check_loader_disabled_bsp(void) +bool __init microcode_loader_disabled(void) { - static const char *__dis_opt_str = "dis_ucode_ldr"; - const char *cmdline = boot_command_line; - const char *option = __dis_opt_str; + if (dis_ucode_ldr) + return true; /* - * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not - * completely accurate as xen pv guests don't see that CPUID bit set but - * that's good enough as they don't land on the BSP path anyway. + * Disable when: + * + * 1) The CPU does not support CPUID. + * + * 2) Bit 31 in CPUID[1]:ECX is clear + * The bit is reserved for hypervisor use. This is still not + * completely accurate as XEN PV guests don't see that CPUID bit + * set, but that's good enough as they don't land on the BSP + * path anyway. + * + * 3) Certain AMD patch levels are not allowed to be + * overwritten. */ - if (native_cpuid_ecx(1) & BIT(31)) - return true; - - if (x86_cpuid_vendor() == X86_VENDOR_AMD) { - if (amd_check_current_patch_level()) - return true; - } - - if (cmdline_find_option_bool(cmdline, option) <= 0) - dis_ucode_ldr = false; + if (!cpuid_feature() || + native_cpuid_ecx(1) & BIT(31) || + amd_check_current_patch_level()) + dis_ucode_ldr = true; return dis_ucode_ldr; } @@ -130,7 +131,10 @@ void __init load_ucode_bsp(void) unsigned int cpuid_1_eax; bool intel = true; - if (!have_cpuid_p()) + if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) + dis_ucode_ldr = true; + + if (microcode_loader_disabled()) return; cpuid_1_eax = native_cpuid_eax(1); @@ -151,9 +155,6 @@ void __init load_ucode_bsp(void) return; } - if (check_loader_disabled_bsp()) - return; - if (intel) load_ucode_intel_bsp(&early_data); else @@ -164,6 +165,11 @@ void load_ucode_ap(void) { unsigned int cpuid_1_eax; + /* + * Can't use microcode_loader_disabled() here - .init section + * hell. It doesn't have to either - the BSP variant must've + * parsed cmdline already anyway. + */ if (dis_ucode_ldr) return; @@ -691,6 +697,8 @@ static int load_late_locked(void) return load_late_stop_cpus(true); case UCODE_NFOUND: return -ENOENT; + case UCODE_OK: + return 0; default: return -EBADFD; } @@ -815,7 +823,7 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; int error; - if (dis_ucode_ldr) + if (microcode_loader_disabled()) return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h new file mode 100644 index 000000000000..cb6e601701ab --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h @@ -0,0 +1,150 @@ +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x03, .steppings = 0x0004, .driver_data = 0x2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0001, .driver_data = 0x45 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0002, .driver_data = 0x40 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0004, .driver_data = 0x2c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0008, .driver_data = 0x10 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0001, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0020, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0400, .driver_data = 0xd }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x2000, .driver_data = 0x7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0002, .driver_data = 0x14 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0004, .driver_data = 0x38 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0008, .driver_data = 0x2e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0002, .driver_data = 0x11 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0008, .driver_data = 0x8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0040, .driver_data = 0xc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0400, .driver_data = 0x5 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x09, .steppings = 0x0020, .driver_data = 0x47 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0001, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0002, .driver_data = 0x1 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0002, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0010, .driver_data = 0x2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0d, .steppings = 0x0040, .driver_data = 0x18 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x0100, .driver_data = 0x39 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x1000, .driver_data = 0x59 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0004, .driver_data = 0x5d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0040, .driver_data = 0xd2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0080, .driver_data = 0x6b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0400, .driver_data = 0x95 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0800, .driver_data = 0xbc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x2000, .driver_data = 0xa4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x16, .steppings = 0x0002, .driver_data = 0x44 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0040, .driver_data = 0x60f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0080, .driver_data = 0x70a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0400, .driver_data = 0xa0b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0010, .driver_data = 0x12 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0020, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0004, .driver_data = 0x219 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0400, .driver_data = 0x107 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1d, .steppings = 0x0002, .driver_data = 0x29 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1e, .steppings = 0x0020, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0004, .driver_data = 0x11 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0020, .driver_data = 0x7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x26, .steppings = 0x0002, .driver_data = 0x105 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2a, .steppings = 0x0080, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2c, .steppings = 0x0004, .driver_data = 0x1f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0040, .driver_data = 0x621 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0080, .driver_data = 0x71a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2e, .steppings = 0x0040, .driver_data = 0xd }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2f, .steppings = 0x0004, .driver_data = 0x3b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0100, .driver_data = 0x838 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0200, .driver_data = 0x90d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3a, .steppings = 0x0200, .driver_data = 0x21 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3c, .steppings = 0x0008, .driver_data = 0x28 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3d, .steppings = 0x0010, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0010, .driver_data = 0x42e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0040, .driver_data = 0x600 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0080, .driver_data = 0x715 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0004, .driver_data = 0x49 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0010, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x45, .steppings = 0x0002, .driver_data = 0x26 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x46, .steppings = 0x0002, .driver_data = 0x1c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x47, .steppings = 0x0002, .driver_data = 0x22 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0008, .driver_data = 0x368 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0010, .driver_data = 0x411 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4d, .steppings = 0x0100, .driver_data = 0x12d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4e, .steppings = 0x0008, .driver_data = 0xf0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0040, .driver_data = 0x4003605 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003707 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002904 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0020, .driver_data = 0xe000015 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0004, .driver_data = 0x14 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0200, .driver_data = 0x48 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0400, .driver_data = 0x28 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5e, .steppings = 0x0008, .driver_data = 0xf0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd0003e7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002b0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x24 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xc6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xb8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x38 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x52 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000603 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0xfe }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x62 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x20 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4123 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4123 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4123 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0010, .driver_data = 0x21 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0020, .driver_data = 0x2c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0040, .driver_data = 0x10 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0080, .driver_data = 0x39 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0200, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0004, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0008, .driver_data = 0xc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0010, .driver_data = 0x17 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0002, .driver_data = 0x17 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0008, .driver_data = 0x5 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0010, .driver_data = 0x6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0080, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0100, .driver_data = 0xe }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0200, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0400, .driver_data = 0x4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0004, .driver_data = 0xf }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0010, .driver_data = 0x4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0020, .driver_data = 0x8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0100, .driver_data = 0x9 }, diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 5f0414452b67..371ca6eac00e 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -21,7 +21,7 @@ #include <linux/uio.h> #include <linux/mm.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/processor.h> #include <asm/tlbflush.h> #include <asm/setup.h> @@ -74,7 +74,7 @@ void intel_collect_cpu_info(struct cpu_signature *sig) sig->pf = 0; sig->rev = intel_get_microcode_revision(); - if (x86_model(sig->sig) >= 5 || x86_family(sig->sig) > 6) { + if (IFM(x86_family(sig->sig), x86_model(sig->sig)) >= INTEL_PENTIUM_III_DESCHUTES) { unsigned int val[2]; /* get processor flags from MSR 0x17 */ @@ -319,14 +319,8 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, return UCODE_OK; } - /* - * Writeback and invalidate caches before updating microcode to avoid - * internal issues depending on what the microcode is updating. - */ - native_wbinvd(); - /* write microcode via MSR 0x79 */ - native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + native_wrmsrq(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); rev = intel_get_microcode_revision(); if (rev != mc->hdr.rev) @@ -395,7 +389,7 @@ static int __init save_builtin_microcode(void) if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED) return 0; - if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; uci.mc = get_microcode_blob(&uci, true); @@ -574,15 +568,14 @@ static bool is_blacklisted(unsigned int cpu) /* * Late loading on model 79 with microcode revision less than 0x0b000021 * and LLC size per core bigger than 2.5MB may result in a system hang. - * This behavior is documented in item BDF90, #334165 (Intel Xeon + * This behavior is documented in item BDX90, #334165 (Intel Xeon * Processor E7-8800/4800 v4 Product Family). */ - if (c->x86 == 6 && - c->x86_model == INTEL_FAM6_BROADWELL_X && + if (c->x86_vfm == INTEL_BROADWELL_X && c->x86_stepping == 0x01 && llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { - pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); + pr_err_once("Erratum BDX90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); return true; } diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 21776c529fa9..50a9702ae4e2 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -94,20 +94,17 @@ static inline unsigned int x86_cpuid_family(void) return x86_family(eax); } -extern bool dis_ucode_ldr; extern bool force_minrev; #ifdef CONFIG_CPU_SUP_AMD void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family); void load_ucode_amd_ap(unsigned int family); -int save_microcode_in_initrd_amd(unsigned int family); void reload_ucode_amd(unsigned int cpu); struct microcode_ops *init_amd_microcode(void); void exit_amd_microcode(void); #else /* CONFIG_CPU_SUP_AMD */ static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { } static inline void load_ucode_amd_ap(unsigned int family) { } -static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } static inline void reload_ucode_amd(unsigned int cpu) { } static inline struct microcode_ops *init_amd_microcode(void) { return NULL; } static inline void exit_amd_microcode(void) { } diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 1db560ed2ca3..68f537347466 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -30,8 +30,7 @@ dump_array() # If the /* comment */ starts with a quote string, grab that. VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')" - [ -z "$VALUE" ] && VALUE="\"$NAME\"" - [ "$VALUE" = '""' ] && continue + [ ! "$VALUE" ] && continue # Name is uppercase, VALUE is all lowercase VALUE="$(echo "$VALUE" | tr A-Z a-z)" diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index e0fd57a8ba84..c78f860419d6 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -16,11 +16,10 @@ #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/kexec.h> -#include <linux/i8253.h> #include <linux/random.h> #include <asm/processor.h> #include <asm/hypervisor.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> #include <asm/desc.h> #include <asm/idtentry.h> @@ -31,11 +30,10 @@ #include <asm/reboot.h> #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> +#include <asm/msr.h> #include <asm/numa.h> #include <asm/svm.h> -/* Is Linux running as the root partition? */ -bool hv_root_partition; /* Is Linux running on nested Microsoft Hypervisor */ bool hv_nested; struct ms_hyperv_info ms_hyperv; @@ -73,7 +71,7 @@ u64 hv_get_non_nested_msr(unsigned int reg) if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) hv_ivm_msr_read(reg, &value); else - rdmsrl(reg, value); + rdmsrq(reg, value); return value; } EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); @@ -85,9 +83,9 @@ void hv_set_non_nested_msr(unsigned int reg, u64 value) /* Write proxy bit via wrmsl instruction */ if (hv_is_sint_msr(reg)) - wrmsrl(reg, value | 1 << 20); + wrmsrq(reg, value | 1 << 20); } else { - wrmsrl(reg, value); + wrmsrq(reg, value); } } EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); @@ -110,6 +108,7 @@ void hv_set_msr(unsigned int reg, u64 value) } EXPORT_SYMBOL_GPL(hv_set_msr); +static void (*mshv_handler)(void); static void (*vmbus_handler)(void); static void (*hv_stimer0_handler)(void); static void (*hv_kexec_handler)(void); @@ -120,6 +119,9 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) struct pt_regs *old_regs = set_irq_regs(regs); inc_irq_stat(irq_hv_callback_count); + if (mshv_handler) + mshv_handler(); + if (vmbus_handler) vmbus_handler(); @@ -129,6 +131,11 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback) set_irq_regs(old_regs); } +void hv_setup_mshv_handler(void (*handler)(void)) +{ + mshv_handler = handler; +} + void hv_setup_vmbus_handler(void (*handler)(void)) { vmbus_handler = handler; @@ -199,8 +206,8 @@ static void hv_machine_shutdown(void) * Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor * corrupts the old VP Assist Pages and can crash the kexec kernel. */ - if (kexec_in_progress && hyperv_init_cpuhp > 0) - cpuhp_remove_state(hyperv_init_cpuhp); + if (kexec_in_progress) + cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE); /* The function calls stop_other_cpus(). */ native_machine_shutdown(); @@ -224,6 +231,63 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs) hyperv_cleanup(); } #endif /* CONFIG_CRASH_DUMP */ + +static u64 hv_ref_counter_at_suspend; +static void (*old_save_sched_clock_state)(void); +static void (*old_restore_sched_clock_state)(void); + +/* + * Hyper-V clock counter resets during hibernation. Save and restore clock + * offset during suspend/resume, while also considering the time passed + * before suspend. This is to make sure that sched_clock using hv tsc page + * based clocksource, proceeds from where it left off during suspend and + * it shows correct time for the timestamps of kernel messages after resume. + */ +static void save_hv_clock_tsc_state(void) +{ + hv_ref_counter_at_suspend = hv_read_reference_counter(); +} + +static void restore_hv_clock_tsc_state(void) +{ + /* + * Adjust the offsets used by hv tsc clocksource to + * account for the time spent before hibernation. + * adjusted value = reference counter (time) at suspend + * - reference counter (time) now. + */ + hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter()); +} + +/* + * Functions to override save_sched_clock_state and restore_sched_clock_state + * functions of x86_platform. The Hyper-V clock counter is reset during + * suspend-resume and the offset used to measure time needs to be + * corrected, post resume. + */ +static void hv_save_sched_clock_state(void) +{ + old_save_sched_clock_state(); + save_hv_clock_tsc_state(); +} + +static void hv_restore_sched_clock_state(void) +{ + restore_hv_clock_tsc_state(); + old_restore_sched_clock_state(); +} + +static void __init x86_setup_ops_for_tsc_pg_clock(void) +{ + if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) + return; + + old_save_sched_clock_state = x86_platform.save_sched_clock_state; + x86_platform.save_sched_clock_state = hv_save_sched_clock_state; + + old_restore_sched_clock_state = x86_platform.restore_sched_clock_state; + x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state; +} #endif /* CONFIG_HYPERV */ static uint32_t __init ms_hyperv_platform(void) @@ -282,7 +346,7 @@ static unsigned long hv_get_tsc_khz(void) { unsigned long freq; - rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq); return freq / 1000; } @@ -366,6 +430,7 @@ int hv_get_hypervisor_version(union hv_hypervisor_version_info *info) return 0; } +EXPORT_SYMBOL_GPL(hv_get_hypervisor_version); static void __init ms_hyperv_init_platform(void) { @@ -380,13 +445,15 @@ static void __init ms_hyperv_init_platform(void) */ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES); + ms_hyperv.ext_features = cpuid_ecx(HYPERV_CPUID_FEATURES); ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); hv_max_functions_eax = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS); - pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n", - ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints, + pr_info("Hyper-V: privilege flags low %#x, high %#x, ext %#x, hints %#x, misc %#x\n", + ms_hyperv.features, ms_hyperv.priv_high, + ms_hyperv.ext_features, ms_hyperv.hints, ms_hyperv.misc_features); ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS); @@ -395,25 +462,7 @@ static void __init ms_hyperv_init_platform(void) pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); - /* - * Check CPU management privilege. - * - * To mirror what Windows does we should extract CPU management - * features and use the ReservedIdentityBit to detect if Linux is the - * root partition. But that requires negotiating CPU management - * interface (a process to be finalized). For now, use the privilege - * flag as the indicator for running as root. - * - * Hyper-V should never specify running as root and as a Confidential - * VM. But to protect against a compromised/malicious Hyper-V trying - * to exploit root behavior to expose Confidential VM memory, ignore - * the root partition setting if also a Confidential VM. - */ - if ((ms_hyperv.priv_high & HV_CPU_MANAGEMENT) && - !(ms_hyperv.priv_high & HV_ISOLATION)) { - hv_root_partition = true; - pr_info("Hyper-V: running as root partition\n"); - } + hv_identify_partition_type(); if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) { hv_nested = true; @@ -424,6 +473,7 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { x86_platform.calibrate_tsc = hv_get_tsc_khz; x86_platform.calibrate_cpu = hv_get_tsc_khz; + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); } if (ms_hyperv.priv_high & HV_ISOLATION) { @@ -449,9 +499,23 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED; if (!ms_hyperv.paravisor_present) { - /* To be supported: more work is required. */ + /* + * Mark the Hyper-V TSC page feature as disabled + * in a TDX VM without paravisor so that the + * Invariant TSC, which is a better clocksource + * anyway, is used instead. + */ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE; + /* + * The Invariant TSC is expected to be available + * in a TDX VM without paravisor, but if not, + * print a warning message. The slower Hyper-V MSR-based + * Ref Counter should end up being the clocksource. + */ + if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT)) + pr_warn("Hyper-V: Invariant TSC is unavailable\n"); + /* HV_MSR_CRASH_CTL is unsupported. */ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; @@ -478,7 +542,7 @@ static void __init ms_hyperv_init_platform(void) */ u64 hv_lapic_frequency; - rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); + rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_period = hv_lapic_frequency; pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", @@ -511,7 +575,7 @@ static void __init ms_hyperv_init_platform(void) * setting of this MSR bit should happen before init_intel() * is called. */ - wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); + wrmsrq(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } @@ -522,16 +586,6 @@ static void __init ms_hyperv_init_platform(void) if (efi_enabled(EFI_BOOT)) x86_platform.get_nmi_reason = hv_get_nmi_reason; - /* - * Hyper-V VMs have a PIT emulation quirk such that zeroing the - * counter register during PIT shutdown restarts the PIT. So it - * continues to interrupt @18.2 HZ. Setting i8253_clear_counter - * to false tells pit_shutdown() not to zero the counter so that - * the PIT really is shutdown. Generation 2 VMs don't have a PIT, - * and setting this value has no effect. - */ - i8253_clear_counter_on_shutdown = false; - #if IS_ENABLED(CONFIG_HYPERV) if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || ms_hyperv.paravisor_present) @@ -557,7 +611,7 @@ static void __init ms_hyperv_init_platform(void) # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; - if (hv_root_partition || + if (hv_root_partition() || (!ms_hyperv.paravisor_present && hv_isolation_type_snp())) smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; # endif @@ -575,6 +629,7 @@ static void __init ms_hyperv_init_platform(void) /* Register Hyper-V specific clocksource */ hv_init_clocksource(); + x86_setup_ops_for_tsc_pg_clock(); hv_vtl_init_platform(); #endif /* diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 422a4ddc2ab7..8c18327eb10b 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -9,9 +9,11 @@ #include <linux/io.h> #include <linux/mm.h> #include <linux/cc_platform.h> +#include <linux/string_choices.h> #include <asm/processor-flags.h> #include <asm/cacheinfo.h> #include <asm/cpufeature.h> +#include <asm/cpu_device_id.h> #include <asm/hypervisor.h> #include <asm/mshyperv.h> #include <asm/tlbflush.h> @@ -108,7 +110,7 @@ static inline void k8_check_syscfg_dram_mod_en(void) (boot_cpu_data.x86 >= 0x0f))) return; - if (cpu_feature_enabled(X86_FEATURE_SEV_SNP)) + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) return; rdmsr(MSR_AMD64_SYSCFG, lo, hi); @@ -423,7 +425,7 @@ void __init mtrr_copy_map(void) } /** - * mtrr_overwrite_state - set static MTRR state + * guest_force_mtrr_state - set static MTRR state for a guest * * Used to set MTRR state via different means (e.g. with data obtained from * a hypervisor). @@ -436,8 +438,8 @@ void __init mtrr_copy_map(void) * @num_var: length of the @var array * @def_type: default caching type */ -void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var, - mtrr_type def_type) +void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var, + mtrr_type def_type) { unsigned int i; @@ -591,7 +593,7 @@ static void get_fixed_ranges(mtrr_type *frs) void mtrr_save_fixed_ranges(void *info) { - if (boot_cpu_has(X86_FEATURE_MTRR)) + if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); } @@ -646,10 +648,10 @@ static void __init print_mtrr_state(void) pr_info("MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type)); if (mtrr_state.have_fixed) { - pr_info("MTRR fixed ranges %sabled:\n", - ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && - (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ? - "en" : "dis"); + pr_info("MTRR fixed ranges %s:\n", + str_enabled_disabled( + (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) && + (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED))); print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); for (i = 0; i < 2; ++i) print_fixed(0x80000 + i * 0x20000, 0x04000, @@ -661,8 +663,8 @@ static void __init print_mtrr_state(void) /* tail */ print_fixed_last(); } - pr_info("MTRR variable ranges %sabled:\n", - mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis"); + pr_info("MTRR variable ranges %s:\n", + str_enabled_disabled(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED)); high_width = (boot_cpu_data.x86_phys_bits - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { @@ -1025,8 +1027,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, * For Intel PPro stepping <= 7 * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF */ - if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == 1 && + if (mtrr_if == &generic_mtrr_ops && boot_cpu_data.x86_vfm == INTEL_PENTIUM_PRO && boot_cpu_data.x86_stepping <= 7) { if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { pr_warn("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index a5c506f6da7f..4049235b1bfe 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -99,7 +99,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) char *ptr; char line[LINE_SIZE]; int length; - size_t linelen; memset(line, 0, LINE_SIZE); @@ -108,9 +107,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) if (length < 0) return length; - linelen = strlen(line); - ptr = line + linelen - 1; - if (linelen && *ptr == '\n') + ptr = line + length - 1; + if (length && *ptr == '\n') *ptr = '\0'; if (!strncmp(line, "disable=", 8)) { diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c index 767bf1c71aad..ecbda0341a8a 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.c +++ b/arch/x86/kernel/cpu/mtrr/mtrr.c @@ -55,6 +55,12 @@ #include "mtrr.h" +static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE); +static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB); +static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH); +static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT); +static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK); + /* arch_phys_wc_add returns an MTRR register index plus this offset. */ #define MTRR_TO_PHYS_WC_OFFSET 1000 @@ -609,7 +615,7 @@ void mtrr_save_state(void) { int first_cpu; - if (!mtrr_enabled()) + if (!mtrr_enabled() || !mtrr_state.have_fixed) return; first_cpu = cpumask_first(cpu_online_mask); @@ -619,7 +625,7 @@ void mtrr_save_state(void) static int __init mtrr_init_finalize(void) { /* - * Map might exist if mtrr_overwrite_state() has been called or if + * Map might exist if guest_force_mtrr_state() has been called or if * mtrr_enabled() returns true. */ mtrr_copy_map(); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index e65fae63660e..6571d432cbe3 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -41,11 +41,11 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", - boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", - boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", - boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", - boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + str_yes_no(boot_cpu_has_bug(X86_BUG_FDIV)), + str_yes_no(boot_cpu_has_bug(X86_BUG_F00F)), + str_yes_no(boot_cpu_has_bug(X86_BUG_COMA)), + str_yes_no(boot_cpu_has(X86_FEATURE_FPU)), + str_yes_no(boot_cpu_has(X86_FEATURE_FPU)), c->cpuid_level); } #else @@ -86,9 +86,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { - unsigned int freq = arch_freq_get_on_cpu(cpu); + int freq = arch_freq_get_on_cpu(cpu); - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); + if (freq < 0) + seq_puts(m, "cpu MHz\t\t: Unknown\n"); + else + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } /* Cache size */ diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile index 4a06c37b9cf1..d8a04b195da2 100644 --- a/arch/x86/kernel/cpu/resctrl/Makefile +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o -obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o +obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o +obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o +obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o + +# To allow define_trace.h's recursive include: CFLAGS_pseudo_lock.o = -I$(src) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 83e40341583e..187d527ef73b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -19,10 +19,10 @@ #include <linux/cpu.h> #include <linux/slab.h> #include <linux/err.h> -#include <linux/cacheinfo.h> #include <linux/cpuhotplug.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> +#include <asm/msr.h> #include <asm/resctrl.h> #include "internal.h" @@ -45,39 +45,28 @@ static DEFINE_MUTEX(domain_list_lock); DEFINE_PER_CPU(struct resctrl_pqr_state, pqr_state); /* - * Used to store the max resource name width and max resource data width - * to display the schemata in a tabular format - */ -int max_name_width, max_data_width; - -/* * Global boolean for rdt_alloc which is true if any * resource allocation is enabled. */ bool rdt_alloc_capable; -static void -mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); -static void -cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); -static void -mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); +static void mba_wrmsr_intel(struct msr_param *m); +static void cat_wrmsr(struct msr_param *m); +static void mba_wrmsr_amd(struct msr_param *m); -#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains) +#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains) +#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains) -struct rdt_hw_resource rdt_resources_all[] = { +struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_L3] = { .r_resctrl = { - .rid = RDT_RESOURCE_L3, .name = "L3", - .cache_level = 3, - .domains = domain_init(RDT_RESOURCE_L3), - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, + .ctrl_scope = RESCTRL_L3_CACHE, + .mon_scope = RESCTRL_L3_CACHE, + .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3), + .mon_domains = mon_domain_init(RDT_RESOURCE_L3), + .schema_fmt = RESCTRL_SCHEMA_BITMAP, }, .msr_base = MSR_IA32_L3_CBM_BASE, .msr_update = cat_wrmsr, @@ -85,13 +74,10 @@ struct rdt_hw_resource rdt_resources_all[] = { [RDT_RESOURCE_L2] = { .r_resctrl = { - .rid = RDT_RESOURCE_L2, .name = "L2", - .cache_level = 2, - .domains = domain_init(RDT_RESOURCE_L2), - .parse_ctrlval = parse_cbm, - .format_str = "%d=%0*x", - .fflags = RFTYPE_RES_CACHE, + .ctrl_scope = RESCTRL_L2_CACHE, + .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2), + .schema_fmt = RESCTRL_SCHEMA_BITMAP, }, .msr_base = MSR_IA32_L2_CBM_BASE, .msr_update = cat_wrmsr, @@ -99,29 +85,39 @@ struct rdt_hw_resource rdt_resources_all[] = { [RDT_RESOURCE_MBA] = { .r_resctrl = { - .rid = RDT_RESOURCE_MBA, .name = "MB", - .cache_level = 3, - .domains = domain_init(RDT_RESOURCE_MBA), - .parse_ctrlval = parse_bw, - .format_str = "%d=%*u", - .fflags = RFTYPE_RES_MB, + .ctrl_scope = RESCTRL_L3_CACHE, + .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), + .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, [RDT_RESOURCE_SMBA] = { .r_resctrl = { - .rid = RDT_RESOURCE_SMBA, .name = "SMBA", - .cache_level = 3, - .domains = domain_init(RDT_RESOURCE_SMBA), - .parse_ctrlval = parse_bw, - .format_str = "%d=%*u", - .fflags = RFTYPE_RES_MB, + .ctrl_scope = RESCTRL_L3_CACHE, + .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), + .schema_fmt = RESCTRL_SCHEMA_RANGE, }, }, }; +u32 resctrl_arch_system_num_rmid_idx(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; + + /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */ + return r->num_rmid; +} + +struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) +{ + if (l >= RDT_NUM_RESOURCES) + return NULL; + + return &rdt_resources_all[l].r_resctrl; +} + /* * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs * as they do not have CPUID enumeration support for Cache allocation. @@ -146,17 +142,16 @@ static inline void cache_alloc_hsw_probe(void) struct rdt_resource *r = &hw_res->r_resctrl; u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0; - if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) + if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) return; - rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0); + rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0); /* If all the bits were set in MSR, return success */ if (l3_cbm_0 != max_cbm) return; hw_res->num_closid = 4; - r->default_ctrl = max_cbm; r->cache.cbm_len = 20; r->cache.shareable_bits = 0xc0000; r->cache.min_cbm_bits = 2; @@ -166,21 +161,6 @@ static inline void cache_alloc_hsw_probe(void) rdt_alloc_capable = true; } -bool is_mba_sc(struct rdt_resource *r) -{ - if (!r) - return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.mba_sc; - - /* - * The software controller support is only applicable to MBA resource. - * Make sure to check for resource type. - */ - if (r->rid != RDT_RESOURCE_MBA) - return false; - - return r->membw.mba_sc; -} - /* * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values * exposed to user interface and the h/w understandable delay values. @@ -202,7 +182,7 @@ static inline bool rdt_get_mb_table(struct rdt_resource *r) return false; } -static bool __get_mem_config_intel(struct rdt_resource *r) +static __init bool __get_mem_config_intel(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); union cpuid_0x10_3_eax eax; @@ -212,7 +192,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r) cpuid_count(0x00000010, 3, &eax.full, &ebx, &ecx, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; max_delay = eax.split.max_delay + 1; - r->default_ctrl = MAX_MBA_BW; + r->membw.max_bw = MAX_MBA_BW; r->membw.arch_needs_linear = true; if (ecx & MBA_IS_LINEAR) { r->membw.delay_linear = true; @@ -223,20 +203,18 @@ static bool __get_mem_config_intel(struct rdt_resource *r) return false; r->membw.arch_needs_linear = false; } - r->data_width = 3; if (boot_cpu_has(X86_FEATURE_PER_THREAD_MBA)) r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD; else r->membw.throttle_mode = THREAD_THROTTLE_MAX; - thread_throttle_mode_init(); r->alloc_capable = true; return true; } -static bool __rdt_get_mem_config_amd(struct rdt_resource *r) +static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); u32 eax, ebx, ecx, edx, subleaf; @@ -249,7 +227,7 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx); hw_res->num_closid = edx + 1; - r->default_ctrl = 1 << eax; + r->membw.max_bw = 1 << eax; /* AMD does not use delay */ r->membw.delay_linear = false; @@ -262,8 +240,6 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r) r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED; r->membw.min_bw = 0; r->membw.bw_gran = 1; - /* Max value is 2048, Data width should be 4 in decimal */ - r->data_width = 4; r->alloc_capable = true; @@ -276,14 +252,13 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) union cpuid_0x10_1_eax eax; union cpuid_0x10_x_ecx ecx; union cpuid_0x10_x_edx edx; - u32 ebx; + u32 ebx, default_ctrl; cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx.full, &edx.full); hw_res->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; - r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; - r->cache.shareable_bits = ebx & r->default_ctrl; - r->data_width = (r->cache.cbm_len + 3) / 4; + default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & default_ctrl; if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) r->cache.arch_has_sparse_bitmasks = ecx.split.noncont; r->alloc_capable = true; @@ -309,15 +284,14 @@ static void rdt_get_cdp_l2_config(void) rdt_get_cdp_config(RDT_RESOURCE_L2); } -static void -mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +static void mba_wrmsr_amd(struct msr_param *m) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); unsigned int i; - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); + wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } /* @@ -331,44 +305,28 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) return MAX_MBA_BW - bw; pr_warn_once("Non Linear delay-bw map not supported but queried\n"); - return r->default_ctrl; + return MAX_MBA_BW; } -static void -mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r) +static void mba_wrmsr_intel(struct msr_param *m) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); unsigned int i; - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); /* Write the delay values for mba. */ for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r)); + wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res)); } -static void -cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r) +static void cat_wrmsr(struct msr_param *m) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom); + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); unsigned int i; - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); -} - -struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) -{ - struct rdt_domain *d; - - list_for_each_entry(d, &r->domains, list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) - return d; - } - - return NULL; + wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } u32 resctrl_arch_get_num_closid(struct rdt_resource *r) @@ -378,52 +336,11 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r) void rdt_ctrl_update(void *arg) { + struct rdt_hw_resource *hw_res; struct msr_param *m = arg; - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res); - struct rdt_resource *r = m->res; - int cpu = smp_processor_id(); - struct rdt_domain *d; - d = get_domain_from_cpu(cpu, r); - if (d) { - hw_res->msr_update(d, m, r); - return; - } - pr_warn_once("cpu %d not found in any domain for resource %s\n", - cpu, r->name); -} - -/* - * rdt_find_domain - Find a domain in a resource that matches input resource id - * - * Search resource r's domain list to find the resource id. If the resource - * id is found in a domain, return the domain. Otherwise, if requested by - * caller, return the first domain whose id is bigger than the input id. - * The domain list is sorted by id in ascending order. - */ -struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos) -{ - struct rdt_domain *d; - struct list_head *l; - - if (id < 0) - return ERR_PTR(-ENODEV); - - list_for_each(l, &r->domains) { - d = list_entry(l, struct rdt_domain, list); - /* When id is found, return its domain. */ - if (id == d->id) - return d; - /* Stop searching when finding id's position in sorted list. */ - if (id < d->id) - break; - } - - if (pos) - *pos = l; - - return NULL; + hw_res = resctrl_to_arch_res(m->res); + hw_res->msr_update(m); } static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) @@ -437,21 +354,26 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc) * For Memory Allocation: Set b/w requested to 100% */ for (i = 0; i < hw_res->num_closid; i++, dc++) - *dc = r->default_ctrl; + *dc = resctrl_get_default_ctrl(r); } -static void domain_free(struct rdt_hw_domain *hw_dom) +static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) +{ + kfree(hw_dom->ctrl_val); + kfree(hw_dom); +} + +static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) { kfree(hw_dom->arch_mbm_total); kfree(hw_dom->arch_mbm_local); - kfree(hw_dom->ctrl_val); kfree(hw_dom); } -static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) +static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct msr_param m; u32 *dc; @@ -463,9 +385,11 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) hw_dom->ctrl_val = dc; setup_default_ctrlval(r, dc); + m.res = r; + m.dom = d; m.low = 0; m.high = hw_res->num_closid; - hw_res->msr_update(d, &m, r); + hw_res->msr_update(&m); return 0; } @@ -474,17 +398,17 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) * @num_rmid: The size of the MBM counter array * @hw_dom: The domain that owns the allocated arrays */ -static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom) +static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { size_t tsize; - if (is_mbm_total_enabled()) { + if (resctrl_arch_is_mbm_total_enabled()) { tsize = sizeof(*hw_dom->arch_mbm_total); hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); if (!hw_dom->arch_mbm_total) return -ENOMEM; } - if (is_mbm_local_enabled()) { + if (resctrl_arch_is_mbm_local_enabled()) { tsize = sizeof(*hw_dom->arch_mbm_local); hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); if (!hw_dom->arch_mbm_local) { @@ -497,37 +421,45 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom) return 0; } -/* - * domain_add_cpu - Add a cpu to a resource's domain list. - * - * If an existing domain in the resource r's domain list matches the cpu's - * resource id, add the cpu in the domain. - * - * Otherwise, a new domain is allocated and inserted into the right position - * in the domain list sorted by id in ascending order. - * - * The order in the domain list is visible to users when we print entries - * in the schemata file and schemata input is validated to have the same order - * as this list. - */ -static void domain_add_cpu(int cpu, struct rdt_resource *r) +static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) { - int id = get_cpu_cacheinfo_id(cpu, r->cache_level); + switch (scope) { + case RESCTRL_L2_CACHE: + case RESCTRL_L3_CACHE: + return get_cpu_cacheinfo_id(cpu, scope); + case RESCTRL_L3_NODE: + return cpu_to_node(cpu); + default: + break; + } + + return -EINVAL; +} + +static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->ctrl_scope); + struct rdt_hw_ctrl_domain *hw_dom; struct list_head *add_pos = NULL; - struct rdt_hw_domain *hw_dom; - struct rdt_domain *d; + struct rdt_domain_hdr *hdr; + struct rdt_ctrl_domain *d; int err; lockdep_assert_held(&domain_list_lock); - d = rdt_find_domain(r, id, &add_pos); - if (IS_ERR(d)) { - pr_warn("Couldn't find cache id for CPU %d\n", cpu); + if (id < 0) { + pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->ctrl_scope, r->name); return; } - if (d) { - cpumask_set_cpu(cpu, &d->cpu_mask); + hdr = resctrl_find_domain(&r->ctrl_domains, id, &add_pos); + if (hdr) { + if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + return; + d = container_of(hdr, struct rdt_ctrl_domain, hdr); + + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); if (r->cache.arch_has_per_cpu_cfg) rdt_domain_reconfigure_cdp(r); return; @@ -538,64 +470,189 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) return; d = &hw_dom->d_resctrl; - d->id = id; - cpumask_set_cpu(cpu, &d->cpu_mask); + d->hdr.id = id; + d->hdr.type = RESCTRL_CTRL_DOMAIN; + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); rdt_domain_reconfigure_cdp(r); - if (r->alloc_capable && domain_setup_ctrlval(r, d)) { - domain_free(hw_dom); + if (domain_setup_ctrlval(r, d)) { + ctrl_domain_free(hw_dom); return; } - if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { - domain_free(hw_dom); + list_add_tail_rcu(&d->hdr.list, add_pos); + + err = resctrl_online_ctrl_domain(r, d); + if (err) { + list_del_rcu(&d->hdr.list); + synchronize_rcu(); + ctrl_domain_free(hw_dom); + } +} + +static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->mon_scope); + struct list_head *add_pos = NULL; + struct rdt_hw_mon_domain *hw_dom; + struct rdt_domain_hdr *hdr; + struct rdt_mon_domain *d; + struct cacheinfo *ci; + int err; + + lockdep_assert_held(&domain_list_lock); + + if (id < 0) { + pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->mon_scope, r->name); + return; + } + + hdr = resctrl_find_domain(&r->mon_domains, id, &add_pos); + if (hdr) { + if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + return; + d = container_of(hdr, struct rdt_mon_domain, hdr); + + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); return; } - list_add_tail_rcu(&d->list, add_pos); + hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu)); + if (!hw_dom) + return; - err = resctrl_online_domain(r, d); + d = &hw_dom->d_resctrl; + d->hdr.id = id; + d->hdr.type = RESCTRL_MON_DOMAIN; + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci) { + pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); + mon_domain_free(hw_dom); + return; + } + d->ci_id = ci->id; + cpumask_set_cpu(cpu, &d->hdr.cpu_mask); + + arch_mon_domain_online(r, d); + + if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + mon_domain_free(hw_dom); + return; + } + + list_add_tail_rcu(&d->hdr.list, add_pos); + + err = resctrl_online_mon_domain(r, d); if (err) { - list_del_rcu(&d->list); + list_del_rcu(&d->hdr.list); synchronize_rcu(); - domain_free(hw_dom); + mon_domain_free(hw_dom); } } -static void domain_remove_cpu(int cpu, struct rdt_resource *r) +static void domain_add_cpu(int cpu, struct rdt_resource *r) { - int id = get_cpu_cacheinfo_id(cpu, r->cache_level); - struct rdt_hw_domain *hw_dom; - struct rdt_domain *d; + if (r->alloc_capable) + domain_add_cpu_ctrl(cpu, r); + if (r->mon_capable) + domain_add_cpu_mon(cpu, r); +} + +static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->ctrl_scope); + struct rdt_hw_ctrl_domain *hw_dom; + struct rdt_domain_hdr *hdr; + struct rdt_ctrl_domain *d; lockdep_assert_held(&domain_list_lock); - d = rdt_find_domain(r, id, NULL); - if (IS_ERR_OR_NULL(d)) { - pr_warn("Couldn't find cache id for CPU %d\n", cpu); + if (id < 0) { + pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->ctrl_scope, r->name); return; } - hw_dom = resctrl_to_arch_dom(d); - cpumask_clear_cpu(cpu, &d->cpu_mask); - if (cpumask_empty(&d->cpu_mask)) { - resctrl_offline_domain(r, d); - list_del_rcu(&d->list); + hdr = resctrl_find_domain(&r->ctrl_domains, id, NULL); + if (!hdr) { + pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n", + id, cpu, r->name); + return; + } + + if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN)) + return; + + d = container_of(hdr, struct rdt_ctrl_domain, hdr); + hw_dom = resctrl_to_arch_ctrl_dom(d); + + cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); + if (cpumask_empty(&d->hdr.cpu_mask)) { + resctrl_offline_ctrl_domain(r, d); + list_del_rcu(&d->hdr.list); synchronize_rcu(); /* - * rdt_domain "d" is going to be freed below, so clear + * rdt_ctrl_domain "d" is going to be freed below, so clear * its pointer from pseudo_lock_region struct. */ if (d->plr) d->plr->d = NULL; - domain_free(hw_dom); + ctrl_domain_free(hw_dom); + + return; + } +} + +static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r) +{ + int id = get_domain_id_from_scope(cpu, r->mon_scope); + struct rdt_hw_mon_domain *hw_dom; + struct rdt_domain_hdr *hdr; + struct rdt_mon_domain *d; + + lockdep_assert_held(&domain_list_lock); + + if (id < 0) { + pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n", + cpu, r->mon_scope, r->name); + return; + } + + hdr = resctrl_find_domain(&r->mon_domains, id, NULL); + if (!hdr) { + pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n", + id, cpu, r->name); + return; + } + + if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) + return; + + d = container_of(hdr, struct rdt_mon_domain, hdr); + hw_dom = resctrl_to_arch_mon_dom(d); + + cpumask_clear_cpu(cpu, &d->hdr.cpu_mask); + if (cpumask_empty(&d->hdr.cpu_mask)) { + resctrl_offline_mon_domain(r, d); + list_del_rcu(&d->hdr.list); + synchronize_rcu(); + mon_domain_free(hw_dom); return; } } +static void domain_remove_cpu(int cpu, struct rdt_resource *r) +{ + if (r->alloc_capable) + domain_remove_cpu_ctrl(cpu, r); + if (r->mon_capable) + domain_remove_cpu_mon(cpu, r); +} + static void clear_closid_rmid(int cpu) { struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); @@ -639,20 +696,6 @@ static int resctrl_arch_offline_cpu(unsigned int cpu) return 0; } -/* - * Choose a width for the resource name and resource data based on the - * resource that has widest name and cbm. - */ -static __init void rdt_init_padding(void) -{ - struct rdt_resource *r; - - for_each_alloc_capable_rdt_resource(r) { - if (r->data_width > max_data_width) - max_data_width = r->data_width; - } -} - enum { RDT_FLAG_CMT, RDT_FLAG_MBM_TOTAL, @@ -678,7 +721,7 @@ struct rdt_options { bool force_off, force_on; }; -static struct rdt_options rdt_options[] __initdata = { +static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), @@ -718,7 +761,7 @@ static int __init set_rdt_options(char *str) } __setup("rdt", set_rdt_options); -bool __init rdt_cpu_has(int flag) +bool rdt_cpu_has(int flag) { bool ret = boot_cpu_has(flag); struct rdt_options *o; @@ -738,6 +781,21 @@ bool __init rdt_cpu_has(int flag) return ret; } +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +{ + if (!rdt_cpu_has(X86_FEATURE_BMEC)) + return false; + + switch (evt) { + case QOS_L3_MBM_TOTAL_EVENT_ID: + return rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL); + case QOS_L3_MBM_LOCAL_EVENT_ID: + return rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL); + default: + return false; + } +} + static __init bool get_mem_config(void) { struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_MBA]; @@ -821,18 +879,18 @@ static __init bool get_rdt_mon_resources(void) static __init void __check_quirks_intel(void) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_HASWELL_X: + switch (boot_cpu_data.x86_vfm) { + case INTEL_HASWELL_X: if (!rdt_options[RDT_FLAG_L3_CAT].force_off) cache_alloc_hsw_probe(); break; - case INTEL_FAM6_SKYLAKE_X: + case INTEL_SKYLAKE_X: if (boot_cpu_data.x86_stepping <= 4) set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); else set_rdt_options("!l3cat"); fallthrough; - case INTEL_FAM6_BROADWELL_X: + case INTEL_BROADWELL_X: intel_rdt_mbm_apply_quirk(); break; } @@ -934,10 +992,14 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) } } -static int __init resctrl_late_init(void) +static int __init resctrl_arch_late_init(void) { struct rdt_resource *r; - int state, ret; + int state, ret, i; + + /* for_each_rdt_resource() requires all rid to be initialised. */ + for (i = 0; i < RDT_NUM_RESOURCES; i++) + rdt_resources_all[i].r_resctrl.rid = i; /* * Initialize functions(or definitions) that are different @@ -950,8 +1012,6 @@ static int __init resctrl_late_init(void) if (!get_rdt_resources()) return -ENODEV; - rdt_init_padding(); - state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/resctrl/cat:online:", resctrl_arch_online_cpu, @@ -959,7 +1019,7 @@ static int __init resctrl_late_init(void) if (state < 0) return state; - ret = rdtgroup_init(); + ret = resctrl_init(); if (ret) { cpuhp_remove_state(state); return ret; @@ -975,18 +1035,13 @@ static int __init resctrl_late_init(void) return 0; } -late_initcall(resctrl_late_init); +late_initcall(resctrl_arch_late_init); -static void __exit resctrl_exit(void) +static void __exit resctrl_arch_exit(void) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - cpuhp_remove_state(rdt_online); - rdtgroup_exit(); - - if (r->mon_capable) - rdt_put_mon_l3_config(); + resctrl_exit(); } -__exitcall(resctrl_exit); +__exitcall(resctrl_arch_exit); diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 7997b47743a2..1189c0df4ad7 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -16,295 +16,27 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpu.h> -#include <linux/kernfs.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/tick.h> #include "internal.h" -/* - * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and max bandwidth values specified by the - * hardware. The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. - */ -static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) -{ - unsigned long bw; - int ret; - - /* - * Only linear delay values is supported for current Intel SKUs. - */ - if (!r->membw.delay_linear && r->membw.arch_needs_linear) { - rdt_last_cmd_puts("No support for non-linear MB domains\n"); - return false; - } - - ret = kstrtoul(buf, 10, &bw); - if (ret) { - rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf); - return false; - } - - if ((bw < r->membw.min_bw || bw > r->default_ctrl) && - !is_mba_sc(r)) { - rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw, - r->membw.min_bw, r->default_ctrl); - return false; - } - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d) -{ - struct resctrl_staged_config *cfg; - u32 closid = data->rdtgrp->closid; - struct rdt_resource *r = s->res; - unsigned long bw_val; - - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->id); - return -EINVAL; - } - - if (!bw_validate(data->buf, &bw_val, r)) - return -EINVAL; - - if (is_mba_sc(r)) { - d->mbps_val[closid] = bw_val; - return 0; - } - - cfg->new_ctrl = bw_val; - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * Check whether a cache bit mask is valid. - * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: - * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 - * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 - * - * Haswell does not support a non-contiguous 1s value and additionally - * requires at least two bits set. - * AMD allows non-contiguous bitmasks. - */ -static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) -{ - unsigned long first_bit, zero_bit, val; - unsigned int cbm_len = r->cache.cbm_len; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) { - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); - return false; - } - - if ((r->cache.min_cbm_bits > 0 && val == 0) || val > r->default_ctrl) { - rdt_last_cmd_puts("Mask out of range\n"); - return false; - } - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - /* Are non-contiguous bitmasks allowed? */ - if (!r->cache.arch_has_sparse_bitmasks && - (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { - rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); - return false; - } - - if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("Need at least %d bits in the mask\n", - r->cache.min_cbm_bits); - return false; - } - - *data = val; - return true; -} - -/* - * Read one cache bit mask (hex). Check that it is valid for the current - * resource type. - */ -int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d) -{ - struct rdtgroup *rdtgrp = data->rdtgrp; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - u32 cbm_val; - - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->id); - return -EINVAL; - } - - /* - * Cannot set up more than one pseudo-locked region in a cache - * hierarchy. - */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - rdtgroup_pseudo_locked_in_hierarchy(d)) { - rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); - return -EINVAL; - } - - if (!cbm_validate(data->buf, &cbm_val, r)) - return -EINVAL; - - if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_SHAREABLE) && - rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { - rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); - return -EINVAL; - } - - /* - * The CBM may not overlap with the CBM of another closid if - * either is exclusive. - */ - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { - rdt_last_cmd_puts("Overlaps with exclusive group\n"); - return -EINVAL; - } - - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { - if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - rdt_last_cmd_puts("Overlaps with other group\n"); - return -EINVAL; - } - } - - cfg->new_ctrl = cbm_val; - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * For each domain in this resource we expect to find a series of: - * id=mask - * separated by ";". The "id" is in decimal, and must match one of - * the "id"s for this resource. - */ -static int parse_line(char *line, struct resctrl_schema *s, - struct rdtgroup *rdtgrp) -{ - enum resctrl_conf_type t = s->conf_type; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - struct rdt_parse_data data; - char *dom = NULL, *id; - struct rdt_domain *d; - unsigned long dom_id; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { - rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); - return -EINVAL; - } - -next: - if (!line || line[0] == '\0') - return 0; - dom = strsep(&line, ";"); - id = strsep(&dom, "="); - if (!dom || kstrtoul(id, 10, &dom_id)) { - rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); - return -EINVAL; - } - dom = strim(dom); - list_for_each_entry(d, &r->domains, list) { - if (d->id == dom_id) { - data.buf = dom; - data.rdtgrp = rdtgrp; - if (r->parse_ctrlval(&data, s, d)) - return -EINVAL; - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - cfg = &d->staged_config[t]; - /* - * In pseudo-locking setup mode and just - * parsed a valid CBM that should be - * pseudo-locked. Only one locked region per - * resource group and domain so just do - * the required initialization for single - * region and return. - */ - rdtgrp->plr->s = s; - rdtgrp->plr->d = d; - rdtgrp->plr->cbm = cfg->new_ctrl; - d->plr = rdtgrp->plr; - return 0; - } - goto next; - } - } - return -EINVAL; -} - -static u32 get_config_index(u32 closid, enum resctrl_conf_type type) -{ - switch (type) { - default: - case CDP_NONE: - return closid; - case CDP_CODE: - return closid * 2 + 1; - case CDP_DATA: - return closid * 2; - } -} - -static bool apply_config(struct rdt_hw_domain *hw_dom, - struct resctrl_staged_config *cfg, u32 idx, - cpumask_var_t cpu_mask) -{ - struct rdt_domain *dom = &hw_dom->d_resctrl; - - if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) { - cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask); - hw_dom->ctrl_val[idx] = cfg->new_ctrl; - - return true; - } - - return false; -} - -int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, +int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - u32 idx = get_config_index(closid, t); + u32 idx = resctrl_get_config_index(closid, t); struct msr_param msr_param; - if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) + if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask)) return -EINVAL; hw_dom->ctrl_val[idx] = cfg_val; msr_param.res = r; + msr_param.dom = d; msr_param.low = idx; msr_param.high = idx + 1; - hw_res->msr_update(d, &msr_param, r); + hw_res->msr_update(&msr_param); return 0; } @@ -312,310 +44,50 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d, int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) { struct resctrl_staged_config *cfg; - struct rdt_hw_domain *hw_dom; + struct rdt_hw_ctrl_domain *hw_dom; struct msr_param msr_param; + struct rdt_ctrl_domain *d; enum resctrl_conf_type t; - cpumask_var_t cpu_mask; - struct rdt_domain *d; u32 idx; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) - return -ENOMEM; - - msr_param.res = NULL; - list_for_each_entry(d, &r->domains, list) { - hw_dom = resctrl_to_arch_dom(d); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + hw_dom = resctrl_to_arch_ctrl_dom(d); + msr_param.res = NULL; for (t = 0; t < CDP_NUM_TYPES; t++) { cfg = &hw_dom->d_resctrl.staged_config[t]; if (!cfg->have_new_ctrl) continue; - idx = get_config_index(closid, t); - if (!apply_config(hw_dom, cfg, idx, cpu_mask)) + idx = resctrl_get_config_index(closid, t); + if (cfg->new_ctrl == hw_dom->ctrl_val[idx]) continue; + hw_dom->ctrl_val[idx] = cfg->new_ctrl; if (!msr_param.res) { msr_param.low = idx; msr_param.high = msr_param.low + 1; msr_param.res = r; + msr_param.dom = d; } else { msr_param.low = min(msr_param.low, idx); msr_param.high = max(msr_param.high, idx + 1); } } + if (msr_param.res) + smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1); } - if (cpumask_empty(cpu_mask)) - goto done; - - /* Update resource control msr on all the CPUs. */ - on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); - -done: - free_cpumask_var(cpu_mask); - return 0; } -static int rdtgroup_parse_resource(char *resname, char *tok, - struct rdtgroup *rdtgrp) -{ - struct resctrl_schema *s; - - list_for_each_entry(s, &resctrl_schema_all, list) { - if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) - return parse_line(tok, s, rdtgrp); - } - rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); - return -EINVAL; -} - -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct resctrl_schema *s; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - char *tok, *resname; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - /* - * No changes to pseudo-locked region allowed. It has to be removed - * and re-created instead. - */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = -EINVAL; - rdt_last_cmd_puts("Resource group is pseudo-locked\n"); - goto out; - } - - rdt_staged_configs_clear(); - - while ((tok = strsep(&buf, "\n")) != NULL) { - resname = strim(strsep(&tok, ":")); - if (!tok) { - rdt_last_cmd_puts("Missing ':'\n"); - ret = -EINVAL; - goto out; - } - if (tok[0] == '\0') { - rdt_last_cmd_printf("Missing '%s' value\n", resname); - ret = -EINVAL; - goto out; - } - ret = rdtgroup_parse_resource(resname, tok, rdtgrp); - if (ret) - goto out; - } - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - - /* - * Writes to mba_sc resources update the software controller, - * not the control MSR. - */ - if (is_mba_sc(r)) - continue; - - ret = resctrl_arch_update_domains(r, rdtgrp->closid); - if (ret) - goto out; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - /* - * If pseudo-locking fails we keep the resource group in - * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service - * active and updated for just the domain the pseudo-locked - * region was requested for. - */ - ret = rdtgroup_pseudo_lock_create(rdtgrp); - } - -out: - rdt_staged_configs_clear(); - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d, +u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); - u32 idx = get_config_index(closid, type); + struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d); + u32 idx = resctrl_get_config_index(closid, type); return hw_dom->ctrl_val[idx]; } - -static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) -{ - struct rdt_resource *r = schema->res; - struct rdt_domain *dom; - bool sep = false; - u32 ctrl_val; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(dom, &r->domains, list) { - if (sep) - seq_puts(s, ";"); - - if (is_mba_sc(r)) - ctrl_val = dom->mbps_val[closid]; - else - ctrl_val = resctrl_arch_get_config(r, dom, closid, - schema->conf_type); - - seq_printf(s, r->format_str, dom->id, max_data_width, - ctrl_val); - sep = true; - } - seq_puts(s, "\n"); -} - -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct resctrl_schema *schema; - struct rdtgroup *rdtgrp; - int ret = 0; - u32 closid; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - list_for_each_entry(schema, &resctrl_schema_all, list) { - seq_printf(s, "%s:uninitialized\n", schema->name); - } - } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - seq_printf(s, "%s:%d=%x\n", - rdtgrp->plr->s->res->name, - rdtgrp->plr->d->id, - rdtgrp->plr->cbm); - } - } else { - closid = rdtgrp->closid; - list_for_each_entry(schema, &resctrl_schema_all, list) { - if (closid < schema->num_closid) - show_doms(s, schema, closid); - } - } - } else { - ret = -ENOENT; - } - rdtgroup_kn_unlock(of->kn); - return ret; -} - -static int smp_mon_event_count(void *arg) -{ - mon_event_count(arg); - - return 0; -} - -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_domain *d, struct rdtgroup *rdtgrp, - int evtid, int first) -{ - int cpu; - - /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - /* - * Setup the parameters to pass to mon_event_count() to read the data. - */ - rr->rgrp = rdtgrp; - rr->evtid = evtid; - rr->r = r; - rr->d = d; - rr->val = 0; - rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); - if (IS_ERR(rr->arch_mon_ctx)) { - rr->err = -EINVAL; - return; - } - - cpu = cpumask_any_housekeeping(&d->cpu_mask, RESCTRL_PICK_ANY_CPU); - - /* - * cpumask_any_housekeeping() prefers housekeeping CPUs, but - * are all the CPUs nohz_full? If yes, pick a CPU to IPI. - * MPAM's resctrl_arch_rmid_read() is unable to read the - * counters on some platforms if its called in IRQ context. - */ - if (tick_nohz_full_cpu(cpu)) - smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); - else - smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); -} - -int rdtgroup_mondata_show(struct seq_file *m, void *arg) -{ - struct kernfs_open_file *of = m->private; - u32 resid, evtid, domid; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - union mon_data_bits md; - struct rdt_domain *d; - struct rmid_read rr; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - ret = -ENOENT; - goto out; - } - - md.priv = of->kn->priv; - resid = md.u.rid; - domid = md.u.domid; - evtid = md.u.evtid; - - r = &rdt_resources_all[resid].r_resctrl; - d = rdt_find_domain(r, domid, NULL); - if (IS_ERR_OR_NULL(d)) { - ret = -ENOENT; - goto out; - } - - mon_event_read(&rr, r, d, rdtgrp, evtid, false); - - if (rr.err == -EIO) - seq_puts(m, "Error\n"); - else if (rr.err == -EINVAL) - seq_puts(m, "Unavailable\n"); - else - seq_printf(m, "%llu\n", rr.val); - -out: - rdtgroup_kn_unlock(of->kn); - return ret; -} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index c99f26ebe7a6..5e3c41b36437 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -3,28 +3,21 @@ #define _ASM_X86_RESCTRL_INTERNAL_H #include <linux/resctrl.h> -#include <linux/sched.h> -#include <linux/kernfs.h> -#include <linux/fs_context.h> -#include <linux/jump_label.h> -#include <linux/tick.h> - -#include <asm/resctrl.h> #define L3_QOS_CDP_ENABLE 0x01ULL #define L2_QOS_CDP_ENABLE 0x01ULL -#define CQM_LIMBOCHECK_INTERVAL 1000 - #define MBM_CNTR_WIDTH_BASE 24 -#define MBM_OVERFLOW_INTERVAL 1000 -#define MAX_MBA_BW 100u + #define MBA_IS_LINEAR 0x4 + #define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) + #define RMID_VAL_UNAVAIL BIT_ULL(62) + /* * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for * data to be returned. The counter width is discovered from the hardware @@ -32,315 +25,6 @@ */ #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) -/* Reads to Local DRAM Memory */ -#define READS_TO_LOCAL_MEM BIT(0) - -/* Reads to Remote DRAM Memory */ -#define READS_TO_REMOTE_MEM BIT(1) - -/* Non-Temporal Writes to Local Memory */ -#define NON_TEMP_WRITE_TO_LOCAL_MEM BIT(2) - -/* Non-Temporal Writes to Remote Memory */ -#define NON_TEMP_WRITE_TO_REMOTE_MEM BIT(3) - -/* Reads to Local Memory the system identifies as "Slow Memory" */ -#define READS_TO_LOCAL_S_MEM BIT(4) - -/* Reads to Remote Memory the system identifies as "Slow Memory" */ -#define READS_TO_REMOTE_S_MEM BIT(5) - -/* Dirty Victims to All Types of Memory */ -#define DIRTY_VICTIMS_TO_ALL_MEM BIT(6) - -/* Max event bits supported */ -#define MAX_EVT_CONFIG_BITS GENMASK(6, 0) - -/** - * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that - * aren't marked nohz_full - * @mask: The mask to pick a CPU from. - * @exclude_cpu:The CPU to avoid picking. - * - * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping - * CPUs that don't use nohz_full, these are preferred. Pass - * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. - * - * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. - */ -static inline unsigned int -cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) -{ - unsigned int cpu, hk_cpu; - - if (exclude_cpu == RESCTRL_PICK_ANY_CPU) - cpu = cpumask_any(mask); - else - cpu = cpumask_any_but(mask, exclude_cpu); - - if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) - return cpu; - - /* If the CPU picked isn't marked nohz_full nothing more needs doing. */ - if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) - return cpu; - - /* Try to find a CPU that isn't nohz_full to use in preference */ - hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); - if (hk_cpu == exclude_cpu) - hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); - - if (hk_cpu < nr_cpu_ids) - cpu = hk_cpu; - - return cpu; -} - -struct rdt_fs_context { - struct kernfs_fs_context kfc; - bool enable_cdpl2; - bool enable_cdpl3; - bool enable_mba_mbps; - bool enable_debug; -}; - -static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) -{ - struct kernfs_fs_context *kfc = fc->fs_private; - - return container_of(kfc, struct rdt_fs_context, kfc); -} - -/** - * struct mon_evt - Entry in the event list of a resource - * @evtid: event id - * @name: name of the event - * @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list - */ -struct mon_evt { - enum resctrl_event_id evtid; - char *name; - bool configurable; - struct list_head list; -}; - -/** - * union mon_data_bits - Monitoring details for each event file - * @priv: Used to store monitoring event data in @u - * as kernfs private data - * @rid: Resource id associated with the event file - * @evtid: Event id associated with the event file - * @domid: The domain to which the event file belongs - * @u: Name of the bit fields struct - */ -union mon_data_bits { - void *priv; - struct { - unsigned int rid : 10; - enum resctrl_event_id evtid : 8; - unsigned int domid : 14; - } u; -}; - -struct rmid_read { - struct rdtgroup *rgrp; - struct rdt_resource *r; - struct rdt_domain *d; - enum resctrl_event_id evtid; - bool first; - int err; - u64 val; - void *arch_mon_ctx; -}; - -extern unsigned int rdt_mon_features; -extern struct list_head resctrl_schema_all; -extern bool resctrl_mounted; - -enum rdt_group_type { - RDTCTRL_GROUP = 0, - RDTMON_GROUP, - RDT_NUM_GROUP, -}; - -/** - * enum rdtgrp_mode - Mode of a RDT resource group - * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations - * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed - * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking - * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations - * allowed AND the allocations are Cache Pseudo-Locked - * @RDT_NUM_MODES: Total number of modes - * - * The mode of a resource group enables control over the allowed overlap - * between allocations associated with different resource groups (classes - * of service). User is able to modify the mode of a resource group by - * writing to the "mode" resctrl file associated with the resource group. - * - * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by - * writing the appropriate text to the "mode" file. A resource group enters - * "pseudo-locked" mode after the schemata is written while the resource - * group is in "pseudo-locksetup" mode. - */ -enum rdtgrp_mode { - RDT_MODE_SHAREABLE = 0, - RDT_MODE_EXCLUSIVE, - RDT_MODE_PSEUDO_LOCKSETUP, - RDT_MODE_PSEUDO_LOCKED, - - /* Must be last */ - RDT_NUM_MODES, -}; - -/** - * struct mongroup - store mon group's data in resctrl fs. - * @mon_data_kn: kernfs node for the mon_data directory - * @parent: parent rdtgrp - * @crdtgrp_list: child rdtgroup node list - * @rmid: rmid for this rdtgroup - */ -struct mongroup { - struct kernfs_node *mon_data_kn; - struct rdtgroup *parent; - struct list_head crdtgrp_list; - u32 rmid; -}; - -/** - * struct pseudo_lock_region - pseudo-lock region information - * @s: Resctrl schema for the resource to which this - * pseudo-locked region belongs - * @d: RDT domain to which this pseudo-locked region - * belongs - * @cbm: bitmask of the pseudo-locked region - * @lock_thread_wq: waitqueue used to wait on the pseudo-locking thread - * completion - * @thread_done: variable used by waitqueue to test if pseudo-locking - * thread completed - * @cpu: core associated with the cache on which the setup code - * will be run - * @line_size: size of the cache lines - * @size: size of pseudo-locked region in bytes - * @kmem: the kernel memory associated with pseudo-locked region - * @minor: minor number of character device associated with this - * region - * @debugfs_dir: pointer to this region's directory in the debugfs - * filesystem - * @pm_reqs: Power management QoS requests related to this region - */ -struct pseudo_lock_region { - struct resctrl_schema *s; - struct rdt_domain *d; - u32 cbm; - wait_queue_head_t lock_thread_wq; - int thread_done; - int cpu; - unsigned int line_size; - unsigned int size; - void *kmem; - unsigned int minor; - struct dentry *debugfs_dir; - struct list_head pm_reqs; -}; - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. - * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - * @type: indicates type of this rdtgroup - either - * monitor only or ctrl_mon group - * @mon: mongroup related data - * @mode: mode of resource group - * @plr: pseudo-locked region - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - u32 closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; - enum rdt_group_type type; - struct mongroup mon; - enum rdtgrp_mode mode; - struct pseudo_lock_region *plr; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* - * Define the file type flags for base and info directories. - */ -#define RFTYPE_INFO BIT(0) -#define RFTYPE_BASE BIT(1) -#define RFTYPE_CTRL BIT(4) -#define RFTYPE_MON BIT(5) -#define RFTYPE_TOP BIT(6) -#define RFTYPE_RES_CACHE BIT(8) -#define RFTYPE_RES_MB BIT(9) -#define RFTYPE_DEBUG BIT(10) -#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) -#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) -#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) -#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) -#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width, max_data_width; - -int __init rdtgroup_init(void); -void __exit rdtgroup_exit(void); - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @fflags: File specific RFTYPE_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - const struct kernfs_ops *kf_ops; - unsigned long flags; - unsigned long fflags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct mbm_state - status for each MBM counter in each domain - * @prev_bw_bytes: Previous bytes value read for bandwidth calculation - * @prev_bw: The most recent bandwidth in MBps - */ -struct mbm_state { - u64 prev_bw_bytes; - u32 prev_bw; -}; - /** * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s * return value. @@ -354,70 +38,57 @@ struct arch_mbm_state { }; /** - * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share - * a resource + * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share + * a resource for a control function * @d_resctrl: Properties exposed to the resctrl file system * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * + * Members of this structure are accessed via helpers that provide abstraction. + */ +struct rdt_hw_ctrl_domain { + struct rdt_ctrl_domain d_resctrl; + u32 *ctrl_val; +}; + +/** + * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share + * a resource for a monitor function + * @d_resctrl: Properties exposed to the resctrl file system * @arch_mbm_total: arch private state for MBM total bandwidth * @arch_mbm_local: arch private state for MBM local bandwidth * * Members of this structure are accessed via helpers that provide abstraction. */ -struct rdt_hw_domain { - struct rdt_domain d_resctrl; - u32 *ctrl_val; +struct rdt_hw_mon_domain { + struct rdt_mon_domain d_resctrl; struct arch_mbm_state *arch_mbm_total; struct arch_mbm_state *arch_mbm_local; }; -static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r) +static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r) { - return container_of(r, struct rdt_hw_domain, d_resctrl); + return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl); +} + +static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r) +{ + return container_of(r, struct rdt_hw_mon_domain, d_resctrl); } /** * struct msr_param - set a range of MSRs from a domain * @res: The resource to use + * @dom: The domain to update * @low: Beginning index from base MSR * @high: End index */ struct msr_param { struct rdt_resource *res; + struct rdt_ctrl_domain *dom; u32 low; u32 high; }; -static inline bool is_llc_occupancy_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); -} - -static inline bool is_mbm_total_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); -} - -static inline bool is_mbm_local_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); -} - -static inline bool is_mbm_enabled(void) -{ - return (is_mbm_total_enabled() || is_mbm_local_enabled()); -} - -static inline bool is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); -} - -struct rdt_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - /** * struct rdt_hw_resource - arch private attributes of a resctrl resource * @r_resctrl: Attributes of the resource used directly by resctrl. @@ -430,8 +101,6 @@ struct rdt_parse_data { * @msr_update: Function pointer to update QOS MSRs * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. - * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth - * Monitoring Event Configuration (BMEC) is supported. * @cdp_enabled: CDP state of this resource * * Members of this structure are either private to the architecture @@ -442,11 +111,9 @@ struct rdt_hw_resource { struct rdt_resource r_resctrl; u32 num_closid; unsigned int msr_base; - void (*msr_update) (struct rdt_domain *d, struct msr_param *m, - struct rdt_resource *r); + void (*msr_update)(struct msr_param *m); unsigned int mon_scale; unsigned int mbm_width; - unsigned int mbm_cfg_mask; bool cdp_enabled; }; @@ -455,62 +122,9 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r return container_of(r, struct rdt_hw_resource, r_resctrl); } -int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d); -int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_domain *d); - -extern struct mutex rdtgroup_mutex; - extern struct rdt_hw_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -extern struct dentry *debugfs_resctrl; - -enum resctrl_res_level { - RDT_RESOURCE_L3, - RDT_RESOURCE_L2, - RDT_RESOURCE_MBA, - RDT_RESOURCE_SMBA, - /* Must be the last */ - RDT_NUM_RESOURCES, -}; - -static inline struct rdt_resource *resctrl_inc(struct rdt_resource *res) -{ - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(res); - - hw_res++; - return &hw_res->r_resctrl; -} - -static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) -{ - return rdt_resources_all[l].cdp_enabled; -} - -int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); - -/* - * To return the common struct rdt_resource, which is contained in struct - * rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource. - */ -#define for_each_rdt_resource(r) \ - for (r = &rdt_resources_all[0].r_resctrl; \ - r <= &rdt_resources_all[RDT_NUM_RESOURCES - 1].r_resctrl; \ - r = resctrl_inc(r)) - -#define for_each_capable_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->alloc_capable || r->mon_capable) - -#define for_each_alloc_capable_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->alloc_capable) - -#define for_each_mon_capable_rdt_resource(r) \ - for_each_rdt_resource(r) \ - if (r->mon_capable) +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ union cpuid_0x10_1_eax { @@ -545,66 +159,14 @@ union cpuid_0x10_x_edx { unsigned int full; }; -void rdt_last_cmd_clear(void); -void rdt_last_cmd_puts(const char *s); -__printf(1, 2) -void rdt_last_cmd_printf(const char *fmt, ...); - void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, - umode_t mask); -struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, - unsigned long cbm, int closid, bool exclusive); -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d, - unsigned long cbm); -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); -int rdtgroup_tasks_assigned(struct rdtgroup *r); -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm); -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d); -int rdt_pseudo_lock_init(void); -void rdt_pseudo_lock_release(void); -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); -struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); -int closids_supported(void); -void closid_free(int closid); -int alloc_rmid(u32 closid); -void free_rmid(u32 closid, u32 rmid); + int rdt_get_mon_l3_config(struct rdt_resource *r); -void __exit rdt_put_mon_l3_config(void); -bool __init rdt_cpu_has(int flag); -void mon_event_count(void *info); -int rdtgroup_mondata_show(struct seq_file *m, void *arg); -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_domain *d, struct rdtgroup *rdtgrp, - int evtid, int first); -void mbm_setup_overflow_handler(struct rdt_domain *dom, - unsigned long delay_ms, - int exclude_cpu); -void mbm_handle_overflow(struct work_struct *work); + +bool rdt_cpu_has(int flag); + void __init intel_rdt_mbm_apply_quirk(void); -bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, - int exclude_cpu); -void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_domain *d); -void __check_limbo(struct rdt_domain *d, bool force_free); + void rdt_domain_reconfigure_cdp(struct rdt_resource *r); -void __init thread_throttle_mode_init(void); -void __init mbm_config_rftype_init(const char *config); -void rdt_staged_configs_clear(void); -bool closid_allocated(unsigned int closid); -int resctrl_find_cleanest_closid(void); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c34a35ec0f03..c261558276cd 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -15,63 +15,16 @@ * Software Developer Manual June 2016, volume 3, section 17.17. */ +#define pr_fmt(fmt) "resctrl: " fmt + #include <linux/cpu.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <linux/slab.h> +#include <linux/resctrl.h> #include <asm/cpu_device_id.h> -#include <asm/resctrl.h> +#include <asm/msr.h> #include "internal.h" -/** - * struct rmid_entry - dirty tracking for all RMID. - * @closid: The CLOSID for this entry. - * @rmid: The RMID for this entry. - * @busy: The number of domains with cached data using this RMID. - * @list: Member of the rmid_free_lru list when busy == 0. - * - * Depending on the architecture the correct monitor is accessed using - * both @closid and @rmid, or @rmid only. - * - * Take the rdtgroup_mutex when accessing. - */ -struct rmid_entry { - u32 closid; - u32 rmid; - int busy; - struct list_head list; -}; - -/* - * @rmid_free_lru - A least recently used list of free RMIDs - * These RMIDs are guaranteed to have an occupancy less than the - * threshold occupancy - */ -static LIST_HEAD(rmid_free_lru); - -/* - * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. - * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. - * Indexed by CLOSID. Protected by rdtgroup_mutex. - */ -static u32 *closid_num_dirty_rmid; - -/* - * @rmid_limbo_count - count of currently unused but (potentially) - * dirty RMIDs. - * This counts RMIDs that no one is currently using but that - * may have a occupancy value > resctrl_rmid_realloc_threshold. User can - * change the threshold occupancy value. - */ -static unsigned int rmid_limbo_count; - -/* - * @rmid_entry - The entry in the limbo and free lists. - */ -static struct rmid_entry *rmid_ptrs; - /* * Global boolean for rdt_monitor which is true if any * resource monitoring is enabled. @@ -83,21 +36,12 @@ bool rdt_mon_capable; */ unsigned int rdt_mon_features; -/* - * This is the threshold cache occupancy in bytes at which we will consider an - * RMID available for re-allocation. - */ -unsigned int resctrl_rmid_realloc_threshold; - -/* - * This is the maximum value for the reallocation threshold, in bytes. - */ -unsigned int resctrl_rmid_realloc_limit; - #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) +static int snc_nodes_per_l3_cache = 1; + /* - * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. + * The correction factor table is documented in Documentation/filesystems/resctrl.rst. * If rmid > rmid threshold, MBM total and local values should be multiplied * by the correction factor. * @@ -146,6 +90,7 @@ static const struct mbm_correction_factor_table { }; static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; + static u64 mbm_cf __read_mostly; static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) @@ -158,33 +103,42 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) } /* - * x86 and arm64 differ in their handling of monitoring. - * x86's RMID are independent numbers, there is only one source of traffic - * with an RMID value of '1'. - * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of - * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID - * value is no longer unique. - * To account for this, resctrl uses an index. On x86 this is just the RMID, - * on arm64 it encodes the CLOSID and RMID. This gives a unique number. + * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by + * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is + * needed. The physical RMID is the same as the logical RMID. + * + * On a platform with SNC mode enabled, Linux enables RMID sharing mode + * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel + * Resource Director Technology Architecture Specification" for a full + * description of RMID sharing mode). * - * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code - * must accept an attempt to read every index. + * In RMID sharing mode there are fewer "logical RMID" values available + * to accumulate data ("physical RMIDs" are divided evenly between SNC + * nodes that share an L3 cache). Linux creates an rdt_mon_domain for + * each SNC node. + * + * The value loaded into IA32_PQR_ASSOC is the "logical RMID". + * + * Data is collected independently on each SNC node and can be retrieved + * using the "physical RMID" value computed by this function and loaded + * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node. + * + * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3 + * cache. So a "physical RMID" may be read from any CPU that shares + * the L3 cache with the desired SNC node, not just from a CPU in + * the specific SNC node. */ -static inline struct rmid_entry *__rmid_entry(u32 idx) +static int logical_rmid_to_physical_rmid(int cpu, int lrmid) { - struct rmid_entry *entry; - u32 closid, rmid; - - entry = &rmid_ptrs[idx]; - resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - WARN_ON_ONCE(entry->closid != closid); - WARN_ON_ONCE(entry->rmid != rmid); + if (snc_nodes_per_l3_cache == 1) + return lrmid; - return entry; + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; } -static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) +static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) { u64 msr_val; @@ -196,8 +150,8 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) * are error bits. */ - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, msr_val); + wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid); + rdmsrq(MSR_IA32_QM_CTR, msr_val); if (msr_val & RMID_VAL_ERROR) return -EIO; @@ -208,7 +162,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) return 0; } -static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, +static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom, u32 rmid, enum resctrl_event_id eventid) { @@ -219,27 +173,29 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, return &hw_dom->arch_mbm_total[rmid]; case QOS_L3_MBM_LOCAL_EVENT_ID: return &hw_dom->arch_mbm_local[rmid]; + default: + /* Never expect to get here */ + WARN_ON_ONCE(1); + return NULL; } - - /* Never expect to get here */ - WARN_ON_ONCE(1); - - return NULL; } -void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, +void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid) { - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; + u32 prmid; am = get_arch_mbm_state(hw_dom, rmid, eventid); if (am) { memset(am, 0, sizeof(*am)); + prmid = logical_rmid_to_physical_rmid(cpu, rmid); /* Record any initial, non-zero count value. */ - __rmid_read(rmid, eventid, &am->prev_msr); + __rmid_read_phys(prmid, eventid, &am->prev_msr); } } @@ -247,15 +203,15 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, * Assumes that hardware counters are also reset and thus that there is * no need to record initial non-zero counts. */ -void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d) +void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - if (is_mbm_total_enabled()) + if (resctrl_arch_is_mbm_total_enabled()) memset(hw_dom->arch_mbm_total, 0, sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - if (is_mbm_local_enabled()) + if (resctrl_arch_is_mbm_local_enabled()) memset(hw_dom->arch_mbm_local, 0, sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); } @@ -268,22 +224,22 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >> shift; } -int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, +int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, u32 unused, u32 rmid, enum resctrl_event_id eventid, u64 *val, void *ignored) { + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); + int cpu = cpumask_any(&d->hdr.cpu_mask); struct arch_mbm_state *am; u64 msr_val, chunks; + u32 prmid; int ret; resctrl_arch_rmid_read_context_check(); - if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) - return -EINVAL; - - ret = __rmid_read(rmid, eventid, &msr_val); + prmid = logical_rmid_to_physical_rmid(cpu, rmid); + ret = __rmid_read_phys(prmid, eventid, &msr_val); if (ret) return ret; @@ -302,705 +258,87 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, return 0; } -static void limbo_release_entry(struct rmid_entry *entry) -{ - lockdep_assert_held(&rdtgroup_mutex); - - rmid_limbo_count--; - list_add_tail(&entry->list, &rmid_free_lru); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]--; -} - /* - * Check the RMIDs that are marked as busy for this domain. If the - * reported LLC occupancy is below the threshold clear the busy bit and - * decrement the count. If the busy count gets to zero on an RMID, we - * free the RMID + * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 + * which indicates that RMIDs are configured in legacy mode. + * This mode is incompatible with Linux resctrl semantics + * as RMIDs are partitioned between SNC nodes, which requires + * a user to know which RMID is allocated to a task. + * Clearing bit 0 reconfigures the RMID counters for use + * in RMID sharing mode. This mode is better for Linux. + * The RMID space is divided between all SNC nodes with the + * RMIDs renumbered to start from zero in each node when + * counting operations from tasks. Code to read the counters + * must adjust RMID counter numbers based on SNC node. See + * logical_rmid_to_physical_rmid() for code that does this. */ -void __check_limbo(struct rdt_domain *d, bool force_free) +void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - struct rmid_entry *entry; - u32 idx, cur_idx = 1; - void *arch_mon_ctx; - bool rmid_dirty; - u64 val = 0; - - arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); - if (IS_ERR(arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(arch_mon_ctx)); - return; - } - - /* - * Skip RMID 0 and start from RMID 1 and check all the RMIDs that - * are marked as busy for occupancy < threshold. If the occupancy - * is less than the threshold decrement the busy counter of the - * RMID and move it to the free list when the counter reaches 0. - */ - for (;;) { - idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); - if (idx >= idx_limit) - break; - - entry = __rmid_entry(idx); - if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val, - arch_mon_ctx)) { - rmid_dirty = true; - } else { - rmid_dirty = (val >= resctrl_rmid_realloc_threshold); - } - - if (force_free || !rmid_dirty) { - clear_bit(idx, d->rmid_busy_llc); - if (!--entry->busy) - limbo_release_entry(entry); - } - cur_idx = idx + 1; - } - - resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); -} - -bool has_busy_rmid(struct rdt_domain *d) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - - return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; + if (snc_nodes_per_l3_cache > 1) + msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); } -static struct rmid_entry *resctrl_find_free_rmid(u32 closid) -{ - struct rmid_entry *itr; - u32 itr_idx, cmp_idx; - - if (list_empty(&rmid_free_lru)) - return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); - - list_for_each_entry(itr, &rmid_free_lru, list) { - /* - * Get the index of this free RMID, and the index it would need - * to be if it were used with this CLOSID. - * If the CLOSID is irrelevant on this architecture, the two - * index values are always the same on every entry and thus the - * very first entry will be returned. - */ - itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); - cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); - - if (itr_idx == cmp_idx) - return itr; - } - - return ERR_PTR(-ENOSPC); -} - -/** - * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated - * RMID are clean, or the CLOSID that has - * the most clean RMID. - * - * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID - * may not be able to allocate clean RMID. To avoid this the allocator will - * choose the CLOSID with the most clean RMID. - * - * When the CLOSID and RMID are independent numbers, the first free CLOSID will - * be returned. - */ -int resctrl_find_cleanest_closid(void) -{ - u32 cleanest_closid = ~0; - int i = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - return -EIO; - - for (i = 0; i < closids_supported(); i++) { - int num_dirty; - - if (closid_allocated(i)) - continue; - - num_dirty = closid_num_dirty_rmid[i]; - if (num_dirty == 0) - return i; - - if (cleanest_closid == ~0) - cleanest_closid = i; - - if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) - cleanest_closid = i; - } - - if (cleanest_closid == ~0) - return -ENOSPC; - - return cleanest_closid; -} - -/* - * For MPAM the RMID value is not unique, and has to be considered with - * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which - * allows all domains to be managed by a single free list. - * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. - */ -int alloc_rmid(u32 closid) -{ - struct rmid_entry *entry; - - lockdep_assert_held(&rdtgroup_mutex); - - entry = resctrl_find_free_rmid(closid); - if (IS_ERR(entry)) - return PTR_ERR(entry); - - list_del(&entry->list); - return entry->rmid; -} - -static void add_rmid_to_limbo(struct rmid_entry *entry) -{ - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - struct rdt_domain *d; - u32 idx; - - lockdep_assert_held(&rdtgroup_mutex); - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); - - entry->busy = 0; - list_for_each_entry(d, &r->domains, list) { - /* - * For the first limbo RMID in the domain, - * setup up the limbo worker. - */ - if (!has_busy_rmid(d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, - RESCTRL_PICK_ANY_CPU); - set_bit(idx, d->rmid_busy_llc); - entry->busy++; - } - - rmid_limbo_count++; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]++; -} - -void free_rmid(u32 closid, u32 rmid) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - struct rmid_entry *entry; - - lockdep_assert_held(&rdtgroup_mutex); - - /* - * Do not allow the default rmid to be free'd. Comparing by index - * allows architectures that ignore the closid parameter to avoid an - * unnecessary check. - */ - if (idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, - RESCTRL_RESERVED_RMID)) - return; - - entry = __rmid_entry(idx); - - if (is_llc_occupancy_enabled()) - add_rmid_to_limbo(entry); - else - list_add_tail(&entry->list, &rmid_free_lru); -} - -static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 closid, - u32 rmid, enum resctrl_event_id evtid) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: - return NULL; - } -} - -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) -{ - struct mbm_state *m; - u64 tval = 0; - - if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); - if (m) - memset(m, 0, sizeof(struct mbm_state)); - return 0; - } - - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, rr->evtid, - &tval, rr->arch_mon_ctx); - if (rr->err) - return rr->err; - - rr->val += tval; - - return 0; -} - -/* - * mbm_bw_count() - Update bw count from values previously read by - * __mon_event_count(). - * @closid: The closid used to identify the cached mbm_state. - * @rmid: The rmid used to identify the cached mbm_state. - * @rr: The struct rmid_read populated by __mon_event_count(). - * - * Supporting function to calculate the memory bandwidth - * and delta bandwidth in MBps. The chunks value previously read by - * __mon_event_count() is compared with the chunks value from the previous - * invocation. This must be called once per second to maintain values in MBps. - */ -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - struct mbm_state *m = &rr->d->mbm_local[idx]; - u64 cur_bw, bytes, cur_bytes; - - cur_bytes = rr->val; - bytes = cur_bytes - m->prev_bw_bytes; - m->prev_bw_bytes = cur_bytes; - - cur_bw = bytes / SZ_1M; - - m->prev_bw = cur_bw; -} +/* CPU models that support MSR_RMID_SNC_CONFIG */ +static const struct x86_cpu_id snc_cpu_ids[] __initconst = { + X86_MATCH_VFM(INTEL_ICELAKE_X, 0), + X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0), + X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0), + X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0), + X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0), + {} +}; /* - * This is scheduled by mon_event_read() to read the CQM/MBM counters - * on a domain. + * There isn't a simple hardware bit that indicates whether a CPU is running + * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the + * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in + * the same NUMA node as CPU0. + * It is not possible to accurately determine SNC state if the system is + * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes + * to L3 caches. It will be OK if system is booted with hyperthreading + * disabled (since this doesn't affect the ratio). */ -void mon_event_count(void *info) +static __init int snc_get_config(void) { - struct rdtgroup *rdtgrp, *entry; - struct rmid_read *rr = info; - struct list_head *head; + struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE); + const cpumask_t *node0_cpumask; + int cpus_per_node, cpus_per_l3; int ret; - rdtgrp = rr->rgrp; - - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); - - /* - * For Ctrl groups read data from child monitor groups and - * add them together. Count events which are read successfully. - * Discard the rmid_read's reporting errors. - */ - head = &rdtgrp->mon.crdtgrp_list; - - if (rdtgrp->type == RDTCTRL_GROUP) { - list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->closid, entry->mon.rmid, - rr) == 0) - ret = 0; - } - } - - /* - * __mon_event_count() calls for newly created monitor groups may - * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. - * Discard error if any of the monitor event reads succeeded. - */ - if (ret == 0) - rr->err = 0; -} - -/* - * Feedback loop for MBA software controller (mba_sc) - * - * mba_sc is a feedback loop where we periodically read MBM counters and - * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so - * that: - * - * current bandwidth(cur_bw) < user specified bandwidth(user_bw) - * - * This uses the MBM counters to measure the bandwidth and MBA throttle - * MSRs to control the bandwidth for a particular rdtgrp. It builds on the - * fact that resctrl rdtgroups have both monitoring and control. - * - * The frequency of the checks is 1s and we just tag along the MBM overflow - * timer. Having 1s interval makes the calculation of bandwidth simpler. - * - * Although MBA's goal is to restrict the bandwidth to a maximum, there may - * be a need to increase the bandwidth to avoid unnecessarily restricting - * the L2 <-> L3 traffic. - * - * Since MBA controls the L2 external bandwidth where as MBM measures the - * L3 external bandwidth the following sequence could lead to such a - * situation. - * - * Consider an rdtgroup which had high L3 <-> memory traffic in initial - * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but - * after some time rdtgroup has mostly L2 <-> L3 traffic. - * - * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its - * throttle MSRs already have low percentage values. To avoid - * unnecessarily restricting such rdtgroups, we also increase the bandwidth. - */ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) -{ - u32 closid, rmid, cur_msr_val, new_msr_val; - struct mbm_state *pmbm_data, *cmbm_data; - struct rdt_resource *r_mba; - struct rdt_domain *dom_mba; - u32 cur_bw, user_bw, idx; - struct list_head *head; - struct rdtgroup *entry; - - if (!is_mbm_local_enabled()) - return; - - r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; - - closid = rgrp->closid; - rmid = rgrp->mon.rmid; - idx = resctrl_arch_rmid_idx_encode(closid, rmid); - pmbm_data = &dom_mbm->mbm_local[idx]; - - dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba); - if (!dom_mba) { - pr_warn_once("Failure to get domain for MBA update\n"); - return; - } - - cur_bw = pmbm_data->prev_bw; - user_bw = dom_mba->mbps_val[closid]; - - /* MBA resource doesn't support CDP */ - cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); - - /* - * For Ctrl groups read data from child monitor groups. - */ - head = &rgrp->mon.crdtgrp_list; - list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid]; - cur_bw += cmbm_data->prev_bw; - } - - /* - * Scale up/down the bandwidth linearly for the ctrl group. The - * bandwidth step is the bandwidth granularity specified by the - * hardware. - * Always increase throttling if current bandwidth is above the - * target set by user. - * But avoid thrashing up and down on every poll by checking - * whether a decrease in throttling is likely to push the group - * back over target. E.g. if currently throttling to 30% of bandwidth - * on a system with 10% granularity steps, check whether moving to - * 40% would go past the limit by multiplying current bandwidth by - * "(30 + 10) / 30". - */ - if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { - new_msr_val = cur_msr_val - r_mba->membw.bw_gran; - } else if (cur_msr_val < MAX_MBA_BW && - (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { - new_msr_val = cur_msr_val + r_mba->membw.bw_gran; - } else { - return; - } - - resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); -} - -static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, - u32 closid, u32 rmid) -{ - struct rmid_read rr; - - rr.first = false; - rr.r = r; - rr.d = d; - - /* - * This is protected from concurrent reads from user - * as both the user and we hold the global mutex. - */ - if (is_mbm_total_enabled()) { - rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; - rr.val = 0; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; - } - - __mon_event_count(closid, rmid, &rr); - - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); - } - if (is_mbm_local_enabled()) { - rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; - rr.val = 0; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; - } - - __mon_event_count(closid, rmid, &rr); - - /* - * Call the MBA software controller only for the - * control groups and when user has enabled - * the software controller explicitly. - */ - if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); - - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); - } -} - -/* - * Handler to scan the limbo list and move the RMIDs - * to free list whose occupancy < threshold_occupancy. - */ -void cqm_handle_limbo(struct work_struct *work) -{ - unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - struct rdt_domain *d; + if (!x86_match_cpu(snc_cpu_ids) || !ci) + return 1; cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - d = container_of(work, struct rdt_domain, cqm_limbo.work); - - __check_limbo(d, false); - - if (has_busy_rmid(d)) { - d->cqm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, - RESCTRL_PICK_ANY_CPU); - schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, - delay); - } - - mutex_unlock(&rdtgroup_mutex); + if (num_online_cpus() != num_present_cpus()) + pr_warn("Some CPUs offline, SNC detection may be incorrect\n"); cpus_read_unlock(); -} -/** - * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this - * domain. - * @dom: The domain the limbo handler should run for. - * @delay_ms: How far in the future the handler should run. - * @exclude_cpu: Which CPU the handler should not run on, - * RESCTRL_PICK_ANY_CPU to pick any CPU. - */ -void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms, - int exclude_cpu) -{ - unsigned long delay = msecs_to_jiffies(delay_ms); - int cpu; + node0_cpumask = cpumask_of_node(cpu_to_node(0)); - cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); - dom->cqm_work_cpu = cpu; + cpus_per_node = cpumask_weight(node0_cpumask); + cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map); - if (cpu < nr_cpu_ids) - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); -} + if (!cpus_per_node || !cpus_per_l3) + return 1; -void mbm_handle_overflow(struct work_struct *work) -{ - unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); - struct rdtgroup *prgrp, *crgrp; - struct list_head *head; - struct rdt_resource *r; - struct rdt_domain *d; + ret = cpus_per_l3 / cpus_per_node; - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - /* - * If the filesystem has been unmounted this work no longer needs to - * run. - */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) - goto out_unlock; - - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - d = container_of(work, struct rdt_domain, mbm_over.work); - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); - - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); - - if (is_mba_sc(NULL)) - update_mba_bw(prgrp, d); - } - - /* - * Re-check for housekeeping CPUs. This allows the overflow handler to - * move off a nohz_full CPU quickly. - */ - d->mbm_work_cpu = cpumask_any_housekeeping(&d->cpu_mask, - RESCTRL_PICK_ANY_CPU); - schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -/** - * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this - * domain. - * @dom: The domain the overflow handler should run for. - * @delay_ms: How far in the future the handler should run. - * @exclude_cpu: Which CPU the handler should not run on, - * RESCTRL_PICK_ANY_CPU to pick any CPU. - */ -void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms, - int exclude_cpu) -{ - unsigned long delay = msecs_to_jiffies(delay_ms); - int cpu; - - /* - * When a domain comes online there is no guarantee the filesystem is - * mounted. If not, there is no need to catch counter overflow. - */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) - return; - cpu = cpumask_any_housekeeping(&dom->cpu_mask, exclude_cpu); - dom->mbm_work_cpu = cpu; - - if (cpu < nr_cpu_ids) - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); -} - -static int dom_data_init(struct rdt_resource *r) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - u32 num_closid = resctrl_arch_get_num_closid(r); - struct rmid_entry *entry = NULL; - int err = 0, i; - u32 idx; - - mutex_lock(&rdtgroup_mutex); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - u32 *tmp; - - /* - * If the architecture hasn't provided a sanitised value here, - * this may result in larger arrays than necessary. Resctrl will - * use a smaller system wide value based on the resources in - * use. - */ - tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); - if (!tmp) { - err = -ENOMEM; - goto out_unlock; - } - - closid_num_dirty_rmid = tmp; - } - - rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) { - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - err = -ENOMEM; - goto out_unlock; - } - - for (i = 0; i < idx_limit; i++) { - entry = &rmid_ptrs[i]; - INIT_LIST_HEAD(&entry->list); - - resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); - list_add_tail(&entry->list, &rmid_free_lru); - } - - /* - * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and - * are always allocated. These are used for the rdtgroup_default - * control group, which will be setup later in rdtgroup_init(). - */ - idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, - RESCTRL_RESERVED_RMID); - entry = __rmid_entry(idx); - list_del(&entry->list); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -static void __exit dom_data_exit(void) -{ - mutex_lock(&rdtgroup_mutex); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; + /* sanity check: Only valid results are 1, 2, 3, 4, 6 */ + switch (ret) { + case 1: + break; + case 2 ... 4: + case 6: + pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret); + rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE; + break; + default: + pr_warn("Ignore improbable SNC node count %d\n", ret); + ret = 1; + break; } - kfree(rmid_ptrs); - rmid_ptrs = NULL; - - mutex_unlock(&rdtgroup_mutex); -} - -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, -}; - -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, -}; - -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, -}; - -/* - * Initialize the event list for the resource. - * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. - */ -static void l3_mon_evt_init(struct rdt_resource *r) -{ - INIT_LIST_HEAD(&r->evt_list); - - if (is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); + return ret; } int __init rdt_get_mon_l3_config(struct rdt_resource *r) @@ -1008,11 +346,12 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int threshold; - int ret; + + snc_nodes_per_l3_cache = snc_get_config(); resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; - hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale; - r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; + r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) @@ -1036,39 +375,19 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) */ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); - ret = dom_data_init(r); - if (ret) - return ret; - if (rdt_cpu_has(X86_FEATURE_BMEC)) { u32 eax, ebx, ecx, edx; /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); - hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; - - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { - mbm_total_event.configurable = true; - mbm_config_rftype_init("mbm_total_bytes_config"); - } - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { - mbm_local_event.configurable = true; - mbm_config_rftype_init("mbm_local_bytes_config"); - } + r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } - l3_mon_evt_init(r); - r->mon_capable = true; return 0; } -void __exit rdt_put_mon_l3_config(void) -{ - dom_data_exit(); -} - void __init intel_rdt_mbm_apply_quirk(void) { int cf_index; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 884b88e25141..de580eca3363 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -11,27 +11,22 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/cacheinfo.h> +#include <linux/cacheflush.h> #include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/debugfs.h> -#include <linux/kthread.h> -#include <linux/mman.h> #include <linux/perf_event.h> #include <linux/pm_qos.h> -#include <linux/slab.h> -#include <linux/uaccess.h> +#include <linux/resctrl.h> -#include <asm/cacheflush.h> -#include <asm/intel-family.h> -#include <asm/resctrl.h> +#include <asm/cpu_device_id.h> #include <asm/perf_event.h> +#include <asm/msr.h> #include "../../events/perf_event.h" /* For X86_CONFIG() */ #include "internal.h" #define CREATE_TRACE_POINTS -#include "pseudo_lock_event.h" + +#include "pseudo_lock_trace.h" /* * The bits needed to disable hardware prefetching varies based on the @@ -39,30 +34,9 @@ */ static u64 prefetch_disable_bits; -/* - * Major number assigned to and shared by all devices exposing - * pseudo-locked regions. - */ -static unsigned int pseudo_lock_major; -static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); - -static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) -{ - const struct rdtgroup *rdtgrp; - - rdtgrp = dev_get_drvdata(dev); - if (mode) - *mode = 0600; - return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name); -} - -static const struct class pseudo_lock_class = { - .name = "pseudo_lock", - .devnode = pseudo_lock_devnode, -}; - /** - * get_prefetch_disable_bits - prefetch disable bits of supported platforms + * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported + * platforms * @void: It takes no parameters. * * Capture the list of platforms that have been validated to support @@ -76,20 +50,22 @@ static const struct class pseudo_lock_class = { * in the SDM. * * When adding a platform here also add support for its cache events to - * measure_cycles_perf_fn() + * resctrl_arch_measure_l*_residency() * * Return: * If platform is supported, the bits to disable hardware prefetchers, 0 * if platform is not supported. */ -static u64 get_prefetch_disable_bits(void) +u64 resctrl_arch_get_prefetch_disable_bits(void) { + prefetch_disable_bits = 0; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 != 6) return 0; - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_BROADWELL_X: + switch (boot_cpu_data.x86_vfm) { + case INTEL_BROADWELL_X: /* * SDM defines bits of MSR_MISC_FEATURE_CONTROL register * as: @@ -99,9 +75,10 @@ static u64 get_prefetch_disable_bits(void) * 3 DCU IP Prefetcher Disable (R/W) * 63:4 Reserved */ - return 0xF; - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + prefetch_disable_bits = 0xF; + break; + case INTEL_ATOM_GOLDMONT: + case INTEL_ATOM_GOLDMONT_PLUS: /* * SDM defines bits of MSR_MISC_FEATURE_CONTROL register * as: @@ -110,308 +87,16 @@ static u64 get_prefetch_disable_bits(void) * 2 DCU Hardware Prefetcher Disable (R/W) * 63:3 Reserved */ - return 0x5; - } - - return 0; -} - -/** - * pseudo_lock_minor_get - Obtain available minor number - * @minor: Pointer to where new minor number will be stored - * - * A bitmask is used to track available minor numbers. Here the next free - * minor number is marked as unavailable and returned. - * - * Return: 0 on success, <0 on failure. - */ -static int pseudo_lock_minor_get(unsigned int *minor) -{ - unsigned long first_bit; - - first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); - - if (first_bit == MINORBITS) - return -ENOSPC; - - __clear_bit(first_bit, &pseudo_lock_minor_avail); - *minor = first_bit; - - return 0; -} - -/** - * pseudo_lock_minor_release - Return minor number to available - * @minor: The minor number made available - */ -static void pseudo_lock_minor_release(unsigned int minor) -{ - __set_bit(minor, &pseudo_lock_minor_avail); -} - -/** - * region_find_by_minor - Locate a pseudo-lock region by inode minor number - * @minor: The minor number of the device representing pseudo-locked region - * - * When the character device is accessed we need to determine which - * pseudo-locked region it belongs to. This is done by matching the minor - * number of the device to the pseudo-locked region it belongs. - * - * Minor numbers are assigned at the time a pseudo-locked region is associated - * with a cache instance. - * - * Return: On success return pointer to resource group owning the pseudo-locked - * region, NULL on failure. - */ -static struct rdtgroup *region_find_by_minor(unsigned int minor) -{ - struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; - - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (rdtgrp->plr && rdtgrp->plr->minor == minor) { - rdtgrp_match = rdtgrp; - break; - } - } - return rdtgrp_match; -} - -/** - * struct pseudo_lock_pm_req - A power management QoS request list entry - * @list: Entry within the @pm_reqs list for a pseudo-locked region - * @req: PM QoS request - */ -struct pseudo_lock_pm_req { - struct list_head list; - struct dev_pm_qos_request req; -}; - -static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) -{ - struct pseudo_lock_pm_req *pm_req, *next; - - list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { - dev_pm_qos_remove_request(&pm_req->req); - list_del(&pm_req->list); - kfree(pm_req); - } -} - -/** - * pseudo_lock_cstates_constrain - Restrict cores from entering C6 - * @plr: Pseudo-locked region - * - * To prevent the cache from being affected by power management entering - * C6 has to be avoided. This is accomplished by requesting a latency - * requirement lower than lowest C6 exit latency of all supported - * platforms as found in the cpuidle state tables in the intel_idle driver. - * At this time it is possible to do so with a single latency requirement - * for all supported platforms. - * - * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, - * the ACPI latencies need to be considered while keeping in mind that C2 - * may be set to map to deeper sleep states. In this case the latency - * requirement needs to prevent entering C2 also. - * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) -{ - struct pseudo_lock_pm_req *pm_req; - int cpu; - int ret; - - for_each_cpu(cpu, &plr->d->cpu_mask) { - pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); - if (!pm_req) { - rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); - ret = -ENOMEM; - goto out_err; - } - ret = dev_pm_qos_add_request(get_cpu_device(cpu), - &pm_req->req, - DEV_PM_QOS_RESUME_LATENCY, - 30); - if (ret < 0) { - rdt_last_cmd_printf("Failed to add latency req CPU%d\n", - cpu); - kfree(pm_req); - ret = -1; - goto out_err; - } - list_add(&pm_req->list, &plr->pm_reqs); - } - - return 0; - -out_err: - pseudo_lock_cstates_relax(plr); - return ret; -} - -/** - * pseudo_lock_region_clear - Reset pseudo-lock region data - * @plr: pseudo-lock region - * - * All content of the pseudo-locked region is reset - any memory allocated - * freed. - * - * Return: void - */ -static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) -{ - plr->size = 0; - plr->line_size = 0; - kfree(plr->kmem); - plr->kmem = NULL; - plr->s = NULL; - if (plr->d) - plr->d->plr = NULL; - plr->d = NULL; - plr->cbm = 0; - plr->debugfs_dir = NULL; -} - -/** - * pseudo_lock_region_init - Initialize pseudo-lock region information - * @plr: pseudo-lock region - * - * Called after user provided a schemata to be pseudo-locked. From the - * schemata the &struct pseudo_lock_region is on entry already initialized - * with the resource, domain, and capacity bitmask. Here the information - * required for pseudo-locking is deduced from this data and &struct - * pseudo_lock_region initialized further. This information includes: - * - size in bytes of the region to be pseudo-locked - * - cache line size to know the stride with which data needs to be accessed - * to be pseudo-locked - * - a cpu associated with the cache instance on which the pseudo-locking - * flow can be executed - * - * Return: 0 on success, <0 on failure. Descriptive error will be written - * to last_cmd_status buffer. - */ -static int pseudo_lock_region_init(struct pseudo_lock_region *plr) -{ - struct cpu_cacheinfo *ci; - int ret; - int i; - - /* Pick the first cpu we find that is associated with the cache. */ - plr->cpu = cpumask_first(&plr->d->cpu_mask); - - if (!cpu_online(plr->cpu)) { - rdt_last_cmd_printf("CPU %u associated with cache not online\n", - plr->cpu); - ret = -ENODEV; - goto out_region; - } - - ci = get_cpu_cacheinfo(plr->cpu); - - plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); - - for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == plr->s->res->cache_level) { - plr->line_size = ci->info_list[i].coherency_line_size; - return 0; - } - } - - ret = -1; - rdt_last_cmd_puts("Unable to determine cache line size\n"); -out_region: - pseudo_lock_region_clear(plr); - return ret; -} - -/** - * pseudo_lock_init - Initialize a pseudo-lock region - * @rdtgrp: resource group to which new pseudo-locked region will belong - * - * A pseudo-locked region is associated with a resource group. When this - * association is created the pseudo-locked region is initialized. The - * details of the pseudo-locked region are not known at this time so only - * allocation is done and association established. - * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_init(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr; - - plr = kzalloc(sizeof(*plr), GFP_KERNEL); - if (!plr) - return -ENOMEM; - - init_waitqueue_head(&plr->lock_thread_wq); - INIT_LIST_HEAD(&plr->pm_reqs); - rdtgrp->plr = plr; - return 0; -} - -/** - * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked - * @plr: pseudo-lock region - * - * Initialize the details required to set up the pseudo-locked region and - * allocate the contiguous memory that will be pseudo-locked to the cache. - * - * Return: 0 on success, <0 on failure. Descriptive error will be written - * to last_cmd_status buffer. - */ -static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) -{ - int ret; - - ret = pseudo_lock_region_init(plr); - if (ret < 0) - return ret; - - /* - * We do not yet support contiguous regions larger than - * KMALLOC_MAX_SIZE. - */ - if (plr->size > KMALLOC_MAX_SIZE) { - rdt_last_cmd_puts("Requested region exceeds maximum size\n"); - ret = -E2BIG; - goto out_region; - } - - plr->kmem = kzalloc(plr->size, GFP_KERNEL); - if (!plr->kmem) { - rdt_last_cmd_puts("Unable to allocate memory\n"); - ret = -ENOMEM; - goto out_region; + prefetch_disable_bits = 0x5; + break; } - ret = 0; - goto out; -out_region: - pseudo_lock_region_clear(plr); -out: - return ret; -} - -/** - * pseudo_lock_free - Free a pseudo-locked region - * @rdtgrp: resource group to which pseudo-locked region belonged - * - * The pseudo-locked region's resources have already been released, or not - * yet created at this point. Now it can be freed and disassociated from the - * resource group. - * - * Return: void - */ -static void pseudo_lock_free(struct rdtgroup *rdtgrp) -{ - pseudo_lock_region_clear(rdtgrp->plr); - kfree(rdtgrp->plr); - rdtgrp->plr = NULL; + return prefetch_disable_bits; } /** - * pseudo_lock_fn - Load kernel memory into cache - * @_rdtgrp: resource group to which pseudo-lock region belongs + * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache + * @_plr: the pseudo-lock region descriptor * * This is the core pseudo-locking flow. * @@ -428,10 +113,9 @@ static void pseudo_lock_free(struct rdtgroup *rdtgrp) * * Return: 0. Waiter on waitqueue will be woken on completion. */ -static int pseudo_lock_fn(void *_rdtgrp) +int resctrl_arch_pseudo_lock_fn(void *_plr) { - struct rdtgroup *rdtgrp = _rdtgrp; - struct pseudo_lock_region *plr = rdtgrp->plr; + struct pseudo_lock_region *plr = _plr; u32 rmid_p, closid_p; unsigned long i; u64 saved_msr; @@ -461,7 +145,7 @@ static int pseudo_lock_fn(void *_rdtgrp) * increase likelihood that allocated cache portion will be filled * with associated memory. */ - native_wbinvd(); + wbinvd(); /* * Always called with interrupts enabled. By disabling interrupts @@ -478,8 +162,8 @@ static int pseudo_lock_fn(void *_rdtgrp) * the buffer and evict pseudo-locked memory read earlier from the * cache. */ - saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); - __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL); + native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); closid_p = this_cpu_read(pqr_state.cur_closid); rmid_p = this_cpu_read(pqr_state.cur_rmid); mem_r = plr->kmem; @@ -491,7 +175,8 @@ static int pseudo_lock_fn(void *_rdtgrp) * pseudo-locked followed by reading of kernel memory to load it * into the cache. */ - __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, rdtgrp->closid); + native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid); + /* * Cache was flushed earlier. Now access kernel memory to read it * into cache region associated with just activated plr->closid. @@ -527,10 +212,10 @@ static int pseudo_lock_fn(void *_rdtgrp) * Critical section end: restore closid with capacity bitmask that * does not overlap with pseudo-locked region. */ - __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); + native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); /* Re-enable the hardware prefetcher(s) */ - wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); + wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr); local_irq_enable(); plr->thread_done = 1; @@ -539,342 +224,8 @@ static int pseudo_lock_fn(void *_rdtgrp) } /** - * rdtgroup_monitor_in_progress - Test if monitoring in progress - * @rdtgrp: resource group being queried - * - * Return: 1 if monitor groups have been created for this resource - * group, 0 otherwise. - */ -static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) -{ - return !list_empty(&rdtgrp->mon.crdtgrp_list); -} - -/** - * rdtgroup_locksetup_user_restrict - Restrict user access to group - * @rdtgrp: resource group needing access restricted - * - * A resource group used for cache pseudo-locking cannot have cpus or tasks - * assigned to it. This is communicated to the user by restricting access - * to all the files that can be used to make such changes. - * - * Permissions restored with rdtgroup_locksetup_user_restore() - * - * Return: 0 on success, <0 on failure. If a failure occurs during the - * restriction of access an attempt will be made to restore permissions but - * the state of the mode of these files will be uncertain when a failure - * occurs. - */ -static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) -{ - int ret; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); - if (ret) - return ret; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); - if (ret) - goto err_tasks; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); - if (ret) - goto err_cpus; - - if (resctrl_arch_mon_capable()) { - ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); - if (ret) - goto err_cpus_list; - } - - ret = 0; - goto out; - -err_cpus_list: - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); -err_cpus: - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); -err_tasks: - rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); -out: - return ret; -} - -/** - * rdtgroup_locksetup_user_restore - Restore user access to group - * @rdtgrp: resource group needing access restored - * - * Restore all file access previously removed using - * rdtgroup_locksetup_user_restrict() - * - * Return: 0 on success, <0 on failure. If a failure occurs during the - * restoration of access an attempt will be made to restrict permissions - * again but the state of the mode of these files will be uncertain when - * a failure occurs. - */ -static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) -{ - int ret; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); - if (ret) - return ret; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); - if (ret) - goto err_tasks; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); - if (ret) - goto err_cpus; - - if (resctrl_arch_mon_capable()) { - ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); - if (ret) - goto err_cpus_list; - } - - ret = 0; - goto out; - -err_cpus_list: - rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); -err_cpus: - rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); -err_tasks: - rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); -out: - return ret; -} - -/** - * rdtgroup_locksetup_enter - Resource group enters locksetup mode - * @rdtgrp: resource group requested to enter locksetup mode - * - * A resource group enters locksetup mode to reflect that it would be used - * to represent a pseudo-locked region and is in the process of being set - * up to do so. A resource group used for a pseudo-locked region would - * lose the closid associated with it so we cannot allow it to have any - * tasks or cpus assigned nor permit tasks or cpus to be assigned in the - * future. Monitoring of a pseudo-locked region is not allowed either. - * - * The above and more restrictions on a pseudo-locked region are checked - * for and enforced before the resource group enters the locksetup mode. - * - * Returns: 0 if the resource group successfully entered locksetup mode, <0 - * on failure. On failure the last_cmd_status buffer is updated with text to - * communicate details of failure to the user. - */ -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) -{ - int ret; - - /* - * The default resource group can neither be removed nor lose the - * default closid associated with it. - */ - if (rdtgrp == &rdtgroup_default) { - rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); - return -EINVAL; - } - - /* - * Cache Pseudo-locking not supported when CDP is enabled. - * - * Some things to consider if you would like to enable this - * support (using L3 CDP as example): - * - When CDP is enabled two separate resources are exposed, - * L3DATA and L3CODE, but they are actually on the same cache. - * The implication for pseudo-locking is that if a - * pseudo-locked region is created on a domain of one - * resource (eg. L3CODE), then a pseudo-locked region cannot - * be created on that same domain of the other resource - * (eg. L3DATA). This is because the creation of a - * pseudo-locked region involves a call to wbinvd that will - * affect all cache allocations on particular domain. - * - Considering the previous, it may be possible to only - * expose one of the CDP resources to pseudo-locking and - * hide the other. For example, we could consider to only - * expose L3DATA and since the L3 cache is unified it is - * still possible to place instructions there are execute it. - * - If only one region is exposed to pseudo-locking we should - * still keep in mind that availability of a portion of cache - * for pseudo-locking should take into account both resources. - * Similarly, if a pseudo-locked region is created in one - * resource, the portion of cache used by it should be made - * unavailable to all future allocations from both resources. - */ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || - resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { - rdt_last_cmd_puts("CDP enabled\n"); - return -EINVAL; - } - - /* - * Not knowing the bits to disable prefetching implies that this - * platform does not support Cache Pseudo-Locking. - */ - prefetch_disable_bits = get_prefetch_disable_bits(); - if (prefetch_disable_bits == 0) { - rdt_last_cmd_puts("Pseudo-locking not supported\n"); - return -EINVAL; - } - - if (rdtgroup_monitor_in_progress(rdtgrp)) { - rdt_last_cmd_puts("Monitoring in progress\n"); - return -EINVAL; - } - - if (rdtgroup_tasks_assigned(rdtgrp)) { - rdt_last_cmd_puts("Tasks assigned to resource group\n"); - return -EINVAL; - } - - if (!cpumask_empty(&rdtgrp->cpu_mask)) { - rdt_last_cmd_puts("CPUs assigned to resource group\n"); - return -EINVAL; - } - - if (rdtgroup_locksetup_user_restrict(rdtgrp)) { - rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); - return -EIO; - } - - ret = pseudo_lock_init(rdtgrp); - if (ret) { - rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); - goto out_release; - } - - /* - * If this system is capable of monitoring a rmid would have been - * allocated when the control group was created. This is not needed - * anymore when this group would be used for pseudo-locking. This - * is safe to call on platforms not capable of monitoring. - */ - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - ret = 0; - goto out; - -out_release: - rdtgroup_locksetup_user_restore(rdtgrp); -out: - return ret; -} - -/** - * rdtgroup_locksetup_exit - resource group exist locksetup mode - * @rdtgrp: resource group - * - * When a resource group exits locksetup mode the earlier restrictions are - * lifted. - * - * Return: 0 on success, <0 on failure - */ -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) -{ - int ret; - - if (resctrl_arch_mon_capable()) { - ret = alloc_rmid(rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - return ret; - } - rdtgrp->mon.rmid = ret; - } - - ret = rdtgroup_locksetup_user_restore(rdtgrp); - if (ret) { - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - return ret; - } - - pseudo_lock_free(rdtgrp); - return 0; -} - -/** - * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked - * @d: RDT domain - * @cbm: CBM to test - * - * @d represents a cache instance and @cbm a capacity bitmask that is - * considered for it. Determine if @cbm overlaps with any existing - * pseudo-locked region on @d. - * - * @cbm is unsigned long, even if only 32 bits are used, to make the - * bitmap functions work correctly. - * - * Return: true if @cbm overlaps with pseudo-locked region on @d, false - * otherwise. - */ -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm) -{ - unsigned int cbm_len; - unsigned long cbm_b; - - if (d->plr) { - cbm_len = d->plr->s->res->cache.cbm_len; - cbm_b = d->plr->cbm; - if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) - return true; - } - return false; -} - -/** - * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy - * @d: RDT domain under test - * - * The setup of a pseudo-locked region affects all cache instances within - * the hierarchy of the region. It is thus essential to know if any - * pseudo-locked regions exist within a cache hierarchy to prevent any - * attempts to create new pseudo-locked regions in the same hierarchy. - * - * Return: true if a pseudo-locked region exists in the hierarchy of @d or - * if it is not possible to test due to memory allocation issue, - * false otherwise. - */ -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) -{ - cpumask_var_t cpu_with_psl; - struct rdt_resource *r; - struct rdt_domain *d_i; - bool ret = false; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) - return true; - - /* - * First determine which cpus have pseudo-locked regions - * associated with them. - */ - for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(d_i, &r->domains, list) { - if (d_i->plr) - cpumask_or(cpu_with_psl, cpu_with_psl, - &d_i->cpu_mask); - } - } - - /* - * Next test if new pseudo-locked region would intersect with - * existing region. - */ - if (cpumask_intersects(&d->cpu_mask, cpu_with_psl)) - ret = true; - - free_cpumask_var(cpu_with_psl); - return ret; -} - -/** - * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory + * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read + * pseudo-locked memory * @_plr: pseudo-lock region to measure * * There is no deterministic way to test if a memory region is cached. One @@ -887,7 +238,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d) * * Return: 0. Waiter on waitqueue will be woken on completion. */ -static int measure_cycles_lat_fn(void *_plr) +int resctrl_arch_measure_cycles_lat_fn(void *_plr) { struct pseudo_lock_region *plr = _plr; u32 saved_low, saved_high; @@ -900,7 +251,7 @@ static int measure_cycles_lat_fn(void *_plr) * Disable hardware prefetchers. */ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); - wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); mem_r = READ_ONCE(plr->kmem); /* * Dummy execute of the time measurement to load the needed @@ -996,7 +347,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * Disable hardware prefetchers. */ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); - wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); /* Initialize rest of local variables */ /* @@ -1014,8 +365,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * used in L1 cache, second to capture accurate value that does not * include cache misses incurred because of instruction loads. */ - rdpmcl(hit_pmcnum, hits_before); - rdpmcl(miss_pmcnum, miss_before); + hits_before = rdpmc(hit_pmcnum); + miss_before = rdpmc(miss_pmcnum); /* * From SDM: Performing back-to-back fast reads are not guaranteed * to be monotonic. @@ -1023,8 +374,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * before proceeding. */ rmb(); - rdpmcl(hit_pmcnum, hits_before); - rdpmcl(miss_pmcnum, miss_before); + hits_before = rdpmc(hit_pmcnum); + miss_before = rdpmc(miss_pmcnum); /* * Use LFENCE to ensure all previous instructions are retired * before proceeding. @@ -1046,8 +397,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * before proceeding. */ rmb(); - rdpmcl(hit_pmcnum, hits_after); - rdpmcl(miss_pmcnum, miss_after); + hits_after = rdpmc(hit_pmcnum); + miss_after = rdpmc(miss_pmcnum); /* * Use LFENCE to ensure all previous instructions are retired * before proceeding. @@ -1071,7 +422,7 @@ out: return 0; } -static int measure_l2_residency(void *_plr) +int resctrl_arch_measure_l2_residency(void *_plr) { struct pseudo_lock_region *plr = _plr; struct residency_counts counts = {0}; @@ -1084,9 +435,9 @@ static int measure_l2_residency(void *_plr) * L2_HIT 02H * L2_MISS 10H */ - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_ATOM_GOLDMONT: - case INTEL_FAM6_ATOM_GOLDMONT_PLUS: + switch (boot_cpu_data.x86_vfm) { + case INTEL_ATOM_GOLDMONT: + case INTEL_ATOM_GOLDMONT_PLUS: perf_miss_attr.config = X86_CONFIG(.event = 0xd1, .umask = 0x10); perf_hit_attr.config = X86_CONFIG(.event = 0xd1, @@ -1109,7 +460,7 @@ out: return 0; } -static int measure_l3_residency(void *_plr) +int resctrl_arch_measure_l3_residency(void *_plr) { struct pseudo_lock_region *plr = _plr; struct residency_counts counts = {0}; @@ -1123,8 +474,8 @@ static int measure_l3_residency(void *_plr) * MISS 41H */ - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_BROADWELL_X: + switch (boot_cpu_data.x86_vfm) { + case INTEL_BROADWELL_X: /* On BDW the hit event counts references, not hits */ perf_hit_attr.config = X86_CONFIG(.event = 0x2e, .umask = 0x4f); @@ -1142,7 +493,7 @@ static int measure_l3_residency(void *_plr) */ counts.miss_after -= counts.miss_before; - if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) { + if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) { /* * On BDW references and misses are counted, need to adjust. * Sometimes the "hits" counter is a bit more than the @@ -1164,441 +515,3 @@ out: wake_up_interruptible(&plr->lock_thread_wq); return 0; } - -/** - * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region - * @rdtgrp: Resource group to which the pseudo-locked region belongs. - * @sel: Selector of which measurement to perform on a pseudo-locked region. - * - * The measurement of latency to access a pseudo-locked region should be - * done from a cpu that is associated with that pseudo-locked region. - * Determine which cpu is associated with this region and start a thread on - * that cpu to perform the measurement, wait for that thread to complete. - * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - struct task_struct *thread; - unsigned int cpu; - int ret = -1; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - if (rdtgrp->flags & RDT_DELETED) { - ret = -ENODEV; - goto out; - } - - if (!plr->d) { - ret = -ENODEV; - goto out; - } - - plr->thread_done = 0; - cpu = cpumask_first(&plr->d->cpu_mask); - if (!cpu_online(cpu)) { - ret = -ENODEV; - goto out; - } - - plr->cpu = cpu; - - if (sel == 1) - thread = kthread_create_on_node(measure_cycles_lat_fn, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", - cpu); - else if (sel == 2) - thread = kthread_create_on_node(measure_l2_residency, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", - cpu); - else if (sel == 3) - thread = kthread_create_on_node(measure_l3_residency, plr, - cpu_to_node(cpu), - "pseudo_lock_measure/%u", - cpu); - else - goto out; - - if (IS_ERR(thread)) { - ret = PTR_ERR(thread); - goto out; - } - kthread_bind(thread, cpu); - wake_up_process(thread); - - ret = wait_event_interruptible(plr->lock_thread_wq, - plr->thread_done == 1); - if (ret < 0) - goto out; - - ret = 0; - -out: - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return ret; -} - -static ssize_t pseudo_lock_measure_trigger(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct rdtgroup *rdtgrp = file->private_data; - size_t buf_size; - char buf[32]; - int ret; - int sel; - - buf_size = min(count, (sizeof(buf) - 1)); - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - ret = kstrtoint(buf, 10, &sel); - if (ret == 0) { - if (sel != 1 && sel != 2 && sel != 3) - return -EINVAL; - ret = debugfs_file_get(file->f_path.dentry); - if (ret) - return ret; - ret = pseudo_lock_measure_cycles(rdtgrp, sel); - if (ret == 0) - ret = count; - debugfs_file_put(file->f_path.dentry); - } - - return ret; -} - -static const struct file_operations pseudo_measure_fops = { - .write = pseudo_lock_measure_trigger, - .open = simple_open, - .llseek = default_llseek, -}; - -/** - * rdtgroup_pseudo_lock_create - Create a pseudo-locked region - * @rdtgrp: resource group to which pseudo-lock region belongs - * - * Called when a resource group in the pseudo-locksetup mode receives a - * valid schemata that should be pseudo-locked. Since the resource group is - * in pseudo-locksetup mode the &struct pseudo_lock_region has already been - * allocated and initialized with the essential information. If a failure - * occurs the resource group remains in the pseudo-locksetup mode with the - * &struct pseudo_lock_region associated with it, but cleared from all - * information and ready for the user to re-attempt pseudo-locking by - * writing the schemata again. - * - * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 - * on failure. Descriptive error will be written to last_cmd_status buffer. - */ -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - struct task_struct *thread; - unsigned int new_minor; - struct device *dev; - int ret; - - ret = pseudo_lock_region_alloc(plr); - if (ret < 0) - return ret; - - ret = pseudo_lock_cstates_constrain(plr); - if (ret < 0) { - ret = -EINVAL; - goto out_region; - } - - plr->thread_done = 0; - - thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp, - cpu_to_node(plr->cpu), - "pseudo_lock/%u", plr->cpu); - if (IS_ERR(thread)) { - ret = PTR_ERR(thread); - rdt_last_cmd_printf("Locking thread returned error %d\n", ret); - goto out_cstates; - } - - kthread_bind(thread, plr->cpu); - wake_up_process(thread); - - ret = wait_event_interruptible(plr->lock_thread_wq, - plr->thread_done == 1); - if (ret < 0) { - /* - * If the thread does not get on the CPU for whatever - * reason and the process which sets up the region is - * interrupted then this will leave the thread in runnable - * state and once it gets on the CPU it will dereference - * the cleared, but not freed, plr struct resulting in an - * empty pseudo-locking loop. - */ - rdt_last_cmd_puts("Locking thread interrupted\n"); - goto out_cstates; - } - - ret = pseudo_lock_minor_get(&new_minor); - if (ret < 0) { - rdt_last_cmd_puts("Unable to obtain a new minor number\n"); - goto out_cstates; - } - - /* - * Unlock access but do not release the reference. The - * pseudo-locked region will still be here on return. - * - * The mutex has to be released temporarily to avoid a potential - * deadlock with the mm->mmap_lock which is obtained in the - * device_create() and debugfs_create_dir() callpath below as well as - * before the mmap() callback is called. - */ - mutex_unlock(&rdtgroup_mutex); - - if (!IS_ERR_OR_NULL(debugfs_resctrl)) { - plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name, - debugfs_resctrl); - if (!IS_ERR_OR_NULL(plr->debugfs_dir)) - debugfs_create_file("pseudo_lock_measure", 0200, - plr->debugfs_dir, rdtgrp, - &pseudo_measure_fops); - } - - dev = device_create(&pseudo_lock_class, NULL, - MKDEV(pseudo_lock_major, new_minor), - rdtgrp, "%s", rdtgrp->kn->name); - - mutex_lock(&rdtgroup_mutex); - - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - rdt_last_cmd_printf("Failed to create character device: %d\n", - ret); - goto out_debugfs; - } - - /* We released the mutex - check if group was removed while we did so */ - if (rdtgrp->flags & RDT_DELETED) { - ret = -ENODEV; - goto out_device; - } - - plr->minor = new_minor; - - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; - closid_free(rdtgrp->closid); - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); - - ret = 0; - goto out; - -out_device: - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); -out_debugfs: - debugfs_remove_recursive(plr->debugfs_dir); - pseudo_lock_minor_release(new_minor); -out_cstates: - pseudo_lock_cstates_relax(plr); -out_region: - pseudo_lock_region_clear(plr); -out: - return ret; -} - -/** - * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region - * @rdtgrp: resource group to which the pseudo-locked region belongs - * - * The removal of a pseudo-locked region can be initiated when the resource - * group is removed from user space via a "rmdir" from userspace or the - * unmount of the resctrl filesystem. On removal the resource group does - * not go back to pseudo-locksetup mode before it is removed, instead it is - * removed directly. There is thus asymmetry with the creation where the - * &struct pseudo_lock_region is removed here while it was not created in - * rdtgroup_pseudo_lock_create(). - * - * Return: void - */ -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - /* - * Default group cannot be a pseudo-locked region so we can - * free closid here. - */ - closid_free(rdtgrp->closid); - goto free; - } - - pseudo_lock_cstates_relax(plr); - debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); - pseudo_lock_minor_release(plr->minor); - -free: - pseudo_lock_free(rdtgrp); -} - -static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) -{ - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - - rdtgrp = region_find_by_minor(iminor(inode)); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - filp->private_data = rdtgrp; - atomic_inc(&rdtgrp->waitcount); - /* Perform a non-seekable open - llseek is not supported */ - filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); - - mutex_unlock(&rdtgroup_mutex); - - return 0; -} - -static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) -{ - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - rdtgrp = filp->private_data; - WARN_ON(!rdtgrp); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - filp->private_data = NULL; - atomic_dec(&rdtgrp->waitcount); - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static int pseudo_lock_dev_mremap(struct vm_area_struct *area) -{ - /* Not supported */ - return -EINVAL; -} - -static const struct vm_operations_struct pseudo_mmap_ops = { - .mremap = pseudo_lock_dev_mremap, -}; - -static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) -{ - unsigned long vsize = vma->vm_end - vma->vm_start; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; - struct pseudo_lock_region *plr; - struct rdtgroup *rdtgrp; - unsigned long physical; - unsigned long psize; - - mutex_lock(&rdtgroup_mutex); - - rdtgrp = filp->private_data; - WARN_ON(!rdtgrp); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - plr = rdtgrp->plr; - - if (!plr->d) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - /* - * Task is required to run with affinity to the cpus associated - * with the pseudo-locked region. If this is not the case the task - * may be scheduled elsewhere and invalidate entries in the - * pseudo-locked region. - */ - if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) { - mutex_unlock(&rdtgroup_mutex); - return -EINVAL; - } - - physical = __pa(plr->kmem) >> PAGE_SHIFT; - psize = plr->size - off; - - if (off > plr->size) { - mutex_unlock(&rdtgroup_mutex); - return -ENOSPC; - } - - /* - * Ensure changes are carried directly to the memory being mapped, - * do not allow copy-on-write mapping. - */ - if (!(vma->vm_flags & VM_SHARED)) { - mutex_unlock(&rdtgroup_mutex); - return -EINVAL; - } - - if (vsize > psize) { - mutex_unlock(&rdtgroup_mutex); - return -ENOSPC; - } - - memset(plr->kmem + off, 0, vsize); - - if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, - vsize, vma->vm_page_prot)) { - mutex_unlock(&rdtgroup_mutex); - return -EAGAIN; - } - vma->vm_ops = &pseudo_mmap_ops; - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static const struct file_operations pseudo_lock_dev_fops = { - .owner = THIS_MODULE, - .llseek = no_llseek, - .read = NULL, - .write = NULL, - .open = pseudo_lock_dev_open, - .release = pseudo_lock_dev_release, - .mmap = pseudo_lock_dev_mmap, -}; - -int rdt_pseudo_lock_init(void) -{ - int ret; - - ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); - if (ret < 0) - return ret; - - pseudo_lock_major = ret; - - ret = class_register(&pseudo_lock_class); - if (ret) { - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); - return ret; - } - - return 0; -} - -void rdt_pseudo_lock_release(void) -{ - class_unregister(&pseudo_lock_class); - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); - pseudo_lock_major = 0; -} diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h index 428ebbd4270b..7c8aef08010f 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h @@ -2,8 +2,8 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM resctrl -#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_PSEUDO_LOCK_H +#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H #include <linux/tracepoint.h> @@ -35,9 +35,11 @@ TRACE_EVENT(pseudo_lock_l3, TP_printk("hits=%llu miss=%llu", __entry->l3_hits, __entry->l3_miss)); -#endif /* _TRACE_PSEUDO_LOCK_H */ +#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE pseudo_lock_event + +#define TRACE_INCLUDE_FILE pseudo_lock_trace + #include <trace/define_trace.h> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 011e17efb1a6..885026468440 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -12,13 +12,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/cacheinfo.h> #include <linux/cpu.h> #include <linux/debugfs.h> #include <linux/fs.h> #include <linux/fs_parser.h> #include <linux/sysfs.h> #include <linux/kernfs.h> +#include <linux/resctrl.h> #include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> @@ -29,324 +29,28 @@ #include <uapi/linux/magic.h> -#include <asm/resctrl.h> +#include <asm/msr.h> #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); -DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); - -/* Mutex to protect rdtgroup access. */ -DEFINE_MUTEX(rdtgroup_mutex); - -static struct kernfs_root *rdt_root; -struct rdtgroup rdtgroup_default; -LIST_HEAD(rdt_all_groups); - -/* list of entries for the schemata file */ -LIST_HEAD(resctrl_schema_all); - -/* The filesystem can only be mounted once. */ -bool resctrl_mounted; - -/* Kernel fs node for "info" directory under root */ -static struct kernfs_node *kn_info; - -/* Kernel fs node for "mon_groups" directory under root */ -static struct kernfs_node *kn_mongrp; - -/* Kernel fs node for "mon_data" directory under root */ -static struct kernfs_node *kn_mondata; - -static struct seq_buf last_cmd_status; -static char last_cmd_status_buf[512]; - -static int rdtgroup_setup_root(struct rdt_fs_context *ctx); -static void rdtgroup_destroy_root(void); - -struct dentry *debugfs_resctrl; - -static bool resctrl_debug; - -void rdt_last_cmd_clear(void) -{ - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_clear(&last_cmd_status); -} - -void rdt_last_cmd_puts(const char *s) -{ - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_puts(&last_cmd_status, s); -} - -void rdt_last_cmd_printf(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_vprintf(&last_cmd_status, fmt, ap); - va_end(ap); -} - -void rdt_staged_configs_clear(void) -{ - struct rdt_resource *r; - struct rdt_domain *dom; - - lockdep_assert_held(&rdtgroup_mutex); - - for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(dom, &r->domains, list) - memset(dom->staged_config, 0, sizeof(dom->staged_config)); - } -} - -/* - * Trivial allocator for CLOSIDs. Since h/w only supports a small number, - * we can keep a bitmap of free CLOSIDs in a single integer. - * - * Using a global CLOSID across all resources has some advantages and - * some drawbacks: - * + We can simply set current's closid to assign a task to a resource - * group. - * + Context switch code can avoid extra memory references deciding which - * CLOSID to load into the PQR_ASSOC MSR - * - We give up some options in configuring resource groups across multi-socket - * systems. - * - Our choices on how to configure each resource become progressively more - * limited as the number of resources grows. - */ -static unsigned long closid_free_map; -static int closid_free_map_len; - -int closids_supported(void) -{ - return closid_free_map_len; -} - -static void closid_init(void) -{ - struct resctrl_schema *s; - u32 rdt_min_closid = 32; - - /* Compute rdt_min_closid across all resources */ - list_for_each_entry(s, &resctrl_schema_all, list) - rdt_min_closid = min(rdt_min_closid, s->num_closid); - - closid_free_map = BIT_MASK(rdt_min_closid) - 1; - - /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ - __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); - closid_free_map_len = rdt_min_closid; -} - -static int closid_alloc(void) -{ - int cleanest_closid; - u32 closid; - - lockdep_assert_held(&rdtgroup_mutex); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - cleanest_closid = resctrl_find_cleanest_closid(); - if (cleanest_closid < 0) - return cleanest_closid; - closid = cleanest_closid; - } else { - closid = ffs(closid_free_map); - if (closid == 0) - return -ENOSPC; - closid--; - } - __clear_bit(closid, &closid_free_map); - - return closid; -} - -void closid_free(int closid) -{ - lockdep_assert_held(&rdtgroup_mutex); - - __set_bit(closid, &closid_free_map); -} - -/** - * closid_allocated - test if provided closid is in use - * @closid: closid to be tested - * - * Return: true if @closid is currently associated with a resource group, - * false if @closid is free - */ -bool closid_allocated(unsigned int closid) -{ - lockdep_assert_held(&rdtgroup_mutex); - - return !test_bit(closid, &closid_free_map); -} - -/** - * rdtgroup_mode_by_closid - Return mode of resource group with closid - * @closid: closid if the resource group - * - * Each resource group is associated with a @closid. Here the mode - * of a resource group can be queried by searching for it using its closid. - * - * Return: mode as &enum rdtgrp_mode of resource group with closid @closid - */ -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) -{ - struct rdtgroup *rdtgrp; - - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (rdtgrp->closid == closid) - return rdtgrp->mode; - } - - return RDT_NUM_MODES; -} - -static const char * const rdt_mode_str[] = { - [RDT_MODE_SHAREABLE] = "shareable", - [RDT_MODE_EXCLUSIVE] = "exclusive", - [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", - [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", -}; - -/** - * rdtgroup_mode_str - Return the string representation of mode - * @mode: the resource group mode as &enum rdtgroup_mode - * - * Return: string representation of valid mode, "unknown" otherwise - */ -static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) -{ - if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) - return "unknown"; - - return rdt_mode_str[mode]; -} - -/* set uid and gid of rdtgroup dirs and files to that of the creator */ -static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) -{ - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = current_fsuid(), - .ia_gid = current_fsgid(), }; - - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) - return 0; - - return kernfs_setattr(kn, &iattr); -} -static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) -{ - struct kernfs_node *kn; - int ret; - - kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, - 0, rft->kf_ops, rft, NULL, NULL); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - return 0; -} - -static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct kernfs_open_file *of = m->private; - struct rftype *rft = of->kn->priv; - - if (rft->seq_show) - return rft->seq_show(of, m, arg); - return 0; -} - -static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) -{ - struct rftype *rft = of->kn->priv; - - if (rft->write) - return rft->write(of, buf, nbytes, off); - - return -EINVAL; -} - -static const struct kernfs_ops rdtgroup_kf_single_ops = { - .atomic_write_len = PAGE_SIZE, - .write = rdtgroup_file_write, - .seq_show = rdtgroup_seqfile_show, -}; - -static const struct kernfs_ops kf_mondata_ops = { - .atomic_write_len = PAGE_SIZE, - .seq_show = rdtgroup_mondata_show, -}; - -static bool is_cpu_list(struct kernfs_open_file *of) -{ - struct rftype *rft = of->kn->priv; - - return rft->flags & RFTYPE_FLAGS_CPUS_LIST; -} - -static int rdtgroup_cpus_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - struct cpumask *mask; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - - if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - mask = &rdtgrp->plr->d->cpu_mask; - seq_printf(s, is_cpu_list(of) ? - "%*pbl\n" : "%*pb\n", - cpumask_pr_args(mask)); - } - } else { - seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", - cpumask_pr_args(&rdtgrp->cpu_mask)); - } - } else { - ret = -ENOENT; - } - rdtgroup_kn_unlock(of->kn); +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); - return ret; -} +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); /* - * This is safe against resctrl_sched_in() called from __switch_to() + * This is safe against resctrl_arch_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. */ -static void update_cpu_closid_rmid(void *info) +void resctrl_arch_sync_cpu_closid_rmid(void *info) { - struct rdtgroup *r = info; + struct resctrl_cpu_defaults *r = info; if (r) { this_cpu_write(pqr_state.default_closid, r->closid); - this_cpu_write(pqr_state.default_rmid, r->mon.rmid); + this_cpu_write(pqr_state.default_rmid, r->rmid); } /* @@ -354,1202 +58,9 @@ static void update_cpu_closid_rmid(void *info) * executing task might have its own closid selected. Just reuse * the context switch code. */ - resctrl_sched_in(current); -} - -/* - * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, - * - * Per task closids/rmids must have been set up before calling this function. - */ -static void -update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) -{ - on_each_cpu_mask(cpu_mask, update_cpu_closid_rmid, r, 1); -} - -static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask) -{ - struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; - struct list_head *head; - - /* Check whether cpus belong to parent ctrl group */ - cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); - return -EINVAL; - } - - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (!cpumask_empty(tmpmask)) { - /* Give any dropped cpus to parent rdtgroup */ - cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); - update_closid_rmid(tmpmask, prgrp); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu rmid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - if (crgrp == rdtgrp) - continue; - cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, - tmpmask); - } - update_closid_rmid(tmpmask, rdtgrp); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); - - return 0; -} - -static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) -{ - struct rdtgroup *crgrp; - - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); - /* update the child mon group masks as well*/ - list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) - cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); -} - -static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask, cpumask_var_t tmpmask1) -{ - struct rdtgroup *r, *crgrp; - struct list_head *head; - - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (!cpumask_empty(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - rdt_last_cmd_puts("Can't drop CPUs from default group\n"); - return -EINVAL; - } - - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - update_closid_rmid(tmpmask, &rdtgroup_default); - } - - /* - * If we added cpus, remove them from previous group and - * the prev group's child groups that owned them - * and update per-cpu closid/rmid. - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); - if (!cpumask_empty(tmpmask1)) - cpumask_rdtgrp_clear(r, tmpmask1); - } - update_closid_rmid(tmpmask, rdtgrp); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); - - /* - * Clear child mon group masks since there is a new parent mask - * now and update the rmid for the cpus the child lost. - */ - head = &rdtgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); - update_closid_rmid(tmpmask, rdtgrp); - cpumask_clear(&crgrp->cpu_mask); - } - - return 0; -} - -static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - cpumask_var_t tmpmask, newmask, tmpmask1; - struct rdtgroup *rdtgrp; - int ret; - - if (!buf) - return -EINVAL; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { - free_cpumask_var(tmpmask); - return -ENOMEM; - } - if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { - free_cpumask_var(tmpmask); - free_cpumask_var(newmask); - return -ENOMEM; - } - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - ret = -ENOENT; - goto unlock; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto unlock; - } - - if (is_cpu_list(of)) - ret = cpulist_parse(buf, newmask); - else - ret = cpumask_parse(buf, newmask); - - if (ret) { - rdt_last_cmd_puts("Bad CPU list/mask\n"); - goto unlock; - } - - /* check that user didn't specify any offline cpus */ - cpumask_andnot(tmpmask, newmask, cpu_online_mask); - if (!cpumask_empty(tmpmask)) { - ret = -EINVAL; - rdt_last_cmd_puts("Can only assign online CPUs\n"); - goto unlock; - } - - if (rdtgrp->type == RDTCTRL_GROUP) - ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); - else if (rdtgrp->type == RDTMON_GROUP) - ret = cpus_mon_write(rdtgrp, newmask, tmpmask); - else - ret = -EINVAL; - -unlock: - rdtgroup_kn_unlock(of->kn); - free_cpumask_var(tmpmask); - free_cpumask_var(newmask); - free_cpumask_var(tmpmask1); - - return ret ?: nbytes; -} - -/** - * rdtgroup_remove - the helper to remove resource group safely - * @rdtgrp: resource group to remove - * - * On resource group creation via a mkdir, an extra kernfs_node reference is - * taken to ensure that the rdtgroup structure remains accessible for the - * rdtgroup_kn_unlock() calls where it is removed. - * - * Drop the extra reference here, then free the rdtgroup structure. - * - * Return: void - */ -static void rdtgroup_remove(struct rdtgroup *rdtgrp) -{ - kernfs_put(rdtgrp->kn); - kfree(rdtgrp); -} - -static void _update_task_closid_rmid(void *task) -{ - /* - * If the task is still current on this CPU, update PQR_ASSOC MSR. - * Otherwise, the MSR is updated when the task is scheduled in. - */ - if (task == current) - resctrl_sched_in(task); -} - -static void update_task_closid_rmid(struct task_struct *t) -{ - if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) - smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); - else - _update_task_closid_rmid(t); -} - -static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) -{ - u32 closid, rmid = rdtgrp->mon.rmid; - - if (rdtgrp->type == RDTCTRL_GROUP) - closid = rdtgrp->closid; - else if (rdtgrp->type == RDTMON_GROUP) - closid = rdtgrp->mon.parent->closid; - else - return false; - - return resctrl_arch_match_closid(tsk, closid) && - resctrl_arch_match_rmid(tsk, closid, rmid); -} - -static int __rdtgroup_move_task(struct task_struct *tsk, - struct rdtgroup *rdtgrp) -{ - /* If the task is already in rdtgrp, no need to move the task. */ - if (task_in_rdtgroup(tsk, rdtgrp)) - return 0; - - /* - * Set the task's closid/rmid before the PQR_ASSOC MSR can be - * updated by them. - * - * For ctrl_mon groups, move both closid and rmid. - * For monitor groups, can move the tasks only from - * their parent CTRL group. - */ - if (rdtgrp->type == RDTMON_GROUP && - !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { - rdt_last_cmd_puts("Can't move task to different control group\n"); - return -EINVAL; - } - - if (rdtgrp->type == RDTMON_GROUP) - resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, - rdtgrp->mon.rmid); - else - resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, - rdtgrp->mon.rmid); - - /* - * Ensure the task's closid and rmid are written before determining if - * the task is current that will decide if it will be interrupted. - * This pairs with the full barrier between the rq->curr update and - * resctrl_sched_in() during context switch. - */ - smp_mb(); - - /* - * By now, the task's closid and rmid are set. If the task is current - * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource - * group go into effect. If the task is not current, the MSR will be - * updated when the task is scheduled in. - */ - update_task_closid_rmid(tsk); - - return 0; -} - -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && - resctrl_arch_match_closid(t, r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && - resctrl_arch_match_rmid(t, r->mon.parent->closid, - r->mon.rmid)); -} - -/** - * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group - * @r: Resource group - * - * Return: 1 if tasks have been assigned to @r, 0 otherwise - */ -int rdtgroup_tasks_assigned(struct rdtgroup *r) -{ - struct task_struct *p, *t; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - rcu_read_lock(); - for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) { - ret = 1; - break; - } - } - rcu_read_unlock(); - - return ret; -} - -static int rdtgroup_task_write_permission(struct task_struct *task, - struct kernfs_open_file *of) -{ - const struct cred *tcred = get_task_cred(task); - const struct cred *cred = current_cred(); - int ret = 0; - - /* - * Even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. - */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - rdt_last_cmd_printf("No permission to move task %d\n", task->pid); - ret = -EPERM; - } - - put_cred(tcred); - return ret; -} - -static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, - struct kernfs_open_file *of) -{ - struct task_struct *tsk; - int ret; - - rcu_read_lock(); - if (pid) { - tsk = find_task_by_vpid(pid); - if (!tsk) { - rcu_read_unlock(); - rdt_last_cmd_printf("No task %d\n", pid); - return -ESRCH; - } - } else { - tsk = current; - } - - get_task_struct(tsk); - rcu_read_unlock(); - - ret = rdtgroup_task_write_permission(tsk, of); - if (!ret) - ret = __rdtgroup_move_task(tsk, rdtgrp); - - put_task_struct(tsk); - return ret; -} - -static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - char *pid_str; - int ret = 0; - pid_t pid; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto unlock; - } - - while (buf && buf[0] != '\0' && buf[0] != '\n') { - pid_str = strim(strsep(&buf, ",")); - - if (kstrtoint(pid_str, 0, &pid)) { - rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); - ret = -EINVAL; - break; - } - - if (pid < 0) { - rdt_last_cmd_printf("Invalid pid %d\n", pid); - ret = -EINVAL; - break; - } - - ret = rdtgroup_move_task(pid, rdtgrp, of); - if (ret) { - rdt_last_cmd_printf("Error while processing task %d\n", pid); - break; - } - } - -unlock: - rdtgroup_kn_unlock(of->kn); - - return ret ?: nbytes; -} - -static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) -{ - struct task_struct *p, *t; - pid_t pid; - - rcu_read_lock(); - for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) { - pid = task_pid_vnr(t); - if (pid) - seq_printf(s, "%d\n", pid); - } - } - rcu_read_unlock(); -} - -static int rdtgroup_tasks_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - show_rdt_tasks(rdtgrp, s); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -static int rdtgroup_closid_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%u\n", rdtgrp->closid); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -static int rdtgroup_rmid_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%u\n", rdtgrp->mon.rmid); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -#ifdef CONFIG_PROC_CPU_RESCTRL - -/* - * A task can only be part of one resctrl control group and of one monitor - * group which is associated to that control group. - * - * 1) res: - * mon: - * - * resctrl is not available. - * - * 2) res:/ - * mon: - * - * Task is part of the root resctrl control group, and it is not associated - * to any monitor group. - * - * 3) res:/ - * mon:mon0 - * - * Task is part of the root resctrl control group and monitor group mon0. - * - * 4) res:group0 - * mon: - * - * Task is part of resctrl control group group0, and it is not associated - * to any monitor group. - * - * 5) res:group0 - * mon:mon1 - * - * Task is part of resctrl control group group0 and monitor group mon1. - */ -int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - struct rdtgroup *rdtg; - int ret = 0; - - mutex_lock(&rdtgroup_mutex); - - /* Return empty if resctrl has not been mounted. */ - if (!resctrl_mounted) { - seq_puts(s, "res:\nmon:\n"); - goto unlock; - } - - list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { - struct rdtgroup *crg; - - /* - * Task information is only relevant for shareable - * and exclusive groups. - */ - if (rdtg->mode != RDT_MODE_SHAREABLE && - rdtg->mode != RDT_MODE_EXCLUSIVE) - continue; - - if (!resctrl_arch_match_closid(tsk, rdtg->closid)) - continue; - - seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", - rdtg->kn->name); - seq_puts(s, "mon:"); - list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, - mon.crdtgrp_list) { - if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, - crg->mon.rmid)) - continue; - seq_printf(s, "%s", crg->kn->name); - break; - } - seq_putc(s, '\n'); - goto unlock; - } - /* - * The above search should succeed. Otherwise return - * with an error. - */ - ret = -ENOENT; -unlock: - mutex_unlock(&rdtgroup_mutex); - - return ret; -} -#endif - -static int rdt_last_cmd_status_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - int len; - - mutex_lock(&rdtgroup_mutex); - len = seq_buf_used(&last_cmd_status); - if (len) - seq_printf(seq, "%.*s", len, last_cmd_status_buf); - else - seq_puts(seq, "ok\n"); - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static int rdt_num_closids_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - - seq_printf(seq, "%u\n", s->num_closid); - return 0; -} - -static int rdt_default_ctrl_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%x\n", r->default_ctrl); - return 0; -} - -static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->cache.min_cbm_bits); - return 0; -} - -static int rdt_shareable_bits_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%x\n", r->cache.shareable_bits); - return 0; -} - -/* - * rdt_bit_usage_show - Display current usage of resources - * - * A domain is a shared resource that can now be allocated differently. Here - * we display the current regions of the domain as an annotated bitmask. - * For each domain of this resource its allocation bitmask - * is annotated as below to indicate the current usage of the corresponding bit: - * 0 - currently unused - * X - currently available for sharing and used by software and hardware - * H - currently used by hardware only but available for software use - * S - currently used and shareable by software only - * E - currently used exclusively by one resource group - * P - currently pseudo-locked by one resource group - */ -static int rdt_bit_usage_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - /* - * Use unsigned long even though only 32 bits are used to ensure - * test_bit() is used safely. - */ - unsigned long sw_shareable = 0, hw_shareable = 0; - unsigned long exclusive = 0, pseudo_locked = 0; - struct rdt_resource *r = s->res; - struct rdt_domain *dom; - int i, hwb, swb, excl, psl; - enum rdtgrp_mode mode; - bool sep = false; - u32 ctrl_val; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - hw_shareable = r->cache.shareable_bits; - list_for_each_entry(dom, &r->domains, list) { - if (sep) - seq_putc(seq, ';'); - sw_shareable = 0; - exclusive = 0; - seq_printf(seq, "%d=", dom->id); - for (i = 0; i < closids_supported(); i++) { - if (!closid_allocated(i)) - continue; - ctrl_val = resctrl_arch_get_config(r, dom, i, - s->conf_type); - mode = rdtgroup_mode_by_closid(i); - switch (mode) { - case RDT_MODE_SHAREABLE: - sw_shareable |= ctrl_val; - break; - case RDT_MODE_EXCLUSIVE: - exclusive |= ctrl_val; - break; - case RDT_MODE_PSEUDO_LOCKSETUP: - /* - * RDT_MODE_PSEUDO_LOCKSETUP is possible - * here but not included since the CBM - * associated with this CLOSID in this mode - * is not initialized and no task or cpu can be - * assigned this CLOSID. - */ - break; - case RDT_MODE_PSEUDO_LOCKED: - case RDT_NUM_MODES: - WARN(1, - "invalid mode for closid %d\n", i); - break; - } - } - for (i = r->cache.cbm_len - 1; i >= 0; i--) { - pseudo_locked = dom->plr ? dom->plr->cbm : 0; - hwb = test_bit(i, &hw_shareable); - swb = test_bit(i, &sw_shareable); - excl = test_bit(i, &exclusive); - psl = test_bit(i, &pseudo_locked); - if (hwb && swb) - seq_putc(seq, 'X'); - else if (hwb && !swb) - seq_putc(seq, 'H'); - else if (!hwb && swb) - seq_putc(seq, 'S'); - else if (excl) - seq_putc(seq, 'E'); - else if (psl) - seq_putc(seq, 'P'); - else /* Unused bits remain */ - seq_putc(seq, '0'); - } - sep = true; - } - seq_putc(seq, '\n'); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return 0; -} - -static int rdt_min_bw_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.min_bw); - return 0; -} - -static int rdt_num_rmids_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = of->kn->parent->priv; - - seq_printf(seq, "%d\n", r->num_rmid); - - return 0; -} - -static int rdt_mon_features_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = of->kn->parent->priv; - struct mon_evt *mevt; - - list_for_each_entry(mevt, &r->evt_list, list) { - seq_printf(seq, "%s\n", mevt->name); - if (mevt->configurable) - seq_printf(seq, "%s_config\n", mevt->name); - } - - return 0; -} - -static int rdt_bw_gran_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.bw_gran); - return 0; -} - -static int rdt_delay_linear_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.delay_linear); - return 0; -} - -static int max_threshold_occ_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); - - return 0; -} - -static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - if (r->membw.throttle_mode == THREAD_THROTTLE_PER_THREAD) - seq_puts(seq, "per-thread\n"); - else - seq_puts(seq, "max\n"); - - return 0; -} - -static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - unsigned int bytes; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - if (bytes > resctrl_rmid_realloc_limit) - return -EINVAL; - - resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); - - return nbytes; -} - -/* - * rdtgroup_mode_show - Display mode of this resource group - */ -static int rdtgroup_mode_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); - - rdtgroup_kn_unlock(of->kn); - return 0; -} - -static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) -{ - switch (my_type) { - case CDP_CODE: - return CDP_DATA; - case CDP_DATA: - return CDP_CODE; - default: - case CDP_NONE: - return CDP_NONE; - } -} - -static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = of->kn->parent->priv; - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); - - return 0; -} - -/** - * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other - * @r: Resource to which domain instance @d belongs. - * @d: The domain instance for which @closid is being tested. - * @cbm: Capacity bitmask being tested. - * @closid: Intended closid for @cbm. - * @type: CDP type of @r. - * @exclusive: Only check if overlaps with exclusive resource groups - * - * Checks if provided @cbm intended to be used for @closid on domain - * @d overlaps with any other closids or other hardware usage associated - * with this domain. If @exclusive is true then only overlaps with - * resource groups in exclusive mode will be considered. If @exclusive - * is false then overlaps with any resource group or hardware entities - * will be considered. - * - * @cbm is unsigned long, even if only 32 bits are used, to make the - * bitmap functions work correctly. - * - * Return: false if CBM does not overlap, true if it does. - */ -static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d, - unsigned long cbm, int closid, - enum resctrl_conf_type type, bool exclusive) -{ - enum rdtgrp_mode mode; - unsigned long ctrl_b; - int i; - - /* Check for any overlap with regions used by hardware directly */ - if (!exclusive) { - ctrl_b = r->cache.shareable_bits; - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) - return true; - } - - /* Check for overlap with other resource groups */ - for (i = 0; i < closids_supported(); i++) { - ctrl_b = resctrl_arch_get_config(r, d, i, type); - mode = rdtgroup_mode_by_closid(i); - if (closid_allocated(i) && i != closid && - mode != RDT_MODE_PSEUDO_LOCKSETUP) { - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { - if (exclusive) { - if (mode == RDT_MODE_EXCLUSIVE) - return true; - continue; - } - return true; - } - } - } - - return false; -} - -/** - * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware - * @s: Schema for the resource to which domain instance @d belongs. - * @d: The domain instance for which @closid is being tested. - * @cbm: Capacity bitmask being tested. - * @closid: Intended closid for @cbm. - * @exclusive: Only check if overlaps with exclusive resource groups - * - * Resources that can be allocated using a CBM can use the CBM to control - * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test - * for overlap. Overlap test is not limited to the specific resource for - * which the CBM is intended though - when dealing with CDP resources that - * share the underlying hardware the overlap check should be performed on - * the CDP resource sharing the hardware also. - * - * Refer to description of __rdtgroup_cbm_overlaps() for the details of the - * overlap test. - * - * Return: true if CBM overlap detected, false if there is no overlap - */ -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d, - unsigned long cbm, int closid, bool exclusive) -{ - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); - struct rdt_resource *r = s->res; - - if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, - exclusive)) - return true; - - if (!resctrl_arch_get_cdp_enabled(r->rid)) - return false; - return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); -} - -/** - * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive - * @rdtgrp: Resource group identified through its closid. - * - * An exclusive resource group implies that there should be no sharing of - * its allocated resources. At the time this group is considered to be - * exclusive this test can determine if its current schemata supports this - * setting by testing for overlap with all other resource groups. - * - * Return: true if resource group can be exclusive, false if there is overlap - * with allocations of other resource groups and thus this resource group - * cannot be exclusive. - */ -static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) -{ - int closid = rdtgrp->closid; - struct resctrl_schema *s; - struct rdt_resource *r; - bool has_cache = false; - struct rdt_domain *d; - u32 ctrl; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) - continue; - has_cache = true; - list_for_each_entry(d, &r->domains, list) { - ctrl = resctrl_arch_get_config(r, d, closid, - s->conf_type); - if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { - rdt_last_cmd_puts("Schemata overlaps\n"); - return false; - } - } - } - - if (!has_cache) { - rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); - return false; - } - - return true; -} - -/* - * rdtgroup_mode_write - Modify the resource group's mode - */ -static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - enum rdtgrp_mode mode; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - rdt_last_cmd_clear(); - - mode = rdtgrp->mode; - - if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || - (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || - (!strcmp(buf, "pseudo-locksetup") && - mode == RDT_MODE_PSEUDO_LOCKSETUP) || - (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) - goto out; - - if (mode == RDT_MODE_PSEUDO_LOCKED) { - rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); - ret = -EINVAL; - goto out; - } - - if (!strcmp(buf, "shareable")) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = rdtgroup_locksetup_exit(rdtgrp); - if (ret) - goto out; - } - rdtgrp->mode = RDT_MODE_SHAREABLE; - } else if (!strcmp(buf, "exclusive")) { - if (!rdtgroup_mode_test_exclusive(rdtgrp)) { - ret = -EINVAL; - goto out; - } - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = rdtgroup_locksetup_exit(rdtgrp); - if (ret) - goto out; - } - rdtgrp->mode = RDT_MODE_EXCLUSIVE; - } else if (!strcmp(buf, "pseudo-locksetup")) { - ret = rdtgroup_locksetup_enter(rdtgrp); - if (ret) - goto out; - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; - } else { - rdt_last_cmd_puts("Unknown or unsupported mode\n"); - ret = -EINVAL; - } - -out: - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -/** - * rdtgroup_cbm_to_size - Translate CBM to size in bytes - * @r: RDT resource to which @d belongs. - * @d: RDT domain instance. - * @cbm: bitmask for which the size should be computed. - * - * The bitmask provided associated with the RDT domain instance @d will be - * translated into how many bytes it represents. The size in bytes is - * computed by first dividing the total cache size by the CBM length to - * determine how many bytes each bit in the bitmask represents. The result - * is multiplied with the number of bits set in the bitmask. - * - * @cbm is unsigned long, even if only 32 bits are used to make the - * bitmap functions work correctly. - */ -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, - struct rdt_domain *d, unsigned long cbm) -{ - struct cpu_cacheinfo *ci; - unsigned int size = 0; - int num_b, i; - - num_b = bitmap_weight(&cbm, r->cache.cbm_len); - ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask)); - for (i = 0; i < ci->num_leaves; i++) { - if (ci->info_list[i].level == r->cache_level) { - size = ci->info_list[i].size / r->cache.cbm_len * num_b; - break; - } - } - - return size; + resctrl_arch_sched_in(current); } -/* - * rdtgroup_size_show - Display size in bytes of allocated regions - * - * The "size" file mirrors the layout of the "schemata" file, printing the - * size in bytes of each region instead of the capacity bitmask. - */ -static int rdtgroup_size_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct resctrl_schema *schema; - enum resctrl_conf_type type; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - struct rdt_domain *d; - unsigned int size; - int ret = 0; - u32 closid; - bool sep; - u32 ctrl; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - seq_printf(s, "%*s:", max_name_width, - rdtgrp->plr->s->name); - size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, - rdtgrp->plr->d, - rdtgrp->plr->cbm); - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size); - } - goto out; - } - - closid = rdtgrp->closid; - - list_for_each_entry(schema, &resctrl_schema_all, list) { - r = schema->res; - type = schema->conf_type; - sep = false; - seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(d, &r->domains, list) { - if (sep) - seq_putc(s, ';'); - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - size = 0; - } else { - if (is_mba_sc(r)) - ctrl = d->mbps_val[closid]; - else - ctrl = resctrl_arch_get_config(r, d, - closid, - type); - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) - size = ctrl; - else - size = rdtgroup_cbm_to_size(r, d, ctrl); - } - seq_printf(s, "%d=%u", d->id, size); - sep = true; - } - seq_putc(s, '\n'); - } - -out: - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -struct mon_config_info { - u32 evtid; - u32 mon_config; -}; - #define INVALID_CONFIG_INDEX UINT_MAX /** @@ -1574,693 +85,56 @@ static inline unsigned int mon_event_config_index_get(u32 evtid) } } -static void mon_event_config_read(void *info) +void resctrl_arch_mon_event_config_read(void *_config_info) { - struct mon_config_info *mon_info = info; + struct resctrl_mon_config_info *config_info = _config_info; unsigned int index; u64 msrval; - index = mon_event_config_index_get(mon_info->evtid); + index = mon_event_config_index_get(config_info->evtid); if (index == INVALID_CONFIG_INDEX) { - pr_warn_once("Invalid event id %d\n", mon_info->evtid); + pr_warn_once("Invalid event id %d\n", config_info->evtid); return; } - rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval); + rdmsrq(MSR_IA32_EVT_CFG_BASE + index, msrval); /* Report only the valid event configuration bits */ - mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; -} - -static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info) -{ - smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1); -} - -static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) -{ - struct mon_config_info mon_info = {0}; - struct rdt_domain *dom; - bool sep = false; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - list_for_each_entry(dom, &r->domains, list) { - if (sep) - seq_puts(s, ";"); - - memset(&mon_info, 0, sizeof(struct mon_config_info)); - mon_info.evtid = evtid; - mondata_config_read(dom, &mon_info); - - seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config); - sep = true; - } - seq_puts(s, "\n"); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return 0; -} - -static int mbm_total_bytes_config_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = of->kn->parent->priv; - - mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); - - return 0; -} - -static int mbm_local_bytes_config_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = of->kn->parent->priv; - - mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); - - return 0; + config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; } -static void mon_event_config_write(void *info) +void resctrl_arch_mon_event_config_write(void *_config_info) { - struct mon_config_info *mon_info = info; + struct resctrl_mon_config_info *config_info = _config_info; unsigned int index; - index = mon_event_config_index_get(mon_info->evtid); + index = mon_event_config_index_get(config_info->evtid); if (index == INVALID_CONFIG_INDEX) { - pr_warn_once("Invalid event id %d\n", mon_info->evtid); - return; - } - wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0); -} - -static void mbm_config_write_domain(struct rdt_resource *r, - struct rdt_domain *d, u32 evtid, u32 val) -{ - struct mon_config_info mon_info = {0}; - - /* - * Read the current config value first. If both are the same then - * no need to write it again. - */ - mon_info.evtid = evtid; - mondata_config_read(d, &mon_info); - if (mon_info.mon_config == val) - return; - - mon_info.mon_config = val; - - /* - * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the - * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE - * are scoped at the domain level. Writing any of these MSRs - * on one CPU is observed by all the CPUs in the domain. - */ - smp_call_function_any(&d->cpu_mask, mon_event_config_write, - &mon_info, 1); - - /* - * When an Event Configuration is changed, the bandwidth counters - * for all RMIDs and Events will be cleared by the hardware. The - * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for - * every RMID on the next read to any event for every RMID. - * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) - * cleared while it is tracked by the hardware. Clear the - * mbm_local and mbm_total counts for all the RMIDs. - */ - resctrl_arch_reset_rmid_all(r, d); -} - -static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) -{ - struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - char *dom_str = NULL, *id_str; - unsigned long dom_id, val; - struct rdt_domain *d; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - -next: - if (!tok || tok[0] == '\0') - return 0; - - /* Start processing the strings for each domain */ - dom_str = strim(strsep(&tok, ";")); - id_str = strsep(&dom_str, "="); - - if (!id_str || kstrtoul(id_str, 10, &dom_id)) { - rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); - return -EINVAL; - } - - if (!dom_str || kstrtoul(dom_str, 16, &val)) { - rdt_last_cmd_puts("Non-numeric event configuration value\n"); - return -EINVAL; - } - - /* Value from user cannot be more than the supported set of events */ - if ((val & hw_res->mbm_cfg_mask) != val) { - rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - hw_res->mbm_cfg_mask); - return -EINVAL; - } - - list_for_each_entry(d, &r->domains, list) { - if (d->id == dom_id) { - mbm_config_write_domain(r, d, evtid, val); - goto next; - } - } - - return -EINVAL; -} - -static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - struct rdt_resource *r = of->kn->parent->priv; - int ret; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - buf[nbytes - 1] = '\0'; - - ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return ret ?: nbytes; -} - -static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - struct rdt_resource *r = of->kn->parent->priv; - int ret; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - buf[nbytes - 1] = '\0'; - - ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return ret ?: nbytes; -} - -/* rdtgroup information files for one cache resource. */ -static struct rftype res_common_files[] = { - { - .name = "last_cmd_status", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_last_cmd_status_show, - .fflags = RFTYPE_TOP_INFO, - }, - { - .name = "num_closids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, - .fflags = RFTYPE_CTRL_INFO, - }, - { - .name = "mon_features", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_mon_features_show, - .fflags = RFTYPE_MON_INFO, - }, - { - .name = "num_rmids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_rmids_show, - .fflags = RFTYPE_MON_INFO, - }, - { - .name = "cbm_mask", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_default_ctrl_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "min_cbm_bits", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_min_cbm_bits_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "shareable_bits", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_shareable_bits_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "bit_usage", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_bit_usage_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "min_bandwidth", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_min_bw_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - { - .name = "bandwidth_gran", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_bw_gran_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - { - .name = "delay_linear", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_delay_linear_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - /* - * Platform specific which (if any) capabilities are provided by - * thread_throttle_mode. Defer "fflags" initialization to platform - * discovery. - */ - { - .name = "thread_throttle_mode", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_thread_throttle_mode_show, - }, - { - .name = "max_threshold_occupancy", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = max_threshold_occ_write, - .seq_show = max_threshold_occ_show, - .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "mbm_total_bytes_config", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = mbm_total_bytes_config_show, - .write = mbm_total_bytes_config_write, - }, - { - .name = "mbm_local_bytes_config", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = mbm_local_bytes_config_show, - .write = mbm_local_bytes_config_write, - }, - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .fflags = RFTYPE_BASE, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - .fflags = RFTYPE_BASE, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - .fflags = RFTYPE_BASE, - }, - { - .name = "mon_hw_id", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_rmid_show, - .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "mode", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_mode_write, - .seq_show = rdtgroup_mode_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "size", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_size_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "sparse_masks", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_has_sparse_bitmasks_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "ctrl_hw_id", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_closid_show, - .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, - }, - -}; - -static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) -{ - struct rftype *rfts, *rft; - int ret, len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - lockdep_assert_held(&rdtgroup_mutex); - - if (resctrl_debug) - fflags |= RFTYPE_DEBUG; - - for (rft = rfts; rft < rfts + len; rft++) { - if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) { - if ((fflags & rft->fflags) == rft->fflags) - kernfs_remove_by_name(kn, rft->name); - } - return ret; -} - -static struct rftype *rdtgroup_get_rftype_by_name(const char *name) -{ - struct rftype *rfts, *rft; - int len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - for (rft = rfts; rft < rfts + len; rft++) { - if (!strcmp(rft->name, name)) - return rft; - } - - return NULL; -} - -void __init thread_throttle_mode_init(void) -{ - struct rftype *rft; - - rft = rdtgroup_get_rftype_by_name("thread_throttle_mode"); - if (!rft) + pr_warn_once("Invalid event id %d\n", config_info->evtid); return; - - rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB; -} - -void __init mbm_config_rftype_init(const char *config) -{ - struct rftype *rft; - - rft = rdtgroup_get_rftype_by_name(config); - if (rft) - rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE; -} - -/** - * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file - * @r: The resource group with which the file is associated. - * @name: Name of the file - * - * The permissions of named resctrl file, directory, or link are modified - * to not allow read, write, or execute by any user. - * - * WARNING: This function is intended to communicate to the user that the - * resctrl file has been locked down - that it is not relevant to the - * particular state the system finds itself in. It should not be relied - * on to protect from user access because after the file's permissions - * are restricted the user can still change the permissions using chmod - * from the command line. - * - * Return: 0 on success, <0 on failure. - */ -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) -{ - struct iattr iattr = {.ia_valid = ATTR_MODE,}; - struct kernfs_node *kn; - int ret = 0; - - kn = kernfs_find_and_get_ns(r->kn, name, NULL); - if (!kn) - return -ENOENT; - - switch (kernfs_type(kn)) { - case KERNFS_DIR: - iattr.ia_mode = S_IFDIR; - break; - case KERNFS_FILE: - iattr.ia_mode = S_IFREG; - break; - case KERNFS_LINK: - iattr.ia_mode = S_IFLNK; - break; } - - ret = kernfs_setattr(kn, &iattr); - kernfs_put(kn); - return ret; -} - -/** - * rdtgroup_kn_mode_restore - Restore user access to named resctrl file - * @r: The resource group with which the file is associated. - * @name: Name of the file - * @mask: Mask of permissions that should be restored - * - * Restore the permissions of the named file. If @name is a directory the - * permissions of its parent will be used. - * - * Return: 0 on success, <0 on failure. - */ -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, - umode_t mask) -{ - struct iattr iattr = {.ia_valid = ATTR_MODE,}; - struct kernfs_node *kn, *parent; - struct rftype *rfts, *rft; - int ret, len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - for (rft = rfts; rft < rfts + len; rft++) { - if (!strcmp(rft->name, name)) - iattr.ia_mode = rft->mode & mask; - } - - kn = kernfs_find_and_get_ns(r->kn, name, NULL); - if (!kn) - return -ENOENT; - - switch (kernfs_type(kn)) { - case KERNFS_DIR: - parent = kernfs_get_parent(kn); - if (parent) { - iattr.ia_mode |= parent->mode; - kernfs_put(parent); - } - iattr.ia_mode |= S_IFDIR; - break; - case KERNFS_FILE: - iattr.ia_mode |= S_IFREG; - break; - case KERNFS_LINK: - iattr.ia_mode |= S_IFLNK; - break; - } - - ret = kernfs_setattr(kn, &iattr); - kernfs_put(kn); - return ret; -} - -static int rdtgroup_mkdir_info_resdir(void *priv, char *name, - unsigned long fflags) -{ - struct kernfs_node *kn_subdir; - int ret; - - kn_subdir = kernfs_create_dir(kn_info, name, - kn_info->mode, priv); - if (IS_ERR(kn_subdir)) - return PTR_ERR(kn_subdir); - - ret = rdtgroup_kn_set_ugid(kn_subdir); - if (ret) - return ret; - - ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); - - return ret; -} - -static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) -{ - struct resctrl_schema *s; - struct rdt_resource *r; - unsigned long fflags; - char name[32]; - int ret; - - /* create the directory */ - kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); - if (IS_ERR(kn_info)) - return PTR_ERR(kn_info); - - ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); - if (ret) - goto out_destroy; - - /* loop over enabled controls, these are all alloc_capable */ - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - fflags = r->fflags | RFTYPE_CTRL_INFO; - ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); - if (ret) - goto out_destroy; - } - - for_each_mon_capable_rdt_resource(r) { - fflags = r->fflags | RFTYPE_MON_INFO; - sprintf(name, "%s_MON", r->name); - ret = rdtgroup_mkdir_info_resdir(r, name, fflags); - if (ret) - goto out_destroy; - } - - ret = rdtgroup_kn_set_ugid(kn_info); - if (ret) - goto out_destroy; - - kernfs_activate(kn_info); - - return 0; - -out_destroy: - kernfs_remove(kn_info); - return ret; -} - -static int -mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, - char *name, struct kernfs_node **dest_kn) -{ - struct kernfs_node *kn; - int ret; - - /* create the directory */ - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - if (dest_kn) - *dest_kn = kn; - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - - kernfs_activate(kn); - - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; + wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config); } static void l3_qos_cfg_update(void *arg) { bool *enable = arg; - wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); + wrmsrq(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); } static void l2_qos_cfg_update(void *arg) { bool *enable = arg; - wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); -} - -static inline bool is_mba_linear(void) -{ - return rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl.membw.delay_linear; + wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); } static int set_cache_qos_cfg(int level, bool enable) { void (*update)(void *arg); + struct rdt_ctrl_domain *d; struct rdt_resource *r_l; cpumask_var_t cpu_mask; - struct rdt_domain *d; int cpu; /* Walking r->domains, ensure it can't race with cpuhp */ @@ -2277,14 +151,14 @@ static int set_cache_qos_cfg(int level, bool enable) return -ENOMEM; r_l = &rdt_resources_all[level].r_resctrl; - list_for_each_entry(d, &r_l->domains, list) { + list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) { if (r_l->cache.arch_has_per_cpu_cfg) /* Pick all the CPUs in the domain instance */ - for_each_cpu(cpu, &d->cpu_mask) + for_each_cpu(cpu, &d->hdr.cpu_mask) cpumask_set_cpu(cpu, cpu_mask); else /* Pick one CPU from each domain instance to update MSR */ - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask); } /* Update QOS_CFG MSR on all the CPUs in cpu_mask */ @@ -2310,66 +184,6 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) l3_qos_cfg_update(&hw_res->cdp_enabled); } -static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d) -{ - u32 num_closid = resctrl_arch_get_num_closid(r); - int cpu = cpumask_any(&d->cpu_mask); - int i; - - d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), - GFP_KERNEL, cpu_to_node(cpu)); - if (!d->mbps_val) - return -ENOMEM; - - for (i = 0; i < num_closid; i++) - d->mbps_val[i] = MBA_MAX_MBPS; - - return 0; -} - -static void mba_sc_domain_destroy(struct rdt_resource *r, - struct rdt_domain *d) -{ - kfree(d->mbps_val); - d->mbps_val = NULL; -} - -/* - * MBA software controller is supported only if - * MBM is supported and MBA is in linear scale. - */ -static bool supports_mba_mbps(void) -{ - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; - - return (is_mbm_local_enabled() && - r->alloc_capable && is_mba_linear()); -} - -/* - * Enable or disable the MBA software controller - * which helps user specify bandwidth in MBps. - */ -static int set_mba_sc(bool mba_sc) -{ - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl; - u32 num_closid = resctrl_arch_get_num_closid(r); - struct rdt_domain *d; - int i; - - if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) - return -EINVAL; - - r->membw.mba_sc = mba_sc; - - list_for_each_entry(d, &r->domains, list) { - for (i = 0; i < num_closid; i++) - d->mbps_val[i] = MBA_MAX_MBPS; - } - - return 0; -} - static int cdp_enable(int level) { struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; @@ -2410,1754 +224,39 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) return 0; } -/* - * We don't allow rdtgroup directories to be created anywhere - * except the root directory. Thus when looking for the rdtgroup - * structure for a kernfs node we are either looking at a directory, - * in which case the rdtgroup structure is pointed at by the "priv" - * field, otherwise we have a file, and need only look to the parent - * to find the rdtgroup. - */ -static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) -{ - if (kernfs_type(kn) == KERNFS_DIR) { - /* - * All the resource directories use "kn->priv" - * to point to the "struct rdtgroup" for the - * resource. "info" and its subdirectories don't - * have rdtgroup structures, so return NULL here. - */ - if (kn == kn_info || kn->parent == kn_info) - return NULL; - else - return kn->priv; - } else { - return kn->parent->priv; - } -} - -static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) -{ - atomic_inc(&rdtgrp->waitcount); - kernfs_break_active_protection(kn); -} - -static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) -{ - if (atomic_dec_and_test(&rdtgrp->waitcount) && - (rdtgrp->flags & RDT_DELETED)) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - kernfs_unbreak_active_protection(kn); - rdtgroup_remove(rdtgrp); - } else { - kernfs_unbreak_active_protection(kn); - } -} - -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) -{ - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); - - if (!rdtgrp) - return NULL; - - rdtgroup_kn_get(rdtgrp, kn); - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - /* Was this group deleted while we waited? */ - if (rdtgrp->flags & RDT_DELETED) - return NULL; - - return rdtgrp; -} - -void rdtgroup_kn_unlock(struct kernfs_node *kn) -{ - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); - - if (!rdtgrp) - return; - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - rdtgroup_kn_put(rdtgrp, kn); -} - -static int mkdir_mondata_all(struct kernfs_node *parent_kn, - struct rdtgroup *prgrp, - struct kernfs_node **mon_data_kn); - -static void rdt_disable_ctx(void) -{ - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); - set_mba_sc(false); - - resctrl_debug = false; -} - -static int rdt_enable_ctx(struct rdt_fs_context *ctx) -{ - int ret = 0; - - if (ctx->enable_cdpl2) { - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); - if (ret) - goto out_done; - } - - if (ctx->enable_cdpl3) { - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); - if (ret) - goto out_cdpl2; - } - - if (ctx->enable_mba_mbps) { - ret = set_mba_sc(true); - if (ret) - goto out_cdpl3; - } - - if (ctx->enable_debug) - resctrl_debug = true; - - return 0; - -out_cdpl3: - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); -out_cdpl2: - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); -out_done: - return ret; -} - -static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) -{ - struct resctrl_schema *s; - const char *suffix = ""; - int ret, cl; - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - s->res = r; - s->num_closid = resctrl_arch_get_num_closid(r); - if (resctrl_arch_get_cdp_enabled(r->rid)) - s->num_closid /= 2; - - s->conf_type = type; - switch (type) { - case CDP_CODE: - suffix = "CODE"; - break; - case CDP_DATA: - suffix = "DATA"; - break; - case CDP_NONE: - suffix = ""; - break; - } - - ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); - if (ret >= sizeof(s->name)) { - kfree(s); - return -EINVAL; - } - - cl = strlen(s->name); - - /* - * If CDP is supported by this resource, but not enabled, - * include the suffix. This ensures the tabular format of the - * schemata file does not change between mounts of the filesystem. - */ - if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) - cl += 4; - - if (cl > max_name_width) - max_name_width = cl; - - INIT_LIST_HEAD(&s->list); - list_add(&s->list, &resctrl_schema_all); - - return 0; -} - -static int schemata_list_create(void) -{ - struct rdt_resource *r; - int ret = 0; - - for_each_alloc_capable_rdt_resource(r) { - if (resctrl_arch_get_cdp_enabled(r->rid)) { - ret = schemata_list_add(r, CDP_CODE); - if (ret) - break; - - ret = schemata_list_add(r, CDP_DATA); - } else { - ret = schemata_list_add(r, CDP_NONE); - } - - if (ret) - break; - } - - return ret; -} - -static void schemata_list_destroy(void) -{ - struct resctrl_schema *s, *tmp; - - list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { - list_del(&s->list); - kfree(s); - } -} - -static int rdt_get_tree(struct fs_context *fc) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - unsigned long flags = RFTYPE_CTRL_BASE; - struct rdt_domain *dom; - struct rdt_resource *r; - int ret; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - /* - * resctrl file system can only be mounted once. - */ - if (resctrl_mounted) { - ret = -EBUSY; - goto out; - } - - ret = rdtgroup_setup_root(ctx); - if (ret) - goto out; - - ret = rdt_enable_ctx(ctx); - if (ret) - goto out_root; - - ret = schemata_list_create(); - if (ret) { - schemata_list_destroy(); - goto out_ctx; - } - - closid_init(); - - if (resctrl_arch_mon_capable()) - flags |= RFTYPE_MON; - - ret = rdtgroup_add_files(rdtgroup_default.kn, flags); - if (ret) - goto out_schemata_free; - - kernfs_activate(rdtgroup_default.kn); - - ret = rdtgroup_create_info_dir(rdtgroup_default.kn); - if (ret < 0) - goto out_schemata_free; - - if (resctrl_arch_mon_capable()) { - ret = mongroup_create_dir(rdtgroup_default.kn, - &rdtgroup_default, "mon_groups", - &kn_mongrp); - if (ret < 0) - goto out_info; - - ret = mkdir_mondata_all(rdtgroup_default.kn, - &rdtgroup_default, &kn_mondata); - if (ret < 0) - goto out_mongrp; - rdtgroup_default.mon.mon_data_kn = kn_mondata; - } - - ret = rdt_pseudo_lock_init(); - if (ret) - goto out_mondata; - - ret = kernfs_get_tree(fc); - if (ret < 0) - goto out_psl; - - if (resctrl_arch_alloc_capable()) - resctrl_arch_enable_alloc(); - if (resctrl_arch_mon_capable()) - resctrl_arch_enable_mon(); - - if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) - resctrl_mounted = true; - - if (is_mbm_enabled()) { - r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - list_for_each_entry(dom, &r->domains, list) - mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, - RESCTRL_PICK_ANY_CPU); - } - - goto out; - -out_psl: - rdt_pseudo_lock_release(); -out_mondata: - if (resctrl_arch_mon_capable()) - kernfs_remove(kn_mondata); -out_mongrp: - if (resctrl_arch_mon_capable()) - kernfs_remove(kn_mongrp); -out_info: - kernfs_remove(kn_info); -out_schemata_free: - schemata_list_destroy(); -out_ctx: - rdt_disable_ctx(); -out_root: - rdtgroup_destroy_root(); -out: - rdt_last_cmd_clear(); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return ret; -} - -enum rdt_param { - Opt_cdp, - Opt_cdpl2, - Opt_mba_mbps, - Opt_debug, - nr__rdt_params -}; - -static const struct fs_parameter_spec rdt_fs_parameters[] = { - fsparam_flag("cdp", Opt_cdp), - fsparam_flag("cdpl2", Opt_cdpl2), - fsparam_flag("mba_MBps", Opt_mba_mbps), - fsparam_flag("debug", Opt_debug), - {} -}; - -static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - struct fs_parse_result result; - int opt; - - opt = fs_parse(fc, rdt_fs_parameters, param, &result); - if (opt < 0) - return opt; - - switch (opt) { - case Opt_cdp: - ctx->enable_cdpl3 = true; - return 0; - case Opt_cdpl2: - ctx->enable_cdpl2 = true; - return 0; - case Opt_mba_mbps: - if (!supports_mba_mbps()) - return -EINVAL; - ctx->enable_mba_mbps = true; - return 0; - case Opt_debug: - ctx->enable_debug = true; - return 0; - } - - return -EINVAL; -} - -static void rdt_fs_context_free(struct fs_context *fc) +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) { - struct rdt_fs_context *ctx = rdt_fc2context(fc); - - kernfs_free_fs_context(fc); - kfree(ctx); + return rdt_resources_all[l].cdp_enabled; } -static const struct fs_context_operations rdt_fs_context_ops = { - .free = rdt_fs_context_free, - .parse_param = rdt_parse_param, - .get_tree = rdt_get_tree, -}; - -static int rdt_init_fs_context(struct fs_context *fc) -{ - struct rdt_fs_context *ctx; - - ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; - fc->fs_private = &ctx->kfc; - fc->ops = &rdt_fs_context_ops; - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(&init_user_ns); - fc->global = true; - return 0; -} - -static int reset_all_ctrls(struct rdt_resource *r) +void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) { struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); - struct rdt_hw_domain *hw_dom; + struct rdt_hw_ctrl_domain *hw_dom; struct msr_param msr_param; - cpumask_var_t cpu_mask; - struct rdt_domain *d; + struct rdt_ctrl_domain *d; int i; /* Walking r->domains, ensure it can't race with cpuhp */ lockdep_assert_cpus_held(); - if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) - return -ENOMEM; - msr_param.res = r; msr_param.low = 0; msr_param.high = hw_res->num_closid; /* * Disable resource control for this resource by setting all - * CBMs in all domains to the maximum mask value. Pick one CPU + * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU * from each domain to update the MSRs below. */ - list_for_each_entry(d, &r->domains, list) { - hw_dom = resctrl_to_arch_dom(d); - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + hw_dom = resctrl_to_arch_ctrl_dom(d); for (i = 0; i < hw_res->num_closid; i++) - hw_dom->ctrl_val[i] = r->default_ctrl; - } - - /* Update CBM on all the CPUs in cpu_mask */ - on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1); - - free_cpumask_var(cpu_mask); - - return 0; -} - -/* - * Move tasks from one to the other group. If @from is NULL, then all tasks - * in the systems are moved unconditionally (used for teardown). - * - * If @mask is not NULL the cpus on which moved tasks are running are set - * in that mask so the update smp function call is restricted to affected - * cpus. - */ -static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, - struct cpumask *mask) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - for_each_process_thread(p, t) { - if (!from || is_closid_match(t, from) || - is_rmid_match(t, from)) { - resctrl_arch_set_closid_rmid(t, to->closid, - to->mon.rmid); - - /* - * Order the closid/rmid stores above before the loads - * in task_curr(). This pairs with the full barrier - * between the rq->curr update and resctrl_sched_in() - * during context switch. - */ - smp_mb(); - - /* - * If the task is on a CPU, set the CPU in the mask. - * The detection is inaccurate as tasks might move or - * schedule before the smp function call takes place. - * In such a case the function call is pointless, but - * there is no other side effect. - */ - if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) - cpumask_set_cpu(task_cpu(t), mask); - } - } - read_unlock(&tasklist_lock); -} - -static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) -{ - struct rdtgroup *sentry, *stmp; - struct list_head *head; - - head = &rdtgrp->mon.crdtgrp_list; - list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { - free_rmid(sentry->closid, sentry->mon.rmid); - list_del(&sentry->mon.crdtgrp_list); - - if (atomic_read(&sentry->waitcount) != 0) - sentry->flags = RDT_DELETED; - else - rdtgroup_remove(sentry); - } -} - -/* - * Forcibly remove all of subdirectories under root. - */ -static void rmdir_all_sub(void) -{ - struct rdtgroup *rdtgrp, *tmp; - - /* Move all tasks to the default resource group */ - rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); - - list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { - /* Free any child rmids */ - free_all_child_rdtgrp(rdtgrp); - - /* Remove each rdtgroup other than root */ - if (rdtgrp == &rdtgroup_default) - continue; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - - /* - * Give any CPUs back to the default group. We cannot copy - * cpu_online_mask because a CPU might have executed the - * offline callback already, but is still marked online. - */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - kernfs_remove(rdtgrp->kn); - list_del(&rdtgrp->rdtgroup_list); - - if (atomic_read(&rdtgrp->waitcount) != 0) - rdtgrp->flags = RDT_DELETED; - else - rdtgroup_remove(rdtgrp); - } - /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ - update_closid_rmid(cpu_online_mask, &rdtgroup_default); - - kernfs_remove(kn_info); - kernfs_remove(kn_mongrp); - kernfs_remove(kn_mondata); -} - -static void rdt_kill_sb(struct super_block *sb) -{ - struct rdt_resource *r; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_disable_ctx(); - - /*Put everything back to default values. */ - for_each_alloc_capable_rdt_resource(r) - reset_all_ctrls(r); - rmdir_all_sub(); - rdt_pseudo_lock_release(); - rdtgroup_default.mode = RDT_MODE_SHAREABLE; - schemata_list_destroy(); - rdtgroup_destroy_root(); - if (resctrl_arch_alloc_capable()) - resctrl_arch_disable_alloc(); - if (resctrl_arch_mon_capable()) - resctrl_arch_disable_mon(); - resctrl_mounted = false; - kernfs_kill_sb(sb); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -static struct file_system_type rdt_fs_type = { - .name = "resctrl", - .init_fs_context = rdt_init_fs_context, - .parameters = rdt_fs_parameters, - .kill_sb = rdt_kill_sb, -}; - -static int mon_addfile(struct kernfs_node *parent_kn, const char *name, - void *priv) -{ - struct kernfs_node *kn; - int ret = 0; - - kn = __kernfs_create_file(parent_kn, name, 0444, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, - &kf_mondata_ops, priv, NULL, NULL); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - return ret; -} - -/* - * Remove all subdirectories of mon_data of ctrl_mon groups - * and monitor groups with given domain id. - */ -static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - unsigned int dom_id) -{ - struct rdtgroup *prgrp, *crgrp; - char name[32]; - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - sprintf(name, "mon_%s_%02d", r->name, dom_id); - kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); - - list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) - kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); - } -} - -static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp) -{ - union mon_data_bits priv; - struct kernfs_node *kn; - struct mon_evt *mevt; - struct rmid_read rr; - char name[32]; - int ret; - - sprintf(name, "mon_%s_%02d", r->name, d->id); - /* create the directory */ - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - - if (WARN_ON(list_empty(&r->evt_list))) { - ret = -EPERM; - goto out_destroy; - } - - priv.u.rid = r->rid; - priv.u.domid = d->id; - list_for_each_entry(mevt, &r->evt_list, list) { - priv.u.evtid = mevt->evtid; - ret = mon_addfile(kn, mevt->name, priv.priv); - if (ret) - goto out_destroy; - - if (is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); - } - kernfs_activate(kn); - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - -/* - * Add all subdirectories of mon_data for "ctrl_mon" groups - * and "monitor" groups with given domain id. - */ -static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_domain *d) -{ - struct kernfs_node *parent_kn; - struct rdtgroup *prgrp, *crgrp; - struct list_head *head; - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - parent_kn = prgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, prgrp); - - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - parent_kn = crgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, crgrp); - } - } -} - -static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, - struct rdt_resource *r, - struct rdtgroup *prgrp) -{ - struct rdt_domain *dom; - int ret; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - list_for_each_entry(dom, &r->domains, list) { - ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); - if (ret) - return ret; - } - - return 0; -} - -/* - * This creates a directory mon_data which contains the monitored data. - * - * mon_data has one directory for each domain which are named - * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data - * with L3 domain looks as below: - * ./mon_data: - * mon_L3_00 - * mon_L3_01 - * mon_L3_02 - * ... - * - * Each domain directory has one file per event: - * ./mon_L3_00/: - * llc_occupancy - * - */ -static int mkdir_mondata_all(struct kernfs_node *parent_kn, - struct rdtgroup *prgrp, - struct kernfs_node **dest_kn) -{ - struct rdt_resource *r; - struct kernfs_node *kn; - int ret; - - /* - * Create the mon_data directory first. - */ - ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); - if (ret) - return ret; - - if (dest_kn) - *dest_kn = kn; - - /* - * Create the subdirectories for each domain. Note that all events - * in a domain like L3 are grouped into a resource whose domain is L3 - */ - for_each_mon_capable_rdt_resource(r) { - ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); - if (ret) - goto out_destroy; + hw_dom->ctrl_val[i] = resctrl_get_default_ctrl(r); + msr_param.dom = d; + smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1); } - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - -/** - * cbm_ensure_valid - Enforce validity on provided CBM - * @_val: Candidate CBM - * @r: RDT resource to which the CBM belongs - * - * The provided CBM represents all cache portions available for use. This - * may be represented by a bitmap that does not consist of contiguous ones - * and thus be an invalid CBM. - * Here the provided CBM is forced to be a valid CBM by only considering - * the first set of contiguous bits as valid and clearing all bits. - * The intention here is to provide a valid default CBM with which a new - * resource group is initialized. The user can follow this with a - * modification to the CBM if the default does not satisfy the - * requirements. - */ -static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) -{ - unsigned int cbm_len = r->cache.cbm_len; - unsigned long first_bit, zero_bit; - unsigned long val = _val; - - if (!val) - return 0; - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - /* Clear any remaining bits to ensure contiguous region */ - bitmap_clear(&val, zero_bit, cbm_len - zero_bit); - return (u32)val; -} - -/* - * Initialize cache resources per RDT domain - * - * Set the RDT domain up to start off with all usable allocations. That is, - * all shareable and unused bits. All-zero CBM is invalid. - */ -static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s, - u32 closid) -{ - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); - enum resctrl_conf_type t = s->conf_type; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - u32 used_b = 0, unused_b = 0; - unsigned long tmp_cbm; - enum rdtgrp_mode mode; - u32 peer_ctl, ctrl_val; - int i; - - cfg = &d->staged_config[t]; - cfg->have_new_ctrl = false; - cfg->new_ctrl = r->cache.shareable_bits; - used_b = r->cache.shareable_bits; - for (i = 0; i < closids_supported(); i++) { - if (closid_allocated(i) && i != closid) { - mode = rdtgroup_mode_by_closid(i); - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - /* - * ctrl values for locksetup aren't relevant - * until the schemata is written, and the mode - * becomes RDT_MODE_PSEUDO_LOCKED. - */ - continue; - /* - * If CDP is active include peer domain's - * usage to ensure there is no overlap - * with an exclusive group. - */ - if (resctrl_arch_get_cdp_enabled(r->rid)) - peer_ctl = resctrl_arch_get_config(r, d, i, - peer_type); - else - peer_ctl = 0; - ctrl_val = resctrl_arch_get_config(r, d, i, - s->conf_type); - used_b |= ctrl_val | peer_ctl; - if (mode == RDT_MODE_SHAREABLE) - cfg->new_ctrl |= ctrl_val | peer_ctl; - } - } - if (d->plr && d->plr->cbm > 0) - used_b |= d->plr->cbm; - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - cfg->new_ctrl |= unused_b; - /* - * Force the initial CBM to be valid, user can - * modify the CBM based on system availability. - */ - cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); - /* - * Assign the u32 CBM to an unsigned long to ensure that - * bitmap_weight() does not access out-of-bound memory. - */ - tmp_cbm = cfg->new_ctrl; - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id); - return -ENOSPC; - } - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * Initialize cache resources with default values. - * - * A new RDT group is being created on an allocation capable (CAT) - * supporting system. Set this group up to start off with all usable - * allocations. - * - * If there are no more shareable bits available on any domain then - * the entire allocation will fail. - */ -static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) -{ - struct rdt_domain *d; - int ret; - - list_for_each_entry(d, &s->res->domains, list) { - ret = __init_one_rdt_domain(d, s, closid); - if (ret < 0) - return ret; - } - - return 0; -} - -/* Initialize MBA resource with default values. */ -static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) -{ - struct resctrl_staged_config *cfg; - struct rdt_domain *d; - - list_for_each_entry(d, &r->domains, list) { - if (is_mba_sc(r)) { - d->mbps_val[closid] = MBA_MAX_MBPS; - continue; - } - - cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = r->default_ctrl; - cfg->have_new_ctrl = true; - } -} - -/* Initialize the RDT group's allocations. */ -static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) -{ - struct resctrl_schema *s; - struct rdt_resource *r; - int ret = 0; - - rdt_staged_configs_clear(); - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) { - rdtgroup_init_mba(r, rdtgrp->closid); - if (is_mba_sc(r)) - continue; - } else { - ret = rdtgroup_init_cat(s, rdtgrp->closid); - if (ret < 0) - goto out; - } - - ret = resctrl_arch_update_domains(r, rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Failed to initialize allocations\n"); - goto out; - } - - } - - rdtgrp->mode = RDT_MODE_SHAREABLE; - -out: - rdt_staged_configs_clear(); - return ret; -} - -static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) -{ - int ret; - - if (!resctrl_arch_mon_capable()) - return 0; - - ret = alloc_rmid(rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - return ret; - } - rdtgrp->mon.rmid = ret; - - ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - return ret; - } - - return 0; -} - -static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) -{ - if (resctrl_arch_mon_capable()) - free_rmid(rgrp->closid, rgrp->mon.rmid); -} - -static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, - const char *name, umode_t mode, - enum rdt_group_type rtype, struct rdtgroup **r) -{ - struct rdtgroup *prdtgrp, *rdtgrp; - unsigned long files = 0; - struct kernfs_node *kn; - int ret; - - prdtgrp = rdtgroup_kn_lock_live(parent_kn); - if (!prdtgrp) { - ret = -ENODEV; - goto out_unlock; - } - - if (rtype == RDTMON_GROUP && - (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto out_unlock; - } - - /* allocate the rdtgroup. */ - rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); - if (!rdtgrp) { - ret = -ENOSPC; - rdt_last_cmd_puts("Kernel out of memory\n"); - goto out_unlock; - } - *r = rdtgrp; - rdtgrp->mon.parent = prdtgrp; - rdtgrp->type = rtype; - INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); - - /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - rdt_last_cmd_puts("kernfs create error\n"); - goto out_free_rgrp; - } - rdtgrp->kn = kn; - - /* - * kernfs_remove() will drop the reference count on "kn" which - * will free it. But we still need it to stick around for the - * rdtgroup_kn_unlock(kn) call. Take one extra reference here, - * which will be dropped by kernfs_put() in rdtgroup_remove(). - */ - kernfs_get(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - rdt_last_cmd_puts("kernfs perm error\n"); - goto out_destroy; - } - - if (rtype == RDTCTRL_GROUP) { - files = RFTYPE_BASE | RFTYPE_CTRL; - if (resctrl_arch_mon_capable()) - files |= RFTYPE_MON; - } else { - files = RFTYPE_BASE | RFTYPE_MON; - } - - ret = rdtgroup_add_files(kn, files); - if (ret) { - rdt_last_cmd_puts("kernfs fill error\n"); - goto out_destroy; - } - - /* - * The caller unlocks the parent_kn upon success. - */ - return 0; - -out_destroy: - kernfs_put(rdtgrp->kn); - kernfs_remove(rdtgrp->kn); -out_free_rgrp: - kfree(rdtgrp); -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) -{ - kernfs_remove(rgrp->kn); - rdtgroup_remove(rgrp); -} - -/* - * Create a monitor group under "mon_groups" directory of a control - * and monitor group(ctrl_mon). This is a resource group - * to monitor a subset of tasks and cpus in its parent ctrl_mon group. - */ -static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, - const char *name, umode_t mode) -{ - struct rdtgroup *rdtgrp, *prgrp; - int ret; - - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); - if (ret) - return ret; - - prgrp = rdtgrp->mon.parent; - rdtgrp->closid = prgrp->closid; - - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) { - mkdir_rdt_prepare_clean(rdtgrp); - goto out_unlock; - } - - kernfs_activate(rdtgrp->kn); - - /* - * Add the rdtgrp to the list of rdtgrps the parent - * ctrl_mon group has to track. - */ - list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); - -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -/* - * These are rdtgroups created under the root directory. Can be used - * to allocate and monitor resources. - */ -static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, - const char *name, umode_t mode) -{ - struct rdtgroup *rdtgrp; - struct kernfs_node *kn; - u32 closid; - int ret; - - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); - if (ret) - return ret; - - kn = rdtgrp->kn; - ret = closid_alloc(); - if (ret < 0) { - rdt_last_cmd_puts("Out of CLOSIDs\n"); - goto out_common_fail; - } - closid = ret; - ret = 0; - - rdtgrp->closid = closid; - - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) - goto out_closid_free; - - kernfs_activate(rdtgrp->kn); - - ret = rdtgroup_init_alloc(rdtgrp); - if (ret < 0) - goto out_rmid_free; - - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); - - if (resctrl_arch_mon_capable()) { - /* - * Create an empty mon_groups directory to hold the subset - * of tasks and cpus to monitor. - */ - ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_del_list; - } - } - - goto out_unlock; - -out_del_list: - list_del(&rdtgrp->rdtgroup_list); -out_rmid_free: - mkdir_rdt_prepare_rmid_free(rdtgrp); -out_closid_free: - closid_free(closid); -out_common_fail: - mkdir_rdt_prepare_clean(rdtgrp); -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -/* - * We allow creating mon groups only with in a directory called "mon_groups" - * which is present in every ctrl_mon group. Check if this is a valid - * "mon_groups" directory. - * - * 1. The directory should be named "mon_groups". - * 2. The mon group itself should "not" be named "mon_groups". - * This makes sure "mon_groups" directory always has a ctrl_mon group - * as parent. - */ -static bool is_mon_groups(struct kernfs_node *kn, const char *name) -{ - return (!strcmp(kn->name, "mon_groups") && - strcmp(name, "mon_groups")); -} - -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) -{ - /* Do not accept '\n' to avoid unparsable situation. */ - if (strchr(name, '\n')) - return -EINVAL; - - /* - * If the parent directory is the root directory and RDT - * allocation is supported, add a control and monitoring - * subdirectory - */ - if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) - return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); - - /* - * If RDT monitoring is supported and the parent directory is a valid - * "mon_groups" directory, add a monitoring subdirectory. - */ - if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name)) - return rdtgroup_mkdir_mon(parent_kn, name, mode); - - return -EPERM; -} - -static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) -{ - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; - int cpu; - - /* Give any tasks back to the parent group */ - rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); - - /* Update per cpu rmid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; - /* - * Update the MSR on moved CPUs and CPUs which have moved - * task running on them. - */ - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid_rmid(tmpmask, NULL); - - rdtgrp->flags = RDT_DELETED; - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - /* - * Remove the rdtgrp from the parent ctrl_mon group's list - */ - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); - list_del(&rdtgrp->mon.crdtgrp_list); - - kernfs_remove(rdtgrp->kn); - - return 0; -} - -static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) -{ - rdtgrp->flags = RDT_DELETED; - list_del(&rdtgrp->rdtgroup_list); - - kernfs_remove(rdtgrp->kn); - return 0; -} - -static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) -{ - int cpu; - - /* Give any tasks back to the default group */ - rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); - - /* Give any CPUs back to the default group */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - - /* Update per cpu closid and rmid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) { - per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; - per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; - } - - /* - * Update the MSR on moved CPUs and CPUs which have moved - * task running on them. - */ - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid_rmid(tmpmask, NULL); - - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - closid_free(rdtgrp->closid); - - rdtgroup_ctrl_remove(rdtgrp); - - /* - * Free all the child monitor group rmids. - */ - free_all_child_rdtgrp(rdtgrp); - - return 0; -} - -static int rdtgroup_rmdir(struct kernfs_node *kn) -{ - struct kernfs_node *parent_kn = kn->parent; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - int ret = 0; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; - } - - /* - * If the rdtgroup is a ctrl_mon group and parent directory - * is the root directory, remove the ctrl_mon group. - * - * If the rdtgroup is a mon group and parent directory - * is a valid "mon_groups" directory, remove the mon group. - */ - if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && - rdtgrp != &rdtgroup_default) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = rdtgroup_ctrl_remove(rdtgrp); - } else { - ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); - } - } else if (rdtgrp->type == RDTMON_GROUP && - is_mon_groups(parent_kn, kn->name)) { - ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); - } else { - ret = -EPERM; - } - -out: - rdtgroup_kn_unlock(kn); - free_cpumask_var(tmpmask); - return ret; -} - -/** - * mongrp_reparent() - replace parent CTRL_MON group of a MON group - * @rdtgrp: the MON group whose parent should be replaced - * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp - * @cpus: cpumask provided by the caller for use during this call - * - * Replaces the parent CTRL_MON group for a MON group, resulting in all member - * tasks' CLOSID immediately changing to that of the new parent group. - * Monitoring data for the group is unaffected by this operation. - */ -static void mongrp_reparent(struct rdtgroup *rdtgrp, - struct rdtgroup *new_prdtgrp, - cpumask_var_t cpus) -{ - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; - - WARN_ON(rdtgrp->type != RDTMON_GROUP); - WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); - - /* Nothing to do when simply renaming a MON group. */ - if (prdtgrp == new_prdtgrp) - return; - - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); - list_move_tail(&rdtgrp->mon.crdtgrp_list, - &new_prdtgrp->mon.crdtgrp_list); - - rdtgrp->mon.parent = new_prdtgrp; - rdtgrp->closid = new_prdtgrp->closid; - - /* Propagate updated closid to all tasks in this group. */ - rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); - - update_closid_rmid(cpus, NULL); -} - -static int rdtgroup_rename(struct kernfs_node *kn, - struct kernfs_node *new_parent, const char *new_name) -{ - struct rdtgroup *new_prdtgrp; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - int ret; - - rdtgrp = kernfs_to_rdtgroup(kn); - new_prdtgrp = kernfs_to_rdtgroup(new_parent); - if (!rdtgrp || !new_prdtgrp) - return -ENOENT; - - /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ - rdtgroup_kn_get(rdtgrp, kn); - rdtgroup_kn_get(new_prdtgrp, new_parent); - - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - /* - * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if - * either kernfs_node is a file. - */ - if (kernfs_type(kn) != KERNFS_DIR || - kernfs_type(new_parent) != KERNFS_DIR) { - rdt_last_cmd_puts("Source and destination must be directories"); - ret = -EPERM; - goto out; - } - - if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { - ret = -ENOENT; - goto out; - } - - if (rdtgrp->type != RDTMON_GROUP || !kn->parent || - !is_mon_groups(kn->parent, kn->name)) { - rdt_last_cmd_puts("Source must be a MON group\n"); - ret = -EPERM; - goto out; - } - - if (!is_mon_groups(new_parent, new_name)) { - rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); - ret = -EPERM; - goto out; - } - - /* - * If the MON group is monitoring CPUs, the CPUs must be assigned to the - * current parent CTRL_MON group and therefore cannot be assigned to - * the new parent, making the move illegal. - */ - if (!cpumask_empty(&rdtgrp->cpu_mask) && - rdtgrp->mon.parent != new_prdtgrp) { - rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); - ret = -EPERM; - goto out; - } - - /* - * Allocate the cpumask for use in mongrp_reparent() to avoid the - * possibility of failing to allocate it after kernfs_rename() has - * succeeded. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { - ret = -ENOMEM; - goto out; - } - - /* - * Perform all input validation and allocations needed to ensure - * mongrp_reparent() will succeed before calling kernfs_rename(), - * otherwise it would be necessary to revert this call if - * mongrp_reparent() failed. - */ - ret = kernfs_rename(kn, new_parent, new_name); - if (!ret) - mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); - - free_cpumask_var(tmpmask); - -out: - mutex_unlock(&rdtgroup_mutex); - rdtgroup_kn_put(rdtgrp, kn); - rdtgroup_kn_put(new_prdtgrp, new_parent); - return ret; -} - -static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) -{ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) - seq_puts(seq, ",cdp"); - - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) - seq_puts(seq, ",cdpl2"); - - if (is_mba_sc(&rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl)) - seq_puts(seq, ",mba_MBps"); - - if (resctrl_debug) - seq_puts(seq, ",debug"); - - return 0; -} - -static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { - .mkdir = rdtgroup_mkdir, - .rmdir = rdtgroup_rmdir, - .rename = rdtgroup_rename, - .show_options = rdtgroup_show_options, -}; - -static int rdtgroup_setup_root(struct rdt_fs_context *ctx) -{ - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, - KERNFS_ROOT_CREATE_DEACTIVATED | - KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, - &rdtgroup_default); - if (IS_ERR(rdt_root)) - return PTR_ERR(rdt_root); - - ctx->kfc.root = rdt_root; - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); - - return 0; -} - -static void rdtgroup_destroy_root(void) -{ - kernfs_destroy_root(rdt_root); - rdtgroup_default.kn = NULL; -} - -static void __init rdtgroup_setup_default(void) -{ - mutex_lock(&rdtgroup_mutex); - - rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; - rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; - rdtgroup_default.type = RDTCTRL_GROUP; - INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); - - list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - - mutex_unlock(&rdtgroup_mutex); -} - -static void domain_destroy_mon_state(struct rdt_domain *d) -{ - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); -} - -void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d) -{ - mutex_lock(&rdtgroup_mutex); - - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) - mba_sc_domain_destroy(r, d); - - if (!r->mon_capable) - goto out_unlock; - - /* - * If resctrl is mounted, remove all the - * per domain monitor data directories. - */ - if (resctrl_mounted && resctrl_arch_mon_capable()) - rmdir_mondata_subdir_allrdtgrp(r, d->id); - - if (is_mbm_enabled()) - cancel_delayed_work(&d->mbm_over); - if (is_llc_occupancy_enabled() && has_busy_rmid(d)) { - /* - * When a package is going down, forcefully - * decrement rmid->ebusy. There is no way to know - * that the L3 was flushed and hence may lead to - * incorrect counts in rare scenarios, but leaving - * the RMID as busy creates RMID leaks if the - * package never comes back. - */ - __check_limbo(d, true); - cancel_delayed_work(&d->cqm_limbo); - } - - domain_destroy_mon_state(d); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); -} - -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; - - if (is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); - if (!d->rmid_busy_llc) - return -ENOMEM; - } - if (is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } - } - if (is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } - } - - return 0; -} - -int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d) -{ - int err = 0; - - mutex_lock(&rdtgroup_mutex); - - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { - /* RDT_RESOURCE_MBA is never mon_capable */ - err = mba_sc_domain_allocate(r, d); - goto out_unlock; - } - - if (!r->mon_capable) - goto out_unlock; - - err = domain_setup_mon_state(r, d); - if (err) - goto out_unlock; - - if (is_mbm_enabled()) { - INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, - RESCTRL_PICK_ANY_CPU); - } - - if (is_llc_occupancy_enabled()) - INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - - /* - * If the filesystem is not mounted then only the default resource group - * exists. Creation of its directories is deferred until mount time - * by rdt_get_tree() calling mkdir_mondata_all(). - * If resctrl is mounted, add per domain monitor data directories. - */ - if (resctrl_mounted && resctrl_arch_mon_capable()) - mkdir_mondata_subdir_allrdtgrp(r, d); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -void resctrl_online_cpu(unsigned int cpu) -{ - mutex_lock(&rdtgroup_mutex); - /* The CPU is set in default rdtgroup after online. */ - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - mutex_unlock(&rdtgroup_mutex); -} - -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) -{ - struct rdtgroup *cr; - - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) - break; - } -} - -void resctrl_offline_cpu(unsigned int cpu) -{ - struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - struct rdtgroup *rdtgrp; - struct rdt_domain *d; - - mutex_lock(&rdtgroup_mutex); - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { - clear_childcpus(rdtgrp, cpu); - break; - } - } - - if (!l3->mon_capable) - goto out_unlock; - - d = get_domain_from_cpu(cpu, l3); - if (d) { - if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { - cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0, cpu); - } - if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && - has_busy_rmid(d)) { - cancel_delayed_work(&d->cqm_limbo); - cqm_setup_limbo_handler(d, 0, cpu); - } - } - -out_unlock: - mutex_unlock(&rdtgroup_mutex); -} - -/* - * rdtgroup_init - rdtgroup initialization - * - * Setup resctrl file system including set up root, create mount point, - * register rdtgroup filesystem, and initialize files under root directory. - * - * Return: 0 on success or -errno - */ -int __init rdtgroup_init(void) -{ - int ret = 0; - - seq_buf_init(&last_cmd_status, last_cmd_status_buf, - sizeof(last_cmd_status_buf)); - - rdtgroup_setup_default(); - - ret = sysfs_create_mount_point(fs_kobj, "resctrl"); - if (ret) - return ret; - - ret = register_filesystem(&rdt_fs_type); - if (ret) - goto cleanup_mountpoint; - - /* - * Adding the resctrl debugfs directory here may not be ideal since - * it would let the resctrl debugfs directory appear on the debugfs - * filesystem before the resctrl filesystem is mounted. - * It may also be ok since that would enable debugging of RDT before - * resctrl is mounted. - * The reason why the debugfs directory is created here and not in - * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and - * during the debugfs directory creation also &sb->s_type->i_mutex_key - * (the lockdep class of inode->i_rwsem). Other filesystem - * interactions (eg. SyS_getdents) have the lock ordering: - * &sb->s_type->i_mutex_key --> &mm->mmap_lock - * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex - * is taken, thus creating dependency: - * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause - * issues considering the other two lock dependencies. - * By creating the debugfs directory here we avoid a dependency - * that may cause deadlock (even though file operations cannot - * occur until the filesystem is mounted, but I do not know how to - * tell lockdep that). - */ - debugfs_resctrl = debugfs_create_dir("resctrl", NULL); - - return 0; - -cleanup_mountpoint: - sysfs_remove_mount_point(fs_kobj, "resctrl"); - - return ret; -} - -void __exit rdtgroup_exit(void) -{ - debugfs_remove_recursive(debugfs_resctrl); - unregister_filesystem(&rdt_fs_type); - sysfs_remove_mount_point(fs_kobj, "resctrl"); + return; } diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 0dad49a09b7a..dbf6d71bdf18 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -24,31 +24,37 @@ struct cpuid_bit { * levels are different and there is a separate entry for each. */ static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, - { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, - { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, - { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, - { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, - { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, - { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, - { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, - { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, - { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, - { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, - { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, - { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, - { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, - { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, - { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, - { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, - { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, - { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, - { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, - { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, - { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 }, + { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, + { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 }, + { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 }, + { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 }, + { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 }, + { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 }, + { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, + { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, + { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, + { X86_FEATURE_AMD_HTR_CORES, CPUID_EAX, 30, 0x80000026, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index 262f5fb18d74..7f8d1e11dbee 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file, if (flags & MAP_FIXED) return addr; - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); } #ifdef CONFIG_COMPAT @@ -150,13 +150,15 @@ int __init sgx_drv_init(void) u64 xfrm_mask; int ret; - if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) + if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) { + pr_info("SGX disabled: SGX launch control CPU feature is not available, /dev/sgx_enclave disabled.\n"); return -ENODEV; + } cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx); if (!(eax & 1)) { - pr_err("SGX disabled: SGX1 instruction support not available.\n"); + pr_info("SGX disabled: SGX1 instruction support not available, /dev/sgx_enclave disabled.\n"); return -ENODEV; } @@ -173,8 +175,10 @@ int __init sgx_drv_init(void) } ret = misc_register(&sgx_dev_enclave); - if (ret) + if (ret) { + pr_info("SGX disabled: Unable to register the /dev/sgx_enclave driver (%d).\n", ret); return ret; + } return 0; } diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h index 4eddb4d571ef..30f39f92c98f 100644 --- a/arch/x86/kernel/cpu/sgx/driver.h +++ b/arch/x86/kernel/cpu/sgx/driver.h @@ -2,7 +2,6 @@ #ifndef __ARCH_SGX_DRIVER_H__ #define __ARCH_SGX_DRIVER_H__ -#include <crypto/hash.h> #include <linux/kref.h> #include <linux/mmu_notifier.h> #include <linux/radix-tree.h> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index b65ab214bdf5..66f1efa16fbb 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -3,6 +3,7 @@ #include <asm/mman.h> #include <asm/sgx.h> +#include <crypto/sha2.h> #include <linux/mman.h> #include <linux/delay.h> #include <linux/file.h> @@ -64,6 +65,13 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs) struct file *backing; long ret; + /* + * ECREATE would detect this too, but checking here also ensures + * that the 'encl_size' calculations below can never overflow. + */ + if (!is_power_of_2(secs->size)) + return -EINVAL; + va_page = sgx_encl_grow(encl, true); if (IS_ERR(va_page)) return PTR_ERR(va_page); @@ -456,31 +464,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) return ret; } -static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus, - void *hash) -{ - SHASH_DESC_ON_STACK(shash, tfm); - - shash->tfm = tfm; - - return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash); -} - -static int sgx_get_key_hash(const void *modulus, void *hash) -{ - struct crypto_shash *tfm; - int ret; - - tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - ret = __sgx_get_key_hash(tfm, modulus, hash); - - crypto_free_shash(tfm); - return ret; -} - static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { @@ -516,9 +499,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, sgx_xfrm_reserved_mask) return -EINVAL; - ret = sgx_get_key_hash(sigstruct->modulus, mrsigner); - if (ret) - return ret; + sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner); mutex_lock(&encl->lock); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 166692f2d501..2de01b379aa3 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -13,6 +13,8 @@ #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/sysfs.h> +#include <linux/vmalloc.h> +#include <asm/msr.h> #include <asm/sgx.h> #include "driver.h" #include "encl.h" @@ -474,24 +476,25 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void) { struct sgx_epc_page *page; int nid_of_current = numa_node_id(); - int nid = nid_of_current; + int nid_start, nid; - if (node_isset(nid_of_current, sgx_numa_mask)) { - page = __sgx_alloc_epc_page_from_node(nid_of_current); - if (page) - return page; - } - - /* Fall back to the non-local NUMA nodes: */ - while (true) { - nid = next_node_in(nid, sgx_numa_mask); - if (nid == nid_of_current) - break; + /* + * Try local node first. If it doesn't have an EPC section, + * fall back to the non-local NUMA nodes. + */ + if (node_isset(nid_of_current, sgx_numa_mask)) + nid_start = nid_of_current; + else + nid_start = next_node_in(nid_of_current, sgx_numa_mask); + nid = nid_start; + do { page = __sgx_alloc_epc_page_from_node(nid); if (page) return page; - } + + nid = next_node_in(nid, sgx_numa_mask); + } while (nid != nid_start); return ERR_PTR(-ENOMEM); } @@ -628,7 +631,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size, if (!section->virt_addr) return false; - section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page)); + section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page)); if (!section->pages) { memunmap(section->virt_addr); return false; @@ -717,6 +720,8 @@ int arch_memory_failure(unsigned long pfn, int flags) goto out; } + sgx_unmark_page_reclaimable(page); + /* * TBD: Add additional plumbing to enable pre-emptive * action for asynchronous poison notification. Until @@ -731,7 +736,7 @@ out: return 0; } -/** +/* * A section metric is concatenated in a way that @low bits 12-31 define the * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the * metric. @@ -846,6 +851,13 @@ static bool __init sgx_page_cache_init(void) return false; } + for_each_online_node(nid) { + if (!node_isset(nid, sgx_numa_mask) && + node_state(nid, N_MEMORY) && node_state(nid, N_CPU)) + pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n", + nid); + } + return true; } @@ -862,7 +874,7 @@ void sgx_update_lepubkeyhash(u64 *lepubkeyhash) WARN_ON_ONCE(preemptible()); for (i = 0; i < 4; i++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); + wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); } const struct file_operations sgx_provision_fops = { @@ -892,19 +904,15 @@ static struct miscdevice sgx_dev_provision = { int sgx_set_attribute(unsigned long *allowed_attributes, unsigned int attribute_fd) { - struct fd f = fdget(attribute_fd); + CLASS(fd, f)(attribute_fd); - if (!f.file) + if (fd_empty(f)) return -EINVAL; - if (f.file->f_op != &sgx_provision_fops) { - fdput(f); + if (fd_file(f)->f_op != &sgx_provision_fops) return -EINVAL; - } *allowed_attributes |= SGX_ATTR_PROVISIONKEY; - - fdput(f); return 0; } EXPORT_SYMBOL_GPL(sgx_set_attribute); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index aaca8d235dc2..e35ccdc84910 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -30,6 +30,7 @@ #include <asm/hypervisor.h> #include <asm/io_apic.h> #include <asm/mpspec.h> +#include <asm/msr.h> #include <asm/smp.h> #include "cpu.h" @@ -123,12 +124,14 @@ static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id) early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id; early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id; #endif - set_cpu_possible(cpu, true); set_cpu_present(cpu, true); } static __init bool check_for_real_bsp(u32 apic_id) { + bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6; + u64 msr; + /* * There is no real good way to detect whether this a kdump() * kernel, but except on the Voyager SMP monstrosity which is not @@ -145,17 +148,61 @@ static __init bool check_for_real_bsp(u32 apic_id) if (topo_info.real_bsp_apic_id != BAD_APICID) return false; + /* + * Check whether the enumeration order is broken by evaluating the + * BSP bit in the APICBASE MSR. If the CPU does not have the + * APICBASE MSR then the BSP detection is not possible and the + * kernel must rely on the firmware enumeration order. + */ + if (has_apic_base) { + rdmsrq(MSR_IA32_APICBASE, msr); + is_bsp = !!(msr & MSR_IA32_APICBASE_BSP); + } + if (apic_id == topo_info.boot_cpu_apic_id) { - topo_info.real_bsp_apic_id = apic_id; - return false; + /* + * If the boot CPU has the APIC BSP bit set then the + * firmware enumeration is agreeing. If the CPU does not + * have the APICBASE MSR then the only choice is to trust + * the enumeration order. + */ + if (is_bsp || !has_apic_base) { + topo_info.real_bsp_apic_id = apic_id; + return false; + } + /* + * If the boot APIC is enumerated first, but the APICBASE + * MSR does not have the BSP bit set, then there is no way + * to discover the real BSP here. Assume a crash kernel and + * limit the number of CPUs to 1 as an INIT to the real BSP + * would reset the machine. + */ + pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id); + pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n"); + set_nr_cpu_ids(1); + goto fwbug; } - pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x > %x\n", + pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n", topo_info.boot_cpu_apic_id, apic_id); + + if (is_bsp) { + /* + * The boot CPU has the APIC BSP bit set. Use it and complain + * about the broken firmware enumeration. + */ + topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id; + goto fwbug; + } + pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n"); topo_info.real_bsp_apic_id = apic_id; return true; + +fwbug: + pr_warn(FW_BUG "APIC enumeration order not specification compliant\n"); + return false; } static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level, @@ -210,7 +257,11 @@ static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present) topo_info.nr_disabled_cpus++; } - /* Register present and possible CPUs in the domain maps */ + /* + * Register present and possible CPUs in the domain + * maps. cpu_possible_map will be updated in + * topology_init_possible_cpus() after enumeration is done. + */ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map); } @@ -378,8 +429,8 @@ void __init topology_apply_cmdline_limits_early(void) { unsigned int possible = nr_cpu_ids; - /* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' 'noapic' */ - if (!setup_max_cpus || ioapic_is_disabled || apic_is_disabled) + /* 'maxcpus=0' 'nosmp' 'nolapic' */ + if (!setup_max_cpus || apic_is_disabled) possible = 1; /* 'possible_cpus=N' */ @@ -393,7 +444,7 @@ void __init topology_apply_cmdline_limits_early(void) static __init bool restrict_to_up(void) { - if (!smp_found_config || ioapic_is_disabled) + if (!smp_found_config) return true; /* * XEN PV is special as it does not advertise the local APIC diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 1a8b3ad493af..843b1655ab45 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -3,6 +3,7 @@ #include <asm/apic.h> #include <asm/memtype.h> +#include <asm/msr.h> #include <asm/processor.h> #include "cpu.h" @@ -29,11 +30,21 @@ static bool parse_8000_0008(struct topo_scan *tscan) if (!sft) sft = get_count_order(ecx.cpu_nthreads + 1); - topology_set_dom(tscan, TOPO_SMT_DOMAIN, sft, ecx.cpu_nthreads + 1); + /* + * cpu_nthreads describes the number of threads in the package + * sft is the number of APIC ID bits per package + * + * As the number of actual threads per core is not described in + * this leaf, just set the CORE domain shift and let the later + * parsers set SMT shift. Assume one thread per core by default + * which is correct if there are no other CPUID leafs to parse. + */ + topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1); + topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1); return true; } -static void store_node(struct topo_scan *tscan, unsigned int nr_nodes, u16 node_id) +static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id) { /* * Starting with Fam 17h the DIE domain could probably be used to @@ -48,7 +59,7 @@ static void store_node(struct topo_scan *tscan, unsigned int nr_nodes, u16 node_ tscan->amd_node_id = node_id; } -static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) +static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext) { struct { // eax @@ -73,12 +84,14 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) tscan->c->topo.initial_apicid = leaf.ext_apic_id; /* - * If leaf 0xb is available, then SMT shift is set already. If not - * take it from ecx.threads_per_core and use topo_update_dom() - - * topology_set_dom() would propagate and overwrite the already - * propagated CORE level. + * If leaf 0xb is available, then the domain shifts are set + * already and nothing to do here. Only valid for family >= 0x17. */ - if (!has_0xb) { + if (!has_topoext && tscan->c->x86 >= 0x17) { + /* + * Leaf 0x80000008 set the CORE domain shift already. + * Update the SMT domain, but do not propagate it. + */ unsigned int nthreads = leaf.core_nthreads + 1; topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads); @@ -107,64 +120,89 @@ static bool parse_8000_001e(struct topo_scan *tscan, bool has_0xb) return true; } -static bool parse_fam10h_node_id(struct topo_scan *tscan) +static void parse_fam10h_node_id(struct topo_scan *tscan) { - struct { - union { + union { + struct { u64 node_id : 3, nodes_per_pkg : 3, unused : 58; - u64 msr; }; + u64 msr; } nid; if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) - return false; + return; - rdmsrl(MSR_FAM10H_NODE_ID, nid.msr); + rdmsrq(MSR_FAM10H_NODE_ID, nid.msr); store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id); tscan->c->topo.llc_id = nid.node_id; - return true; } static void legacy_set_llc(struct topo_scan *tscan) { unsigned int apicid = tscan->c->topo.initial_apicid; - /* parse_8000_0008() set everything up except llc_id */ - tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN]; + /* If none of the parsers set LLC ID then use the die ID for it. */ + if (tscan->c->topo.llc_id == BAD_APICID) + tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN]; +} + +static void topoext_fixup(struct topo_scan *tscan) +{ + struct cpuinfo_x86 *c = tscan->c; + u64 msrval; + + /* Try to re-enable TopologyExtensions if switched off by BIOS */ + if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD || + c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f) + return; + + if (msr_set_bit(0xc0011005, 54) <= 0) + return; + + rdmsrq(0xc0011005, msrval); + if (msrval & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } } static void parse_topology_amd(struct topo_scan *tscan) { - bool has_0xb = false; + bool has_topoext = false; /* * If the extended topology leaf 0x8000_001e is available - * try to get SMT and CORE shift from leaf 0xb first, then - * try to get the CORE shift from leaf 0x8000_0008. + * try to get SMT, CORE, TILE, and DIE shifts from extended + * CPUID leaf 0x8000_0026 on supported processors first. If + * extended CPUID leaf 0x8000_0026 is not supported, try to + * get SMT and CORE shift from leaf 0xb first, then try to + * get the CORE shift from leaf 0x8000_0008. */ if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) - has_0xb = cpu_parse_topology_ext(tscan); + has_topoext = cpu_parse_topology_ext(tscan); + + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) + tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); - if (!has_0xb && !parse_8000_0008(tscan)) + if (!has_topoext && !parse_8000_0008(tscan)) return; /* Prefer leaf 0x8000001e if available */ - if (parse_8000_001e(tscan, has_0xb)) + if (parse_8000_001e(tscan, has_topoext)) return; /* Try the NODEID MSR */ - if (parse_fam10h_node_id(tscan)) - return; - - legacy_set_llc(tscan); + parse_fam10h_node_id(tscan); } void cpu_parse_topology_amd(struct topo_scan *tscan) { tscan->amd_nodes_per_pkg = 1; + topoext_fixup(tscan); parse_topology_amd(tscan); + legacy_set_llc(tscan); if (tscan->amd_nodes_per_pkg > 1) set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM); diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c index 9a6069e7133c..b5a5e1411469 100644 --- a/arch/x86/kernel/cpu/topology_common.c +++ b/arch/x86/kernel/cpu/topology_common.c @@ -3,6 +3,7 @@ #include <xen/xen.h> +#include <asm/intel-family.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/smp.h> @@ -27,6 +28,36 @@ void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, } } +enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c) +{ + if (c->x86_vendor == X86_VENDOR_INTEL) { + switch (c->topo.intel_type) { + case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY; + case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE; + } + } + if (c->x86_vendor == X86_VENDOR_AMD) { + switch (c->topo.amd_type) { + case 0: return TOPO_CPU_TYPE_PERFORMANCE; + case 1: return TOPO_CPU_TYPE_EFFICIENCY; + } + } + + return TOPO_CPU_TYPE_UNKNOWN; +} + +const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c) +{ + switch (get_topology_cpu_type(c)) { + case TOPO_CPU_TYPE_PERFORMANCE: + return "performance"; + case TOPO_CPU_TYPE_EFFICIENCY: + return "efficiency"; + default: + return "unknown"; + } +} + static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c) { struct { @@ -87,6 +118,7 @@ static void parse_topology(struct topo_scan *tscan, bool early) .cu_id = 0xff, .llc_id = BAD_APICID, .l2c_id = BAD_APICID, + .cpu_type = TOPO_CPU_TYPE_UNKNOWN, }; struct cpuinfo_x86 *c = tscan->c; struct { @@ -132,6 +164,8 @@ static void parse_topology(struct topo_scan *tscan, bool early) case X86_VENDOR_INTEL: if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan)) parse_legacy(tscan); + if (c->cpuid_level >= 0x1a) + c->topo.cpu_type = cpuid_eax(0x1a); break; case X86_VENDOR_HYGON: if (IS_ENABLED(CONFIG_CPU_SUP_HYGON)) @@ -151,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early) if (!early) { c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); + c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); } /* Package relative core ID */ diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c index e477228cd5b2..467b0326bf1a 100644 --- a/arch/x86/kernel/cpu/topology_ext.c +++ b/arch/x86/kernel/cpu/topology_ext.c @@ -13,7 +13,10 @@ enum topo_types { CORE_TYPE = 2, MAX_TYPE_0B = 3, MODULE_TYPE = 3, + AMD_CCD_TYPE = 3, TILE_TYPE = 4, + AMD_SOCKET_TYPE = 4, + MAX_TYPE_80000026 = 5, DIE_TYPE = 5, DIEGRP_TYPE = 6, MAX_TYPE_1F = 7, @@ -32,6 +35,13 @@ static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = { [DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN, }; +static const unsigned int topo_domain_map_80000026[MAX_TYPE_80000026] = { + [SMT_TYPE] = TOPO_SMT_DOMAIN, + [CORE_TYPE] = TOPO_CORE_DOMAIN, + [AMD_CCD_TYPE] = TOPO_TILE_DOMAIN, + [AMD_SOCKET_TYPE] = TOPO_DIE_DOMAIN, +}; + static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf, unsigned int *last_dom) { @@ -56,6 +66,7 @@ static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf, switch (leaf) { case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break; case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break; + case 0x80000026: maxtype = MAX_TYPE_80000026; map = topo_domain_map_80000026; break; default: return false; } @@ -125,6 +136,10 @@ bool cpu_parse_topology_ext(struct topo_scan *tscan) if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f)) return true; + /* AMD: Try leaf 0x80000026 first. */ + if (tscan->c->extended_cpuid_level >= 0x80000026 && parse_topology_leaf(tscan, 0x80000026)) + return true; + /* Intel/AMD: Fall back to leaf 0xB if available */ return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b); } diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index b31ee4f1657a..49782724a943 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -12,6 +12,7 @@ #include <asm/cmdline.h> #include <asm/cpu.h> +#include <asm/msr.h> #include "cpu.h" @@ -24,7 +25,7 @@ static void tsx_disable(void) { u64 tsx; - rdmsrl(MSR_IA32_TSX_CTRL, tsx); + rdmsrq(MSR_IA32_TSX_CTRL, tsx); /* Force all transactions to immediately abort */ tsx |= TSX_CTRL_RTM_DISABLE; @@ -37,14 +38,14 @@ static void tsx_disable(void) */ tsx |= TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, tsx); + wrmsrq(MSR_IA32_TSX_CTRL, tsx); } static void tsx_enable(void) { u64 tsx; - rdmsrl(MSR_IA32_TSX_CTRL, tsx); + rdmsrq(MSR_IA32_TSX_CTRL, tsx); /* Enable the RTM feature in the cpu */ tsx &= ~TSX_CTRL_RTM_DISABLE; @@ -56,7 +57,7 @@ static void tsx_enable(void) */ tsx &= ~TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, tsx); + wrmsrq(MSR_IA32_TSX_CTRL, tsx); } static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) @@ -115,13 +116,13 @@ static void tsx_clear_cpuid(void) */ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) && boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { - rdmsrl(MSR_TSX_FORCE_ABORT, msr); + rdmsrq(MSR_TSX_FORCE_ABORT, msr); msr |= MSR_TFA_TSX_CPUID_CLEAR; - wrmsrl(MSR_TSX_FORCE_ABORT, msr); + wrmsrq(MSR_TSX_FORCE_ABORT, msr); } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) { - rdmsrl(MSR_IA32_TSX_CTRL, msr); + rdmsrq(MSR_IA32_TSX_CTRL, msr); msr |= TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, msr); + wrmsrq(MSR_IA32_TSX_CTRL, msr); } } @@ -146,11 +147,11 @@ static void tsx_dev_mode_disable(void) !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL)) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); if (mcu_opt_ctrl & RTM_ALLOW) { mcu_opt_ctrl &= ~RTM_ALLOW; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT); } } diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 2293efd6ffa6..933fcd7ff250 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -33,7 +33,7 @@ static DEFINE_MUTEX(umwait_lock); static void umwait_update_control_msr(void * unused) { lockdep_assert_irqs_disabled(); - wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0); + wrmsrq(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached)); } /* @@ -71,7 +71,7 @@ static int umwait_cpu_offline(unsigned int cpu) * the original control MSR value in umwait_init(). So there * is no race condition here. */ - wrmsr(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached, 0); + wrmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); return 0; } @@ -214,7 +214,7 @@ static int __init umwait_init(void) * changed. This is the only place where orig_umwait_control_cached * is modified. */ - rdmsrl(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); + rdmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", umwait_cpu_online, umwait_cpu_offline); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 11f83d07925e..cb3f900c46fc 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -26,6 +26,7 @@ #include <linux/export.h> #include <linux/clocksource.h> #include <linux/cpu.h> +#include <linux/efi.h> #include <linux/reboot.h> #include <linux/static_call.h> #include <asm/div64.h> @@ -41,80 +42,97 @@ #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define CPUID_VMWARE_FEATURES_LEAF 0x40000010 -#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0) -#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1) -#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 - -#define VMWARE_CMD_GETVERSION 10 -#define VMWARE_CMD_GETHZ 45 -#define VMWARE_CMD_GETVCPU_INFO 68 -#define VMWARE_CMD_LEGACY_X2APIC 3 -#define VMWARE_CMD_VCPU_RESERVED 31 -#define VMWARE_CMD_STEALCLOCK 91 +#define GETVCPU_INFO_LEGACY_X2APIC BIT(3) +#define GETVCPU_INFO_VCPU_RESERVED BIT(31) #define STEALCLOCK_NOT_AVAILABLE (-1) #define STEALCLOCK_DISABLED 0 #define STEALCLOCK_ENABLED 1 -#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ - __asm__("inl (%%dx), %%eax" : \ - "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ - "a"(VMWARE_HYPERVISOR_MAGIC), \ - "c"(VMWARE_CMD_##cmd), \ - "d"(VMWARE_HYPERVISOR_PORT), "b"(UINT_MAX) : \ - "memory") - -#define VMWARE_VMCALL(cmd, eax, ebx, ecx, edx) \ - __asm__("vmcall" : \ - "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ - "a"(VMWARE_HYPERVISOR_MAGIC), \ - "c"(VMWARE_CMD_##cmd), \ - "d"(0), "b"(UINT_MAX) : \ - "memory") - -#define VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx) \ - __asm__("vmmcall" : \ - "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ - "a"(VMWARE_HYPERVISOR_MAGIC), \ - "c"(VMWARE_CMD_##cmd), \ - "d"(0), "b"(UINT_MAX) : \ - "memory") - -#define VMWARE_CMD(cmd, eax, ebx, ecx, edx) do { \ - switch (vmware_hypercall_mode) { \ - case CPUID_VMWARE_FEATURES_ECX_VMCALL: \ - VMWARE_VMCALL(cmd, eax, ebx, ecx, edx); \ - break; \ - case CPUID_VMWARE_FEATURES_ECX_VMMCALL: \ - VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx); \ - break; \ - default: \ - VMWARE_PORT(cmd, eax, ebx, ecx, edx); \ - break; \ - } \ - } while (0) - struct vmware_steal_time { union { - uint64_t clock; /* stolen time counter in units of vtsc */ + u64 clock; /* stolen time counter in units of vtsc */ struct { /* only for little-endian */ - uint32_t clock_low; - uint32_t clock_high; + u32 clock_low; + u32 clock_high; }; }; - uint64_t reserved[7]; + u64 reserved[7]; }; static unsigned long vmware_tsc_khz __ro_after_init; static u8 vmware_hypercall_mode __ro_after_init; +unsigned long vmware_hypercall_slow(unsigned long cmd, + unsigned long in1, unsigned long in3, + unsigned long in4, unsigned long in5, + u32 *out1, u32 *out2, u32 *out3, + u32 *out4, u32 *out5) +{ + unsigned long out0, rbx, rcx, rdx, rsi, rdi; + + switch (vmware_hypercall_mode) { + case CPUID_VMWARE_FEATURES_ECX_VMCALL: + asm_inline volatile ("vmcall" + : "=a" (out0), "=b" (rbx), "=c" (rcx), + "=d" (rdx), "=S" (rsi), "=D" (rdi) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (in3), + "S" (in4), + "D" (in5) + : "cc", "memory"); + break; + case CPUID_VMWARE_FEATURES_ECX_VMMCALL: + asm_inline volatile ("vmmcall" + : "=a" (out0), "=b" (rbx), "=c" (rcx), + "=d" (rdx), "=S" (rsi), "=D" (rdi) + : "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (in3), + "S" (in4), + "D" (in5) + : "cc", "memory"); + break; + default: + asm_inline volatile ("movw %[port], %%dx; inl (%%dx), %%eax" + : "=a" (out0), "=b" (rbx), "=c" (rcx), + "=d" (rdx), "=S" (rsi), "=D" (rdi) + : [port] "i" (VMWARE_HYPERVISOR_PORT), + "a" (VMWARE_HYPERVISOR_MAGIC), + "b" (in1), + "c" (cmd), + "d" (in3), + "S" (in4), + "D" (in5) + : "cc", "memory"); + break; + } + + if (out1) + *out1 = rbx; + if (out2) + *out2 = rcx; + if (out3) + *out3 = rdx; + if (out4) + *out4 = rsi; + if (out5) + *out5 = rdi; + + return out0; +} + static inline int __vmware_platform(void) { - uint32_t eax, ebx, ecx, edx; - VMWARE_CMD(GETVERSION, eax, ebx, ecx, edx); - return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; + u32 eax, ebx, ecx; + + eax = vmware_hypercall3(VMWARE_CMD_GETVERSION, 0, &ebx, &ecx); + return eax != UINT_MAX && ebx == VMWARE_HYPERVISOR_MAGIC; } static unsigned long vmware_get_tsc_khz(void) @@ -166,21 +184,12 @@ static void __init vmware_cyc2ns_setup(void) pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset); } -static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2) +static int vmware_cmd_stealclock(u32 addr_hi, u32 addr_lo) { - uint32_t result, info; - - asm volatile (VMWARE_HYPERCALL : - "=a"(result), - "=c"(info) : - "a"(VMWARE_HYPERVISOR_MAGIC), - "b"(0), - "c"(VMWARE_CMD_STEALCLOCK), - "d"(0), - "S"(arg1), - "D"(arg2) : - "memory"); - return result; + u32 info; + + return vmware_hypercall5(VMWARE_CMD_STEALCLOCK, 0, 0, addr_hi, addr_lo, + &info); } static bool stealclock_enable(phys_addr_t pa) @@ -215,15 +224,15 @@ static bool vmware_is_stealclock_available(void) * Return: * The steal clock reading in ns. */ -static uint64_t vmware_steal_clock(int cpu) +static u64 vmware_steal_clock(int cpu) { struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu); - uint64_t clock; + u64 clock; if (IS_ENABLED(CONFIG_64BIT)) clock = READ_ONCE(steal->clock); else { - uint32_t initial_high, low, high; + u32 initial_high, low, high; do { initial_high = READ_ONCE(steal->clock_high); @@ -235,7 +244,7 @@ static uint64_t vmware_steal_clock(int cpu) high = READ_ONCE(steal->clock_high); } while (initial_high != high); - clock = ((uint64_t)high << 32) | low; + clock = ((u64)high << 32) | low; } return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul, @@ -389,13 +398,13 @@ static void __init vmware_set_capabilities(void) static void __init vmware_platform_setup(void) { - uint32_t eax, ebx, ecx, edx; - uint64_t lpj, tsc_khz; + u32 eax, ebx, ecx; + u64 lpj, tsc_khz; - VMWARE_CMD(GETHZ, eax, ebx, ecx, edx); + eax = vmware_hypercall3(VMWARE_CMD_GETHZ, UINT_MAX, &ebx, &ecx); if (ebx != UINT_MAX) { - lpj = tsc_khz = eax | (((uint64_t)ebx) << 32); + lpj = tsc_khz = eax | (((u64)ebx) << 32); do_div(tsc_khz, 1000); WARN_ON(tsc_khz >> 32); pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", @@ -421,6 +430,9 @@ static void __init vmware_platform_setup(void) pr_warn("Failed to get TSC freq from the hypervisor\n"); } + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !efi_enabled(EFI_BOOT)) + x86_init.mpparse.find_mptable = mpparse_find_mptable; + vmware_paravirt_ops_setup(); #ifdef CONFIG_X86_IO_APIC @@ -446,7 +458,7 @@ static u8 __init vmware_select_hypercall(void) * If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode * intentionally defaults to 0. */ -static uint32_t __init vmware_platform(void) +static u32 __init vmware_platform(void) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { unsigned int eax; @@ -474,11 +486,64 @@ static uint32_t __init vmware_platform(void) /* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ static bool __init vmware_legacy_x2apic_available(void) { - uint32_t eax, ebx, ecx, edx; - VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx); - return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) && - (eax & BIT(VMWARE_CMD_LEGACY_X2APIC)); + u32 eax; + + eax = vmware_hypercall1(VMWARE_CMD_GETVCPU_INFO, 0); + return !(eax & GETVCPU_INFO_VCPU_RESERVED) && + (eax & GETVCPU_INFO_LEGACY_X2APIC); +} + +#ifdef CONFIG_INTEL_TDX_GUEST +/* + * TDCALL[TDG.VP.VMCALL] uses %rax (arg0) and %rcx (arg2). Therefore, + * we remap those registers to %r12 and %r13, respectively. + */ +unsigned long vmware_tdx_hypercall(unsigned long cmd, + unsigned long in1, unsigned long in3, + unsigned long in4, unsigned long in5, + u32 *out1, u32 *out2, u32 *out3, + u32 *out4, u32 *out5) +{ + struct tdx_module_args args = {}; + + if (!hypervisor_is_type(X86_HYPER_VMWARE)) { + pr_warn_once("Incorrect usage\n"); + return ULONG_MAX; + } + + if (cmd & ~VMWARE_CMD_MASK) { + pr_warn_once("Out of range command %lx\n", cmd); + return ULONG_MAX; + } + + args.rbx = in1; + args.rdx = in3; + args.rsi = in4; + args.rdi = in5; + args.r10 = VMWARE_TDX_VENDOR_LEAF; + args.r11 = VMWARE_TDX_HCALL_FUNC; + args.r12 = VMWARE_HYPERVISOR_MAGIC; + args.r13 = cmd; + /* CPL */ + args.r15 = 0; + + __tdx_hypercall(&args); + + if (out1) + *out1 = args.rbx; + if (out2) + *out2 = args.r13; + if (out3) + *out3 = args.rdx; + if (out4) + *out4 = args.rsi; + if (out5) + *out5 = args.rdi; + + return args.r12; } +EXPORT_SYMBOL_GPL(vmware_tdx_hypercall); +#endif #ifdef CONFIG_AMD_MEM_ENCRYPT static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb, diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 90eba7eb5335..89b1c8a70fe8 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -4,6 +4,7 @@ #include <asm/cpu.h> #include <asm/cpufeature.h> +#include <asm/msr.h> #include "cpu.h" diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index e74d0c4286c1..bcb534688dfe 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -128,7 +128,19 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif - crash_save_cpu(regs, safe_smp_processor_id()); + + /* + * Non-crash kexec calls enc_kexec_begin() while scheduling is still + * active. This allows the callback to wait until all in-flight + * shared<->private conversions are complete. In a crash scenario, + * enc_kexec_begin() gets called after all but one CPU have been shut + * down and interrupts have been disabled. This allows the callback to + * detect a race with the conversion and report it. + */ + x86_platform.guest.enc_kexec_begin(); + x86_platform.guest.enc_kexec_finish(); + + crash_save_cpu(regs, smp_processor_id()); } #if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG) @@ -266,6 +278,7 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, unsigned long long mend) { unsigned long start, end; + int ret; cmem->ranges[0].start = mstart; cmem->ranges[0].end = mend; @@ -274,22 +287,43 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, /* Exclude elf header region */ start = image->elf_load_addr; end = start + image->elf_headers_sz - 1; - return crash_exclude_mem_range(cmem, start, end); + ret = crash_exclude_mem_range(cmem, start, end); + + if (ret) + return ret; + + /* Exclude dm crypt keys region */ + if (image->dm_crypt_keys_addr) { + start = image->dm_crypt_keys_addr; + end = start + image->dm_crypt_keys_sz - 1; + return crash_exclude_mem_range(cmem, start, end); + } + + return ret; } /* Prepare memory map for crash dump kernel */ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) { + unsigned int nr_ranges = 0; int i, ret = 0; unsigned long flags; struct e820_entry ei; struct crash_memmap_data cmd; struct crash_mem *cmem; - cmem = vzalloc(struct_size(cmem, ranges, 1)); + /* + * Using random kexec_buf for passing dm crypt keys may cause a range + * split. So use two slots here. + */ + nr_ranges = 2; + cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return -ENOMEM; + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; + memset(&cmd, 0, sizeof(struct crash_memmap_data)); cmd.params = params; @@ -402,20 +436,26 @@ int crash_load_segments(struct kimage *image) #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt -/* These functions provide the value for the sysfs crash_hotplug nodes */ -#ifdef CONFIG_HOTPLUG_CPU -int arch_crash_hotplug_cpu_support(void) +int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags) { - return crash_check_update_elfcorehdr(); -} -#endif -#ifdef CONFIG_MEMORY_HOTPLUG -int arch_crash_hotplug_memory_support(void) -{ - return crash_check_update_elfcorehdr(); -} +#ifdef CONFIG_KEXEC_FILE + if (image->file_mode) + return 1; #endif + /* + * Initially, crash hotplug support for kexec_load was added + * with the KEXEC_UPDATE_ELFCOREHDR flag. Later, this + * functionality was expanded to accommodate multiple kexec + * segment updates, leading to the introduction of the + * KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag bit. Consequently, + * when the kexec tool sends either of these flags, it indicates + * that the required kexec segment (elfcorehdr) is excluded from + * the SHA calculation. + */ + return (kexec_flags & KEXEC_UPDATE_ELFCOREHDR || + kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT); +} unsigned int arch_crash_get_elfcorehdr_size(void) { @@ -432,10 +472,12 @@ unsigned int arch_crash_get_elfcorehdr_size(void) /** * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes * @image: a pointer to kexec_crash_image + * @arg: struct memory_notify handler for memory hotplug case and + * NULL for CPU hotplug case. * * Prepare the new elfcorehdr and replace the existing elfcorehdr. */ -void arch_crash_handle_hotplug_event(struct kimage *image) +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { void *elfbuf = NULL, *old_elfcorehdr; unsigned long nr_mem_ranges; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 003e0298f46a..dd8748c45529 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -2,6 +2,7 @@ /* * Architecture specific OF callbacks. */ +#include <linux/acpi.h> #include <linux/export.h> #include <linux/io.h> #include <linux/interrupt.h> @@ -24,6 +25,7 @@ #include <asm/pci_x86.h> #include <asm/setup.h> #include <asm/i8259.h> +#include <asm/numa.h> #include <asm/prom.h> __initdata u64 initial_dtb; @@ -82,7 +84,7 @@ static int x86_of_pci_irq_enable(struct pci_dev *dev) ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (ret) - return ret; + return pcibios_err_to_errno(ret); if (!pin) return 0; @@ -137,6 +139,7 @@ static void __init dtb_cpu_setup(void) continue; } topology_register_apic(apic_id, CPU_ACPIID_INVALID, true); + set_apicid_to_node(apic_id, of_node_to_nid(dn)); } } @@ -277,9 +280,18 @@ static void __init dtb_apic_setup(void) dtb_ioapic_setup(); } -#ifdef CONFIG_OF_EARLY_FLATTREE +static void __init x86_dtb_parse_smp_config(void) +{ + if (!of_have_populated_dt()) + return; + + dtb_setup_hpet(); + dtb_apic_setup(); +} + void __init x86_flattree_get_config(void) { +#ifdef CONFIG_OF_EARLY_FLATTREE u32 size, map_len; void *dt; @@ -294,21 +306,14 @@ void __init x86_flattree_get_config(void) map_len = size; } - early_init_dt_verify(dt); + early_init_dt_verify(dt, __pa(dt)); } unflatten_and_copy_device_tree(); if (initial_dtb) early_memunmap(dt, map_len); -} #endif - -void __init x86_dtb_parse_smp_config(void) -{ - if (!of_have_populated_dt()) - return; - - dtb_setup_hpet(); - dtb_apic_setup(); + if (acpi_disabled && of_have_populated_dt()) + x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 44a91ef5a23b..71ee20102a8a 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -23,8 +23,6 @@ #include <asm/stacktrace.h> #include <asm/unwind.h> -int panic_on_unrecovered_nmi; -int panic_on_io_nmi; static int die_counter; static struct pt_regs exec_summary_regs; @@ -195,6 +193,7 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); + stack = stack ?: get_stack_pointer(task, regs); regs = unwind_get_entry_regs(&state, &partial); /* @@ -213,9 +212,7 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - hardirq stack * - entry stack */ - for (stack = stack ?: get_stack_pointer(task, regs); - stack; - stack = stack_info.next_sp) { + for (; stack; stack = stack_info.next_sp) { const char *stack_name; stack = PTR_ALIGN(stack, sizeof(long)); @@ -395,18 +392,13 @@ NOKPROBE_SYMBOL(oops_end); static void __die_header(const char *str, struct pt_regs *regs, long err) { - const char *pr = ""; - /* Save the regs of the first oops for the executive summary later. */ if (!die_counter) exec_summary_regs = *regs; - if (IS_ENABLED(CONFIG_PREEMPTION)) - pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; - printk(KERN_DEFAULT - "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, - pr, + "Oops: %s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, + ++die_counter, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b4905d5173fd..722fd712e1cf 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -37,7 +37,7 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* @@ -62,7 +62,7 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(pcpu_hot.softirq_stack_ptr); + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index f05339fee778..6c5defd6569a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -134,7 +134,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(pcpu_hot.hardirq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin; /* diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 6f1b379e3b38..c3acbd26408b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -28,18 +28,13 @@ * the first 128 E820 memory entries in boot_params.e820_table and the remaining * (if any) entries of the SETUP_E820_EXT nodes. We use this to: * - * - inform the user about the firmware's notion of memory layout - * via /sys/firmware/memmap - * * - the hibernation code uses it to generate a kernel-independent CRC32 * checksum of the physical memory layout of a system. * * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version * passed to us by the bootloader - the major difference between - * e820_table_firmware[] and this one is that, the latter marks the setup_data - * list created by the EFI boot stub as reserved, so that kexec can reuse the - * setup_data information in the second kernel. Besides, e820_table_kexec[] - * might also be modified by the kexec itself to fake a mptable. + * e820_table_firmware[] and this one is that e820_table_kexec[] + * might be modified by the kexec itself to fake an mptable. * We use this to: * * - kexec, which is a bootloader in disguise, uses the original E820 @@ -47,6 +42,11 @@ * can have a restricted E820 map while the kexec()-ed kexec-kernel * can have access to full memory - etc. * + * Export the memory layout via /sys/firmware/memmap. kexec-tools uses + * the entries to create an E820 table for the kexec kernel. + * + * kexec_file_load in-kernel code uses the table for the kexec kernel. + * * - 'e820_table': this is the main E820 table that is massaged by the * low level x86 platform code, or modified by boot parameters, before * passed on to higher level MM layers. @@ -187,8 +187,7 @@ void __init e820__range_add(u64 start, u64 size, enum e820_type type) static void __init e820_print_type(enum e820_type type) { switch (type) { - case E820_TYPE_RAM: /* Fall through: */ - case E820_TYPE_RESERVED_KERN: pr_cont("usable"); break; + case E820_TYPE_RAM: pr_cont("usable"); break; case E820_TYPE_RESERVED: pr_cont("reserved"); break; case E820_TYPE_SOFT_RESERVED: pr_cont("soft reserved"); break; case E820_TYPE_ACPI: pr_cont("ACPI data"); break; @@ -532,9 +531,10 @@ u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum return __e820__range_update(e820_table, start, size, old_type, new_type); } -static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type) +u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size, + enum e820_type old_type, enum e820_type new_type) { - return __e820__range_update(e820_table_kexec, start, size, old_type, new_type); + return __e820__range_update(t, start, size, old_type, new_type); } /* Remove a range of memory from the E820 table: */ @@ -753,22 +753,21 @@ void __init e820__memory_setup_extended(u64 phys_addr, u32 data_len) void __init e820__register_nosave_regions(unsigned long limit_pfn) { int i; - unsigned long pfn = 0; + u64 last_addr = 0; for (i = 0; i < e820_table->nr_entries; i++) { struct e820_entry *entry = &e820_table->entries[i]; - if (pfn < PFN_UP(entry->addr)) - register_nosave_region(pfn, PFN_UP(entry->addr)); - - pfn = PFN_DOWN(entry->addr + entry->size); + if (entry->type != E820_TYPE_RAM) + continue; - if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) - register_nosave_region(PFN_UP(entry->addr), pfn); + if (last_addr < entry->addr) + register_nosave_region(PFN_DOWN(last_addr), PFN_UP(entry->addr)); - if (pfn >= limit_pfn) - break; + last_addr = entry->addr + entry->size; } + + register_nosave_region(PFN_DOWN(last_addr), limit_pfn); } #ifdef CONFIG_ACPI @@ -806,7 +805,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) addr = memblock_phys_alloc(size, align); if (addr) { - e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__range_update_table(e820_table_kexec, addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED); pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n"); e820__update_table_kexec(); } @@ -827,7 +826,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type) +static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn) { int i; unsigned long last_pfn = 0; @@ -838,7 +837,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type unsigned long start_pfn; unsigned long end_pfn; - if (entry->type != type) + if (entry->type != E820_TYPE_RAM && + entry->type != E820_TYPE_ACPI) continue; start_pfn = entry->addr >> PAGE_SHIFT; @@ -864,12 +864,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type unsigned long __init e820__end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM); + return e820__end_ram_pfn(MAX_ARCH_PFN); } unsigned long __init e820__end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM); + return e820__end_ram_pfn(1UL << (32 - PAGE_SHIFT)); } static void __init early_panic(char *msg) @@ -989,60 +989,6 @@ static int __init parse_memmap_opt(char *str) early_param("memmap", parse_memmap_opt); /* - * Reserve all entries from the bootloader's extensible data nodes list, - * because if present we are going to use it later on to fetch e820 - * entries from it: - */ -void __init e820__reserve_setup_data(void) -{ - struct setup_indirect *indirect; - struct setup_data *data; - u64 pa_data, pa_next; - u32 len; - - pa_data = boot_params.hdr.setup_data; - if (!pa_data) - return; - - while (pa_data) { - data = early_memremap(pa_data, sizeof(*data)); - if (!data) { - pr_warn("e820: failed to memremap setup_data entry\n"); - return; - } - - len = sizeof(*data); - pa_next = data->next; - - e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - - if (data->type == SETUP_INDIRECT) { - len += data->len; - early_memunmap(data, sizeof(*data)); - data = early_memremap(pa_data, len); - if (!data) { - pr_warn("e820: failed to memremap indirect setup_data\n"); - return; - } - - indirect = (struct setup_indirect *)data->data; - - if (indirect->type != SETUP_INDIRECT) - e820__range_update(indirect->addr, indirect->len, - E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); - } - - pa_data = pa_next; - early_memunmap(data, len); - } - - e820__update_table(e820_table); - - pr_info("extended physical RAM map:\n"); - e820__print_table("reserve setup_data"); -} - -/* * Called after parse_early_param(), after early parameters (such as mem=) * have been processed, in which case we already have an E820 table filled in * via the parameter callback function(s), but it's not sorted and printed yet: @@ -1061,7 +1007,6 @@ void __init e820__finish_early_params(void) static const char *__init e820_type_to_string(struct e820_entry *entry) { switch (entry->type) { - case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: return "System RAM"; case E820_TYPE_ACPI: return "ACPI Tables"; case E820_TYPE_NVS: return "ACPI Non-volatile Storage"; @@ -1077,7 +1022,6 @@ static const char *__init e820_type_to_string(struct e820_entry *entry) static unsigned long __init e820_type_to_iomem_type(struct e820_entry *entry) { switch (entry->type) { - case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: return IORESOURCE_SYSTEM_RAM; case E820_TYPE_ACPI: /* Fall-through: */ case E820_TYPE_NVS: /* Fall-through: */ @@ -1099,7 +1043,6 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; case E820_TYPE_SOFT_RESERVED: return IORES_DESC_SOFT_RESERVED; - case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: /* Fall-through: */ case E820_TYPE_UNUSABLE: /* Fall-through: */ default: return IORES_DESC_NONE; @@ -1122,7 +1065,6 @@ static bool __init do_mark_busy(enum e820_type type, struct resource *res) case E820_TYPE_PRAM: case E820_TYPE_PMEM: return false; - case E820_TYPE_RESERVED_KERN: case E820_TYPE_RAM: case E820_TYPE_ACPI: case E820_TYPE_NVS: @@ -1144,11 +1086,8 @@ void __init e820__reserve_resources(void) struct resource *res; u64 end; - res = memblock_alloc(sizeof(*res) * e820_table->nr_entries, + res = memblock_alloc_or_panic(sizeof(*res) * e820_table->nr_entries, SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*res) * e820_table->nr_entries); e820_res = res; for (i = 0; i < e820_table->nr_entries; i++) { @@ -1177,9 +1116,9 @@ void __init e820__reserve_resources(void) res++; } - /* Expose the bootloader-provided memory layout to the sysfs. */ - for (i = 0; i < e820_table_firmware->nr_entries; i++) { - struct e820_entry *entry = e820_table_firmware->entries + i; + /* Expose the kexec e820 table to the sysfs. */ + for (i = 0; i < e820_table_kexec->nr_entries; i++) { + struct e820_entry *entry = e820_table_kexec->entries + i; firmware_map_add_early(entry->addr, entry->addr + entry->size, e820_type_to_string(entry)); } @@ -1303,6 +1242,36 @@ void __init e820__memblock_setup(void) int i; u64 end; +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. + */ + if (movable_node_is_enabled()) + memblock_set_bottom_up(true); +#endif + + /* + * At this point only the first megabyte is mapped for sure, the + * rest of the memory cannot be used for memblock resizing + */ + memblock_set_current_limit(ISA_END_ADDRESS); + /* * The bootstrap memblock region count maximum is 128 entries * (INIT_MEMBLOCK_REGIONS), but EFI might pass us more E820 entries @@ -1324,12 +1293,38 @@ void __init e820__memblock_setup(void) if (entry->type == E820_TYPE_SOFT_RESERVED) memblock_reserve(entry->addr, entry->size); - if (entry->type != E820_TYPE_RAM && entry->type != E820_TYPE_RESERVED_KERN) + if (entry->type != E820_TYPE_RAM) continue; memblock_add(entry->addr, entry->size); } + /* + * At this point memblock is only allowed to allocate from memory + * below 1M (aka ISA_END_ADDRESS) up until direct map is completely set + * up in init_mem_mapping(). + * + * KHO kernels are special and use only scratch memory for memblock + * allocations, but memory below 1M is ignored by kernel after early + * boot and cannot be naturally marked as scratch. + * + * To allow allocation of the real-mode trampoline and a few (if any) + * other very early allocations from below 1M forcibly mark the memory + * below 1M as scratch. + * + * After real mode trampoline is allocated, we clear that scratch + * marking. + */ + memblock_mark_kho_scratch(0, SZ_1M); + + /* + * 32-bit systems are limited to 4BG of memory even with HIGHMEM and + * to even less without it. + * Discard memory after max_pfn - the actual limit detected at runtime. + */ + if (IS_ENABLED(CONFIG_X86_32)) + memblock_remove(PFN_PHYS(max_pfn), -1); + /* Throw away partial pages: */ memblock_trim_memory(PAGE_SIZE); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 59f4aefc6bc1..6b6f32f40cbe 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -17,8 +17,8 @@ #include <linux/bcma/bcma.h> #include <linux/bcma/bcma_regs.h> #include <linux/platform_data/x86/apple.h> -#include <drm/i915_drm.h> -#include <drm/i915_pciids.h> +#include <drm/intel/i915_drm.h> +#include <drm/intel/pciids.h> #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/io_apic.h> @@ -518,47 +518,46 @@ static const struct intel_early_ops gen11_early_ops __initconst = { /* Intel integrated GPUs for which we need to reserve "stolen memory" */ static const struct pci_device_id intel_early_ids[] __initconst = { - INTEL_I830_IDS(&i830_early_ops), - INTEL_I845G_IDS(&i845_early_ops), - INTEL_I85X_IDS(&i85x_early_ops), - INTEL_I865G_IDS(&i865_early_ops), - INTEL_I915G_IDS(&gen3_early_ops), - INTEL_I915GM_IDS(&gen3_early_ops), - INTEL_I945G_IDS(&gen3_early_ops), - INTEL_I945GM_IDS(&gen3_early_ops), - INTEL_VLV_IDS(&gen6_early_ops), - INTEL_PINEVIEW_G_IDS(&gen3_early_ops), - INTEL_PINEVIEW_M_IDS(&gen3_early_ops), - INTEL_I965G_IDS(&gen3_early_ops), - INTEL_G33_IDS(&gen3_early_ops), - INTEL_I965GM_IDS(&gen3_early_ops), - INTEL_GM45_IDS(&gen3_early_ops), - INTEL_G45_IDS(&gen3_early_ops), - INTEL_IRONLAKE_D_IDS(&gen3_early_ops), - INTEL_IRONLAKE_M_IDS(&gen3_early_ops), - INTEL_SNB_D_IDS(&gen6_early_ops), - INTEL_SNB_M_IDS(&gen6_early_ops), - INTEL_IVB_M_IDS(&gen6_early_ops), - INTEL_IVB_D_IDS(&gen6_early_ops), - INTEL_HSW_IDS(&gen6_early_ops), - INTEL_BDW_IDS(&gen8_early_ops), - INTEL_CHV_IDS(&chv_early_ops), - INTEL_SKL_IDS(&gen9_early_ops), - INTEL_BXT_IDS(&gen9_early_ops), - INTEL_KBL_IDS(&gen9_early_ops), - INTEL_CFL_IDS(&gen9_early_ops), - INTEL_GLK_IDS(&gen9_early_ops), - INTEL_CNL_IDS(&gen9_early_ops), - INTEL_ICL_11_IDS(&gen11_early_ops), - INTEL_EHL_IDS(&gen11_early_ops), - INTEL_JSL_IDS(&gen11_early_ops), - INTEL_TGL_12_IDS(&gen11_early_ops), - INTEL_RKL_IDS(&gen11_early_ops), - INTEL_ADLS_IDS(&gen11_early_ops), - INTEL_ADLP_IDS(&gen11_early_ops), - INTEL_ADLN_IDS(&gen11_early_ops), - INTEL_RPLS_IDS(&gen11_early_ops), - INTEL_RPLP_IDS(&gen11_early_ops), + INTEL_I830_IDS(INTEL_VGA_DEVICE, &i830_early_ops), + INTEL_I845G_IDS(INTEL_VGA_DEVICE, &i845_early_ops), + INTEL_I85X_IDS(INTEL_VGA_DEVICE, &i85x_early_ops), + INTEL_I865G_IDS(INTEL_VGA_DEVICE, &i865_early_ops), + INTEL_I915G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_I915GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_I945G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_I945GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_VLV_IDS(INTEL_VGA_DEVICE, &gen6_early_ops), + INTEL_PNV_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_I965G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_G33_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_I965GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_GM45_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_G45_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_ILK_IDS(INTEL_VGA_DEVICE, &gen3_early_ops), + INTEL_SNB_IDS(INTEL_VGA_DEVICE, &gen6_early_ops), + INTEL_IVB_IDS(INTEL_VGA_DEVICE, &gen6_early_ops), + INTEL_HSW_IDS(INTEL_VGA_DEVICE, &gen6_early_ops), + INTEL_BDW_IDS(INTEL_VGA_DEVICE, &gen8_early_ops), + INTEL_CHV_IDS(INTEL_VGA_DEVICE, &chv_early_ops), + INTEL_SKL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_BXT_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_KBL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_CFL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_WHL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_CML_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_GLK_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_CNL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops), + INTEL_ICL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_EHL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_JSL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_TGL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_RKL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_ADLS_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_ADLP_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_ADLN_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_RPLS_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_RPLU_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), + INTEL_RPLP_IDS(INTEL_VGA_DEVICE, &gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 44f937015e1e..cba75306e5b6 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/console.h> #include <linux/kernel.h> +#include <linux/kexec.h> #include <linux/init.h> #include <linux/string.h> #include <linux/screen_info.h> @@ -19,6 +20,7 @@ #include <linux/usb/ehci_def.h> #include <linux/usb/xhci-dbgp.h> #include <asm/pci_x86.h> +#include <linux/static_call.h> /* Simple VGA output */ #define VGABASE (__ISA_IO_base + 0xb8000) @@ -94,26 +96,28 @@ static unsigned long early_serial_base = 0x3f8; /* ttyS0 */ #define DLL 0 /* Divisor Latch Low */ #define DLH 1 /* Divisor latch High */ -static unsigned int io_serial_in(unsigned long addr, int offset) +static __noendbr unsigned int io_serial_in(unsigned long addr, int offset) { return inb(addr + offset); } +ANNOTATE_NOENDBR_SYM(io_serial_in); -static void io_serial_out(unsigned long addr, int offset, int value) +static __noendbr void io_serial_out(unsigned long addr, int offset, int value) { outb(value, addr + offset); } +ANNOTATE_NOENDBR_SYM(io_serial_out); -static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in; -static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out; +DEFINE_STATIC_CALL(serial_in, io_serial_in); +DEFINE_STATIC_CALL(serial_out, io_serial_out); static int early_serial_putc(unsigned char ch) { unsigned timeout = 0xffff; - while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout) + while ((static_call(serial_in)(early_serial_base, LSR) & XMTRDY) == 0 && --timeout) cpu_relax(); - serial_out(early_serial_base, TXR, ch); + static_call(serial_out)(early_serial_base, TXR, ch); return timeout ? 0 : -1; } @@ -131,16 +135,21 @@ static __init void early_serial_hw_init(unsigned divisor) { unsigned char c; - serial_out(early_serial_base, LCR, 0x3); /* 8n1 */ - serial_out(early_serial_base, IER, 0); /* no interrupt */ - serial_out(early_serial_base, FCR, 0); /* no fifo */ - serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */ + static_call(serial_out)(early_serial_base, LCR, 0x3); /* 8n1 */ + static_call(serial_out)(early_serial_base, IER, 0); /* no interrupt */ + static_call(serial_out)(early_serial_base, FCR, 0); /* no fifo */ + static_call(serial_out)(early_serial_base, MCR, 0x3); /* DTR + RTS */ - c = serial_in(early_serial_base, LCR); - serial_out(early_serial_base, LCR, c | DLAB); - serial_out(early_serial_base, DLL, divisor & 0xff); - serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff); - serial_out(early_serial_base, LCR, c & ~DLAB); + c = static_call(serial_in)(early_serial_base, LCR); + static_call(serial_out)(early_serial_base, LCR, c | DLAB); + static_call(serial_out)(early_serial_base, DLL, divisor & 0xff); + static_call(serial_out)(early_serial_base, DLH, (divisor >> 8) & 0xff); + static_call(serial_out)(early_serial_base, LCR, c & ~DLAB); + +#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64) + if (static_call_query(serial_in) == io_serial_in) + kexec_debug_8250_port = early_serial_base; +#endif } #define DEFAULT_BAUD 9600 @@ -183,30 +192,66 @@ static __init void early_serial_init(char *s) /* Convert from baud to divisor value */ divisor = 115200 / baud; - /* These will always be IO based ports */ - serial_in = io_serial_in; - serial_out = io_serial_out; - /* Set up the HW */ early_serial_hw_init(divisor); } -#ifdef CONFIG_PCI -static void mem32_serial_out(unsigned long addr, int offset, int value) +static __noendbr void mem32_serial_out(unsigned long addr, int offset, int value) { u32 __iomem *vaddr = (u32 __iomem *)addr; /* shift implied by pointer type */ writel(value, vaddr + offset); } +ANNOTATE_NOENDBR_SYM(mem32_serial_out); -static unsigned int mem32_serial_in(unsigned long addr, int offset) +static __noendbr unsigned int mem32_serial_in(unsigned long addr, int offset) { u32 __iomem *vaddr = (u32 __iomem *)addr; /* shift implied by pointer type */ return readl(vaddr + offset); } +ANNOTATE_NOENDBR_SYM(mem32_serial_in); /* + * early_mmio_serial_init() - Initialize MMIO-based early serial console. + * @s: MMIO-based serial specification. + */ +static __init void early_mmio_serial_init(char *s) +{ + unsigned long baudrate; + unsigned long membase; + char *e; + + if (*s == ',') + s++; + + if (!strncmp(s, "0x", 2)) { + /* NB: only 32-bit addresses are supported. */ + membase = simple_strtoul(s, &e, 16); + early_serial_base = (unsigned long)early_ioremap(membase, PAGE_SIZE); + + static_call_update(serial_in, mem32_serial_in); + static_call_update(serial_out, mem32_serial_out); + + s += strcspn(s, ","); + if (*s == ',') + s++; + } + + if (!strncmp(s, "nocfg", 5)) { + baudrate = 0; + } else { + baudrate = simple_strtoul(s, &e, 0); + if (baudrate == 0 || s == e) + baudrate = DEFAULT_BAUD; + } + + if (baudrate) + early_serial_hw_init(115200 / baudrate); +} + +#ifdef CONFIG_PCI +/* * early_pci_serial_init() * * This function is invoked when the early_printk param starts with "pciserial" @@ -278,18 +323,19 @@ static __init void early_pci_serial_init(char *s) */ if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) { /* it is IO mapped */ - serial_in = io_serial_in; - serial_out = io_serial_out; early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK; write_pci_config(bus, slot, func, PCI_COMMAND, cmdreg|PCI_COMMAND_IO); } else { /* It is memory mapped - assume 32-bit alignment */ - serial_in = mem32_serial_in; - serial_out = mem32_serial_out; + static_call_update(serial_in, mem32_serial_in); + static_call_update(serial_out, mem32_serial_out); /* WARNING! assuming the address is always in the first 4G */ early_serial_base = (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10); +#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64) + kexec_debug_8250_mmio32 = bar0 & PCI_BASE_ADDRESS_MEM_MASK; +#endif write_pci_config(bus, slot, func, PCI_COMMAND, cmdreg|PCI_COMMAND_MEMORY); } @@ -352,6 +398,11 @@ static int __init setup_early_printk(char *buf) keep = (strstr(buf, "keep") != NULL); while (*buf != '\0') { + if (!strncmp(buf, "mmio32", 6)) { + buf += 6; + early_mmio_serial_init(buf); + early_console_register(&early_serial_console, keep); + } if (!strncmp(buf, "serial", 6)) { buf += 6; early_serial_init(buf); @@ -365,9 +416,9 @@ static int __init setup_early_printk(char *buf) } #ifdef CONFIG_PCI if (!strncmp(buf, "pciserial", 9)) { - early_pci_serial_init(buf + 9); + buf += 9; /* Keep from match the above "pciserial" */ + early_pci_serial_init(buf); early_console_register(&early_serial_console, keep); - buf += 9; /* Keep from match the above "serial" */ } #endif if (!strncmp(buf, "vga", 3) && diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c index e963344b0449..9535a6507db7 100644 --- a/arch/x86/kernel/eisa.c +++ b/arch/x86/kernel/eisa.c @@ -2,6 +2,7 @@ /* * EISA specific code */ +#include <linux/cc_platform.h> #include <linux/ioport.h> #include <linux/eisa.h> #include <linux/io.h> @@ -10,15 +11,15 @@ static __init int eisa_bus_probe(void) { - void __iomem *p; + u32 *p; - if (xen_pv_domain() && !xen_initial_domain()) + if ((xen_pv_domain() && !xen_initial_domain()) || cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) return 0; - p = ioremap(0x0FFFD9, 4); - if (p && readl(p) == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24)) + p = memremap(0x0FFFD9, 4, MEMREMAP_WB); + if (p && *p == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24)) EISA_bus = 1; - iounmap(p); + memunmap(p); return 0; } subsys_initcall(eisa_bus_probe); diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index f6d856bd50bc..10d0a720659c 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -53,7 +53,7 @@ static inline void fpregs_activate(struct fpu *fpu) /* Internal helper for switch_fpu_return() and signal frame setup */ static inline void fpregs_restore_userregs(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); int cpu = smp_processor_id(); if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) @@ -67,7 +67,7 @@ static inline void fpregs_restore_userregs(void) * If PKRU is enabled, then the PKRU value is already * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be - * not up to date in current->thread.fpu.xsave state. + * not up to date in current->thread.fpu->xsave state. * * XFD state is handled in restore_fpregs_from_fpstate(). */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 520deb411a70..ea138583dd92 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -11,6 +11,7 @@ #include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> +#include <asm/msr.h> #include <asm/traps.h> #include <asm/irq_regs.h> @@ -43,14 +44,27 @@ struct fpu_state_config fpu_user_cfg __ro_after_init; */ struct fpstate init_fpstate __ro_after_init; -/* Track in-kernel FPU usage */ -static DEFINE_PER_CPU(bool, in_kernel_fpu); +/* + * Track FPU initialization and kernel-mode usage. 'true' means the FPU is + * initialized and is not currently being used by the kernel: + */ +DEFINE_PER_CPU(bool, kernel_fpu_allowed); /* * Track which context is using the FPU on the CPU: */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); +#ifdef CONFIG_X86_DEBUG_FPU +struct fpu *x86_task_fpu(struct task_struct *task) +{ + if (WARN_ON_ONCE(task->flags & PF_KTHREAD)) + return NULL; + + return (void *)task + sizeof(*task); +} +#endif + /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? @@ -60,8 +74,18 @@ bool irq_fpu_usable(void) if (WARN_ON_ONCE(in_nmi())) return false; - /* In kernel FPU usage already active? */ - if (this_cpu_read(in_kernel_fpu)) + /* + * Return false in the following cases: + * + * - FPU is not yet initialized. This can happen only when the call is + * coming from CPU onlining, for example for microcode checksumming. + * - The kernel is already using the FPU, either because of explicit + * nesting (which should never be done), or because of implicit + * nesting when a hardirq interrupted a kernel-mode FPU section. + * + * The single boolean check below handles both cases: + */ + if (!this_cpu_read(kernel_fpu_allowed)) return false; /* @@ -145,8 +169,8 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) asm volatile( "fnclex\n\t" "emms\n\t" - "fildl %P[addr]" /* set F?P to defined value */ - : : [addr] "m" (fpstate)); + "fildl %[addr]" /* set F?P to defined value */ + : : [addr] "m" (*fpstate)); } if (use_xsave()) { @@ -195,7 +219,7 @@ void fpu_reset_from_exception_fixup(void) #if IS_ENABLED(CONFIG_KVM) static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); -static void fpu_init_guest_permissions(struct fpu_guest *gfpu) +static void fpu_lock_guest_permissions(void) { struct fpu_state_perm *fpuperm; u64 perm; @@ -204,15 +228,13 @@ static void fpu_init_guest_permissions(struct fpu_guest *gfpu) return; spin_lock_irq(¤t->sighand->siglock); - fpuperm = ¤t->group_leader->thread.fpu.guest_perm; + fpuperm = &x86_task_fpu(current->group_leader)->guest_perm; perm = fpuperm->__state_perm; /* First fpstate allocation locks down permissions. */ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); spin_unlock_irq(¤t->sighand->siglock); - - gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; } bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) @@ -220,7 +242,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) struct fpstate *fpstate; unsigned int size; - size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); + size = fpu_kernel_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); fpstate = vzalloc(size); if (!fpstate) return false; @@ -232,8 +254,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) fpstate->is_guest = true; gfpu->fpstate = fpstate; - gfpu->xfeatures = fpu_user_cfg.default_features; - gfpu->perm = fpu_user_cfg.default_features; + gfpu->xfeatures = fpu_kernel_cfg.default_features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state @@ -248,7 +269,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) gfpu->uabi_size = fpu_user_cfg.default_size; - fpu_init_guest_permissions(gfpu); + fpu_lock_guest_permissions(); return true; } @@ -256,16 +277,16 @@ EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); void fpu_free_guest_fpstate(struct fpu_guest *gfpu) { - struct fpstate *fps = gfpu->fpstate; + struct fpstate *fpstate = gfpu->fpstate; - if (!fps) + if (!fpstate) return; - if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) + if (WARN_ON_ONCE(!fpstate->is_valloc || !fpstate->is_guest || fpstate->in_use)) return; gfpu->fpstate = NULL; - vfree(fps); + vfree(fpstate); } EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); @@ -316,12 +337,12 @@ EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); */ void fpu_sync_guest_vmexit_xfd_state(void) { - struct fpstate *fps = current->thread.fpu.fpstate; + struct fpstate *fpstate = x86_task_fpu(current)->fpstate; lockdep_assert_irqs_disabled(); if (fpu_state_size_dynamic()) { - rdmsrl(MSR_IA32_XFD, fps->xfd); - __this_cpu_write(xfd_state, fps->xfd); + rdmsrq(MSR_IA32_XFD, fpstate->xfd); + __this_cpu_write(xfd_state, fpstate->xfd); } } EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); @@ -330,7 +351,7 @@ EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); struct fpstate *cur_fps = fpu->fpstate; fpregs_lock(); @@ -420,17 +441,19 @@ EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); void kernel_fpu_begin_mask(unsigned int kfpu_mask) { - preempt_disable(); + if (!irqs_disabled()) + fpregs_lock(); WARN_ON_FPU(!irq_fpu_usable()); - WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); - this_cpu_write(in_kernel_fpu, true); + /* Toggle kernel_fpu_allowed to false: */ + WARN_ON_FPU(!this_cpu_read(kernel_fpu_allowed)); + this_cpu_write(kernel_fpu_allowed, false); if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); - save_fpregs_to_fpstate(¤t->thread.fpu); + save_fpregs_to_fpstate(x86_task_fpu(current)); } __cpu_invalidate_fpregs_state(); @@ -445,10 +468,12 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { - WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); + /* Toggle kernel_fpu_allowed back to true: */ + WARN_ON_FPU(this_cpu_read(kernel_fpu_allowed)); + this_cpu_write(kernel_fpu_allowed, true); - this_cpu_write(in_kernel_fpu, false); - preempt_enable(); + if (!irqs_disabled()) + fpregs_unlock(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); @@ -458,7 +483,7 @@ EXPORT_SYMBOL_GPL(kernel_fpu_end); */ void fpu_sync_fpstate(struct fpu *fpu) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); + WARN_ON_FPU(fpu != x86_task_fpu(current)); fpregs_lock(); trace_x86_fpu_before_save(fpu); @@ -499,7 +524,7 @@ static inline void fpstate_init_fstate(struct fpstate *fpstate) /* * Used in two places: * 1) Early boot to setup init_fpstate for non XSAVE systems - * 2) fpu_init_fpstate_user() which is invoked from KVM + * 2) fpu_alloc_guest_fpstate() which is invoked from KVM */ void fpstate_init_user(struct fpstate *fpstate) { @@ -543,7 +568,7 @@ void fpstate_reset(struct fpu *fpu) static inline void fpu_inherit_perms(struct fpu *dst_fpu) { if (fpu_state_size_dynamic()) { - struct fpu *src_fpu = ¤t->group_leader->thread.fpu; + struct fpu *src_fpu = x86_task_fpu(current->group_leader); spin_lock_irq(¤t->sighand->siglock); /* Fork also inherits the permissions of the parent */ @@ -563,7 +588,7 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) if (!ssp) return 0; - xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, + xstate = get_xsave_addr(&x86_task_fpu(dst)->fpstate->regs.xsave, XFEATURE_CET_USER); /* @@ -584,8 +609,16 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, unsigned long ssp) { - struct fpu *src_fpu = ¤t->thread.fpu; - struct fpu *dst_fpu = &dst->thread.fpu; + /* + * We allocate the new FPU structure right after the end of the task struct. + * task allocation size already took this into account. + * + * This is safe because task_struct size is a multiple of cacheline size, + * thus x86_task_fpu() will always be cacheline aligned as well. + */ + struct fpu *dst_fpu = (void *)dst + sizeof(*dst); + + BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0); /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; @@ -648,19 +681,22 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, if (update_fpu_shstk(dst, ssp)) return 1; - trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); return 0; } /* - * Whitelist the FPU register state embedded into task_struct for hardened - * usercopy. + * While struct fpu is no longer part of struct thread_struct, it is still + * allocated after struct task_struct in the "task_struct" kmem cache. But + * since FPU is expected to be part of struct thread_struct, we have to + * adjust for it here. */ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); + /* The allocation follows struct task_struct. */ + *offset = sizeof(struct task_struct) - offsetof(struct task_struct, thread); + *offset += offsetof(struct fpu, __fpstate.regs); *size = fpu_kernel_cfg.default_size; } @@ -673,11 +709,18 @@ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) * a state-restore is coming: either an explicit one, * or a reschedule. */ -void fpu__drop(struct fpu *fpu) +void fpu__drop(struct task_struct *tsk) { + struct fpu *fpu; + + if (test_tsk_thread_flag(tsk, TIF_NEED_FPU_LOAD)) + return; + + fpu = x86_task_fpu(tsk); + preempt_disable(); - if (fpu == ¤t->thread.fpu) { + if (fpu == x86_task_fpu(current)) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" @@ -709,9 +752,9 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) /* * Reset current->fpu memory state to the init values. */ -static void fpu_reset_fpregs(void) +static void fpu_reset_fpstate_regs(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); fpregs_lock(); __fpu_invalidate_fpregs_state(fpu); @@ -740,11 +783,11 @@ static void fpu_reset_fpregs(void) */ void fpu__clear_user_states(struct fpu *fpu) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); + WARN_ON_FPU(fpu != x86_task_fpu(current)); fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { - fpu_reset_fpregs(); + fpu_reset_fpstate_regs(); fpregs_unlock(); return; } @@ -773,8 +816,8 @@ void fpu__clear_user_states(struct fpu *fpu) void fpu_flush_thread(void) { - fpstate_reset(¤t->thread.fpu); - fpu_reset_fpregs(); + fpstate_reset(x86_task_fpu(current)); + fpu_reset_fpstate_regs(); } /* * Load FPU context before returning to userspace. @@ -814,7 +857,7 @@ void fpregs_lock_and_load(void) */ void fpregs_assert_state_consistent(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); if (test_thread_flag(TIF_NEED_FPU_LOAD)) return; @@ -826,7 +869,7 @@ EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); void fpregs_mark_activate(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); fpregs_activate(fpu); fpu->last_cpu = smp_processor_id(); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 998a08f17e33..99db41bf9fa6 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -38,7 +38,7 @@ static void fpu__init_cpu_generic(void) /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU)) - fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); + ; else #endif asm volatile ("fninit"); @@ -51,6 +51,9 @@ void fpu__init_cpu(void) { fpu__init_cpu_generic(); fpu__init_cpu_xstate(); + + /* Start allowing kernel-mode FPU: */ + this_cpu_write(kernel_fpu_allowed, true); } static bool __init fpu__probe_without_cpuid(void) @@ -73,6 +76,8 @@ static bool __init fpu__probe_without_cpuid(void) static void __init fpu__init_system_early_generic(void) { + set_thread_flag(TIF_NEED_FPU_LOAD); + if (!boot_cpu_has(X86_FEATURE_CPUID) && !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { if (fpu__probe_without_cpuid()) @@ -94,7 +99,6 @@ static void __init fpu__init_system_early_generic(void) * Boot time FPU feature detection code: */ unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu; -EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) { @@ -150,11 +154,13 @@ static void __init fpu__init_task_struct_size(void) { int task_size = sizeof(struct task_struct); + task_size += sizeof(struct fpu); + /* * Subtract off the static size of the register state. * It potentially has a bunch of padding. */ - task_size -= sizeof(current->thread.fpu.__fpstate.regs); + task_size -= sizeof(union fpregs_state); /* * Add back the dynamically-calculated register state @@ -164,14 +170,9 @@ static void __init fpu__init_task_struct_size(void) /* * We dynamically size 'struct fpu', so we require that - * it be at the end of 'thread_struct' and that - * 'thread_struct' be at the end of 'task_struct'. If - * you hit a compile error here, check the structure to - * see if something got added to the end. + * 'state' be at the end of 'it: */ CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); - CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); - CHECK_MEMBER_AT_END_OF(struct task_struct, thread); arch_task_struct_size = task_size; } @@ -204,7 +205,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpu_kernel_cfg.default_size = size; fpu_user_cfg.max_size = size; fpu_user_cfg.default_size = size; - fpstate_reset(¤t->thread.fpu); } /* @@ -213,7 +213,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ void __init fpu__init_system(void) { - fpstate_reset(¤t->thread.fpu); fpu__init_system_early_generic(); /* diff --git a/arch/x86/kernel/fpu/internal.h b/arch/x86/kernel/fpu/internal.h index dbdb31f55fc7..975de070c9c9 100644 --- a/arch/x86/kernel/fpu/internal.h +++ b/arch/x86/kernel/fpu/internal.h @@ -18,7 +18,7 @@ static __always_inline __pure bool use_fxsr(void) #ifdef CONFIG_X86_DEBUG_FPU # define WARN_ON_FPU(x) WARN_ON_ONCE(x) #else -# define WARN_ON_FPU(x) ({ (void)(x); 0; }) +# define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; }) #endif /* Used in init.c */ diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 6bc1eb2a21bd..0986c2200adc 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -45,7 +45,7 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r */ static void sync_fpstate(struct fpu *fpu) { - if (fpu == ¤t->thread.fpu) + if (fpu == x86_task_fpu(current)) fpu_sync_fpstate(fpu); } @@ -63,7 +63,7 @@ static void fpu_force_restore(struct fpu *fpu) * Only stopped child tasks can be used to modify the FPU * state in the fpstate buffer: */ - WARN_ON_FPU(fpu == ¤t->thread.fpu); + WARN_ON_FPU(fpu == x86_task_fpu(current)); __fpu_invalidate_fpregs_state(fpu); } @@ -71,7 +71,7 @@ static void fpu_force_restore(struct fpu *fpu) int xfpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; @@ -91,7 +91,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct fxregs_state newstate; int ret; @@ -133,7 +133,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; - sync_fpstate(&target->thread.fpu); + sync_fpstate(x86_task_fpu(target)); copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE); return 0; @@ -143,7 +143,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct xregs_state *tmpbuf = NULL; int ret; @@ -187,10 +187,11 @@ int ssp_active(struct task_struct *target, const struct user_regset *regset) int ssp_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct cet_user_state *cetregs; - if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || + !ssp_active(target, regset)) return -ENODEV; sync_fpstate(fpu); @@ -213,7 +214,7 @@ int ssp_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct xregs_state *xsave = &fpu->fpstate->regs.xsave; struct cet_user_state *cetregs; unsigned long user_ssp; @@ -367,7 +368,7 @@ static void __convert_from_fxsr(struct user_i387_ia32_struct *env, void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave); + __convert_from_fxsr(env, tsk, &x86_task_fpu(tsk)->fpstate->regs.fxsave); } void convert_to_fxsr(struct fxregs_state *fxsave, @@ -400,7 +401,7 @@ void convert_to_fxsr(struct fxregs_state *fxsave, int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct user_i387_ia32_struct env; struct fxregs_state fxsave, *fx; @@ -432,7 +433,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct user_i387_ia32_struct env; int ret; diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 247f2225aa9f..c3ec2512f2bb 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -27,19 +27,14 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, struct _fpx_sw_bytes *fx_sw) { - int min_xstate_size = sizeof(struct fxregs_state) + - sizeof(struct xstate_header); void __user *fpstate = fxbuf; unsigned int magic2; if (__copy_from_user(fx_sw, &fxbuf->sw_reserved[0], sizeof(*fx_sw))) return false; - /* Check for the first magic field and other error scenarios. */ - if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || - fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > current->thread.fpu.fpstate->user_size || - fx_sw->xstate_size > fx_sw->extended_size) + /* Check for the first magic field */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1) goto setfx; /* @@ -48,13 +43,13 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, * fpstate layout with out copying the extended state information * in the memory layout. */ - if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))) + if (__get_user(magic2, (__u32 __user *)(fpstate + x86_task_fpu(current)->fpstate->user_size))) return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) return true; setfx: - trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); + trace_x86_fpu_xstate_check_failed(x86_task_fpu(current)); /* Set the parameters for fx only state */ fx_sw->magic1 = 0; @@ -69,13 +64,13 @@ setfx: static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; + struct xregs_state *xsave = &x86_task_fpu(tsk)->fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); + fxsave(&x86_task_fpu(tsk)->fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -119,7 +114,6 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, { struct xregs_state __user *x = buf; struct _fpx_sw_bytes sw_bytes = {}; - u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ @@ -133,12 +127,6 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, (__u32 __user *)(buf + fpstate->user_size)); /* - * Read the xfeatures which we copied (directly from the cpu or - * from the state in task struct) to the user buffers. - */ - err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures); - - /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context. This will * enable us capturing any changes(during sigreturn) to @@ -149,17 +137,16 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, * header as well as change any contents in the memory layout. * xrestore as part of sigreturn will capture all the changes. */ - xfeatures |= XFEATURE_MASK_FPSSE; - - err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); + err |= set_xfeature_in_sigframe(x, XFEATURE_MASK_FPSSE); return !err; } -static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) +static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru) { if (use_xsave()) - return xsave_to_user_sigframe(buf); + return xsave_to_user_sigframe(buf, pkru); + if (use_fxsr()) return fxsave_to_user_sigframe((struct fxregs_state __user *) buf); else @@ -185,10 +172,10 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * For [f]xsave state, update the SW reserved fields in the [f]xsave frame * indicating the absence/presence of the extended state to the user. */ -bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) +bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru) { struct task_struct *tsk = current; - struct fpstate *fpstate = tsk->thread.fpu.fpstate; + struct fpstate *fpstate = x86_task_fpu(tsk)->fpstate; bool ia32_fxstate = (buf != buf_fx); int ret; @@ -228,7 +215,7 @@ retry: fpregs_restore_userregs(); pagefault_disable(); - ret = copy_fpregs_to_sigframe(buf_fx); + ret = copy_fpregs_to_sigframe(buf_fx, pkru); pagefault_enable(); fpregs_unlock(); @@ -276,7 +263,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, */ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); int ret; /* Restore enabled features only. */ @@ -336,7 +323,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, bool ia32_fxstate) { struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; + struct fpu *fpu = x86_task_fpu(tsk); struct user_i387_ia32_struct env; bool success, fx_only = false; union fpregs_state *fpregs; @@ -456,7 +443,7 @@ static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) */ bool fpu__restore_sig(void __user *buf, int ia32_frame) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); void __user *buf_fx = buf; bool ia32_fxstate = false; bool success = false; @@ -503,7 +490,7 @@ unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { - unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); + unsigned long frame_size = xstate_sigframe_size(x86_task_fpu(current)->fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 33a214b1a4ce..9aa9ac8399ae 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -13,16 +13,22 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> +#include <linux/coredump.h> +#include <linux/sort.h> #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/xcr.h> +#include <asm/cpuid/api.h> +#include <asm/msr.h> #include <asm/tlbflush.h> #include <asm/prctl.h> #include <asm/elf.h> +#include <uapi/asm/elf.h> + #include "context.h" #include "internal.h" #include "legacy.h" @@ -58,6 +64,7 @@ static const char *xfeature_names[] = "unknown xstate feature", "AMX Tile config", "AMX Tile data", + "APX registers", "unknown xstate feature", }; @@ -76,6 +83,7 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, + [XFEATURE_APX] = X86_FEATURE_APX, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = @@ -84,6 +92,31 @@ static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; +/* + * Ordering of xstate components in uncompacted format: The xfeature + * number does not necessarily indicate its position in the XSAVE buffer. + * This array defines the traversal order of xstate features. + */ +static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; + +static inline unsigned int next_xfeature_order(unsigned int i, u64 mask) +{ + for (; xfeature_uncompact_order[i] != -1; i++) { + if (mask & BIT_ULL(xfeature_uncompact_order[i])) + break; + } + + return i; +} + +/* Iterate xstate features in uncompacted order: */ +#define for_each_extended_xfeature_in_order(i, mask) \ + for (i = 0; \ + i = next_xfeature_order(i, mask), \ + xfeature_uncompact_order[i] != -1; \ + i++) + #define XSTATE_FLAG_SUPERVISOR BIT(0) #define XSTATE_FLAG_ALIGNED64 BIT(1) @@ -195,7 +228,7 @@ void fpu__init_cpu_xstate(void) * MSR_IA32_XSS sets supervisor states managed by XSAVES. */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } } @@ -205,16 +238,20 @@ static bool xfeature_enabled(enum xfeature xfeature) return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } +static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2) +{ + return xstate_offsets[*(unsigned int *)xfeature1] - + xstate_offsets[*(unsigned int *)xfeature2]; +} + /* * Record the offsets and sizes of various xstates contained - * in the XSAVE state memory layout. + * in the XSAVE state memory layout. Also, create an ordered + * list of xfeatures for handling out-of-order offsets. */ static void __init setup_xstate_cache(void) { - u32 eax, ebx, ecx, edx, i; - /* start at the beginning of the "extended state" */ - unsigned int last_good_offset = offsetof(struct xregs_state, - extended_state_area); + u32 eax, ebx, ecx, edx, xfeature, i = 0; /* * The FP xstates and SSE xstates are legacy states. They are always * in the fixed offsets in the xsave area in either compacted form @@ -228,39 +265,30 @@ static void __init setup_xstate_cache(void) xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); - for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) { + cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx); - xstate_sizes[i] = eax; - xstate_flags[i] = ecx; + xstate_sizes[xfeature] = eax; + xstate_flags[xfeature] = ecx; /* * If an xfeature is supervisor state, the offset in EBX is * invalid, leave it to -1. */ - if (xfeature_is_supervisor(i)) + if (xfeature_is_supervisor(xfeature)) continue; - xstate_offsets[i] = ebx; - - /* - * In our xstate size checks, we assume that the highest-numbered - * xstate feature has the highest offset in the buffer. Ensure - * it does. - */ - WARN_ONCE(last_good_offset > xstate_offsets[i], - "x86/fpu: misordered xstate at %d\n", last_good_offset); + xstate_offsets[xfeature] = ebx; - last_good_offset = xstate_offsets[i]; + /* Populate the list of xfeatures before sorting */ + xfeature_uncompact_order[i++] = xfeature; } -} - -static void __init print_xstate_feature(u64 xstate_mask) -{ - const char *feature_name; - if (cpu_has_xfeatures(xstate_mask, &feature_name)) - pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); + /* + * Sort xfeatures by their offsets to support out-of-order + * offsets in the uncompacted format. + */ + sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL); } /* @@ -268,19 +296,15 @@ static void __init print_xstate_feature(u64 xstate_mask) */ static void __init print_xstate_features(void) { - print_xstate_feature(XFEATURE_MASK_FP); - print_xstate_feature(XFEATURE_MASK_SSE); - print_xstate_feature(XFEATURE_MASK_YMM); - print_xstate_feature(XFEATURE_MASK_BNDREGS); - print_xstate_feature(XFEATURE_MASK_BNDCSR); - print_xstate_feature(XFEATURE_MASK_OPMASK); - print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); - print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); - print_xstate_feature(XFEATURE_MASK_PKRU); - print_xstate_feature(XFEATURE_MASK_PASID); - print_xstate_feature(XFEATURE_MASK_CET_USER); - print_xstate_feature(XFEATURE_MASK_XTILE_CFG); - print_xstate_feature(XFEATURE_MASK_XTILE_DATA); + int i; + + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = BIT_ULL(i); + const char *name; + + if (cpu_has_xfeatures(mask, &name)) + pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name); + } } /* @@ -348,7 +372,8 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ - XFEATURE_MASK_XTILE) + XFEATURE_MASK_XTILE | \ + XFEATURE_MASK_APX) /* * setup the xstate image representing the init state @@ -395,7 +420,7 @@ int xfeature_size(int xfeature_nr) u32 eax, ebx, ecx, edx; CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx); return eax; } @@ -438,9 +463,9 @@ static void __init __xstate_dump_leaves(void) * just in case there are some goodies up there */ for (i = 0; i < XFEATURE_MAX + 10; i++) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", - XSTATE_CPUID, i, eax, ebx, ecx, edx); + CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx); } } @@ -481,7 +506,7 @@ static int __init check_xtile_data_against_struct(int size) * Check the maximum palette id: * eax: the highest numbered palette subleaf. */ - cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx); /* * Cross-check each tile size and find the maximum number of @@ -495,7 +520,7 @@ static int __init check_xtile_data_against_struct(int size) * eax[31:16]: bytes per title * ebx[31:16]: the max names (or max number of tiles) */ - cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); + cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx); tile_size = eax >> 16; max = ebx >> 16; @@ -548,6 +573,7 @@ static bool __init check_xstate_against_struct(int nr) case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); + case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state); case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); @@ -560,13 +586,20 @@ static bool __init check_xstate_against_struct(int nr) static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { unsigned int topmost = fls64(xfeatures) - 1; - unsigned int offset = xstate_offsets[topmost]; + unsigned int offset, i; if (topmost <= XFEATURE_SSE) return sizeof(struct xregs_state); - if (compacted) + if (compacted) { offset = xfeature_get_offset(xfeatures, topmost); + } else { + /* Walk through the xfeature order to pick the last */ + for_each_extended_xfeature_in_order(i, xfeatures) + topmost = xfeature_uncompact_order[i]; + offset = xstate_offsets[topmost]; + } + return offset + xstate_sizes[topmost]; } @@ -630,7 +663,7 @@ static unsigned int __init get_compacted_size(void) * are no supervisor states, but XSAVEC still uses compacted * format. */ - cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); return ebx; } @@ -647,7 +680,7 @@ static unsigned int __init get_xsave_compacted_size(void) return get_compacted_size(); /* Disable independent features. */ - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor()); /* * Ask the hardware what size is required of the buffer. @@ -656,7 +689,7 @@ static unsigned int __init get_xsave_compacted_size(void) size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); + wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); return size; } @@ -671,7 +704,7 @@ static unsigned int __init get_xsave_size_user(void) * containing all the *user* state components * corresponding to bits currently set in XCR0. */ - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); return ebx; } @@ -719,6 +752,8 @@ static int __init init_xstate_size(void) */ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { + pr_info("x86/fpu: XSAVE disabled\n"); + fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); @@ -735,7 +770,7 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) */ init_fpstate.xfd = 0; - fpstate_reset(¤t->thread.fpu); + fpstate_reset(x86_task_fpu(current)); } /* @@ -760,21 +795,16 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) return; } - if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { - WARN_ON_FPU(1); - return; - } - /* * Find user xstates supported by the processor. */ - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ - cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { @@ -788,6 +818,20 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) goto out_disable; } + if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX && + fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) { + /* + * This is a problematic CPU configuration where two + * conflicting state components are both enumerated. + */ + pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n", + fpu_kernel_cfg.max_features); + goto out_disable; + } + + fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & + XFEATURE_MASK_INDEPENDENT; + /* * Clear XSAVE features that are disabled in the normal CPUID. */ @@ -844,9 +888,6 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) if (err) goto out_disable; - /* Reset the state for the current task */ - fpstate_reset(¤t->thread.fpu); - /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: @@ -862,7 +903,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) init_fpstate.xfeatures = fpu_kernel_cfg.default_features; if (init_fpstate.size > sizeof(init_fpstate.regs)) { - pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", + pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n", sizeof(init_fpstate.regs), init_fpstate.size); goto out_disable; } @@ -874,7 +915,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * xfeatures mask. */ if (xfeatures != fpu_kernel_cfg.max_features) { - pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", + pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n", xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } @@ -914,12 +955,12 @@ void fpu__resume_cpu(void) * of XSAVES and MSR_IA32_XSS. */ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } if (fpu_state_size_dynamic()) - wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); + wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd); } /* @@ -991,6 +1032,20 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) return __raw_xsave_addr(xsave, xfeature_nr); } +EXPORT_SYMBOL_GPL(get_xsave_addr); + +/* + * Given an xstate feature nr, calculate where in the xsave buffer the state is. + * The xsave buffer should be in standard format, not compacted (e.g. user mode + * signal frames). + */ +void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr) +{ + if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) + return NULL; + + return (void __user *)xsave + xstate_offsets[xfeature_nr]; +} #ifdef CONFIG_ARCH_HAS_PKEYS @@ -1067,10 +1122,9 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; struct xregs_state *xsave = &fpstate->regs.xsave; + unsigned int zerofrom, i, xfeature; struct xstate_header header; - unsigned int zerofrom; u64 mask; - int i; memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; @@ -1139,15 +1193,16 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, */ mask = header.xfeatures; - for_each_extended_xfeature(i, mask) { + for_each_extended_xfeature_in_order(i, mask) { + xfeature = xfeature_uncompact_order[i]; /* * If there was a feature or alignment gap, zero the space * in the destination buffer. */ - if (zerofrom < xstate_offsets[i]) - membuf_zero(&to, xstate_offsets[i] - zerofrom); + if (zerofrom < xstate_offsets[xfeature]) + membuf_zero(&to, xstate_offsets[xfeature] - zerofrom); - if (i == XFEATURE_PKRU) { + if (xfeature == XFEATURE_PKRU) { struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the @@ -1157,14 +1212,14 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, membuf_write(&to, &pkru, sizeof(pkru)); } else { membuf_write(&to, - __raw_xsave_addr(xsave, i), - xstate_sizes[i]); + __raw_xsave_addr(xsave, xfeature), + xstate_sizes[xfeature]); } /* * Keep track of the last copied state in the non-compacted * target buffer for gap zeroing. */ - zerofrom = xstate_offsets[i] + xstate_sizes[i]; + zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature]; } out: @@ -1187,8 +1242,8 @@ out: void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { - __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, - tsk->thread.fpu.fpstate->user_xfeatures, + __copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate, + x86_task_fpu(tsk)->fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } @@ -1328,7 +1383,7 @@ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { - return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru); + return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) @@ -1394,9 +1449,9 @@ void xrstors(struct xregs_state *xstate, u64 mask) } #if IS_ENABLED(CONFIG_KVM) -void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature) { - void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature); if (addr) memset(addr, 0, xstate_sizes[xfeature]); @@ -1422,7 +1477,7 @@ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) * The XFD MSR does not match fpstate->xfd. That's invalid when * the passed in fpstate is current's fpstate. */ - if (fpstate->xfd == current->thread.fpu.fpstate->xfd) + if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd) return false; /* @@ -1434,8 +1489,8 @@ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) return rstor; /* - * XSAVE(S): clone(), fpu_swap_kvm_fpu() - * XRSTORS(S): fpu_swap_kvm_fpu() + * XSAVE(S): clone(), fpu_swap_kvm_fpstate() + * XRSTORS(S): fpu_swap_kvm_fpstate() */ /* @@ -1499,7 +1554,7 @@ void fpstate_free(struct fpu *fpu) static int fpstate_realloc(u64 xfeatures, unsigned int ksize, unsigned int usize, struct fpu_guest *guest_fpu) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); struct fpstate *curfps, *newfps = NULL; unsigned int fpsize; bool in_use; @@ -1592,7 +1647,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) * AVX512. */ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); - struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; unsigned int ksize, usize; u64 mask; @@ -1602,16 +1657,20 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) if ((permitted & requested) == requested) return 0; - /* Calculate the resulting kernel state size */ + /* + * Calculate the resulting kernel state size. Note, @permitted also + * contains supervisor xfeatures even though supervisor are always + * permitted for kernel and guest FPUs, and never permitted for user + * FPUs. + */ mask = permitted | requested; - /* Take supervisor states into account on the host */ - if (!guest) - mask |= xfeatures_mask_supervisor(); ksize = xstate_calculate_size(mask, compacted); - /* Calculate the resulting user state size */ - mask &= XFEATURE_MASK_USER_SUPPORTED; - usize = xstate_calculate_size(mask, false); + /* + * Calculate the resulting user state size. Take care not to clobber + * the supervisor xfeatures in the new mask! + */ + usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false); if (!guest) { ret = validate_sigaltstack(usize); @@ -1695,7 +1754,7 @@ int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) return -EPERM; } - fpu = ¤t->group_leader->thread.fpu; + fpu = x86_task_fpu(current->group_leader); perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; ksize = perm->__state_size; usize = perm->__user_state_size; @@ -1800,7 +1859,7 @@ long fpu_xstate_prctl(int option, unsigned long arg2) */ static void avx512_status(struct seq_file *m, struct task_struct *task) { - unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); + unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); long delta; if (!timestamp) { @@ -1837,3 +1896,89 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ + +#ifdef CONFIG_COREDUMP +static const char owner_name[] = "LINUX"; + +/* + * Dump type, size, offset and flag values for every xfeature that is present. + */ +static int dump_xsave_layout_desc(struct coredump_params *cprm) +{ + int num_records = 0; + int i; + + for_each_extended_xfeature(i, fpu_user_cfg.max_features) { + struct x86_xfeat_component xc = { + .type = i, + .size = xstate_sizes[i], + .offset = xstate_offsets[i], + /* reserved for future use */ + .flags = 0, + }; + + if (!dump_emit(cprm, &xc, sizeof(xc))) + return 0; + + num_records++; + } + return num_records; +} + +static u32 get_xsave_desc_size(void) +{ + u32 cnt = 0; + u32 i; + + for_each_extended_xfeature(i, fpu_user_cfg.max_features) + cnt++; + + return cnt * (sizeof(struct x86_xfeat_component)); +} + +int elf_coredump_extra_notes_write(struct coredump_params *cprm) +{ + int num_records = 0; + struct elf_note en; + + if (!fpu_user_cfg.max_features) + return 0; + + en.n_namesz = sizeof(owner_name); + en.n_descsz = get_xsave_desc_size(); + en.n_type = NT_X86_XSAVE_LAYOUT; + + if (!dump_emit(cprm, &en, sizeof(en))) + return 1; + if (!dump_emit(cprm, owner_name, en.n_namesz)) + return 1; + if (!dump_align(cprm, 4)) + return 1; + + num_records = dump_xsave_layout_desc(cprm); + if (!num_records) + return 1; + + /* Total size should be equal to the number of records */ + if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz) + return 1; + + return 0; +} + +int elf_coredump_extra_notes_size(void) +{ + int size; + + if (!fpu_user_cfg.max_features) + return 0; + + /* .note header */ + size = sizeof(struct elf_note); + /* Name plus alignment to 4 bytes */ + size += roundup(sizeof(owner_name), 4); + size += get_xsave_desc_size(); + + return size; +} +#endif /* CONFIG_COREDUMP */ diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 19ca623ffa2a..52ce19289989 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -5,6 +5,7 @@ #include <asm/cpufeature.h> #include <asm/fpu/xstate.h> #include <asm/fpu/xcr.h> +#include <asm/msr.h> #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u64, xfd_state); @@ -22,7 +23,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) static inline u64 xstate_get_group_perm(bool guest) { - struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; /* Pairs with WRITE_ONCE() in xstate_request_perm() */ @@ -54,7 +55,7 @@ extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void extern void fpu__init_cpu_xstate(void); extern void fpu__init_system_xstate(unsigned int legacy_size); -extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); +extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr); static inline u64 xfeatures_mask_supervisor(void) { @@ -64,38 +65,73 @@ static inline u64 xfeatures_mask_supervisor(void) static inline u64 xfeatures_mask_independent(void) { if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) - return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR; + return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR; - return XFEATURE_MASK_INDEPENDENT; + return fpu_kernel_cfg.independent_features; +} + +static inline int set_xfeature_in_sigframe(struct xregs_state __user *xbuf, u64 mask) +{ + u64 xfeatures; + int err; + + /* Read the xfeatures value already saved in the user buffer */ + err = __get_user(xfeatures, &xbuf->header.xfeatures); + xfeatures |= mask; + err |= __put_user(xfeatures, &xbuf->header.xfeatures); + + return err; +} + +/* + * Update the value of PKRU register that was already pushed onto the signal frame. + */ +static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru) +{ + int err; + + if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE))) + return 0; + + /* Mark PKRU as in-use so that it is restored correctly. */ + err = set_xfeature_in_sigframe(buf, XFEATURE_MASK_PKRU); + if (err) + return err; + + /* Update PKRU value in the userspace xsave buffer. */ + return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU)); } /* XSAVE/XRSTOR wrapper functions */ #ifdef CONFIG_X86_64 -#define REX_PREFIX "0x48, " +#define REX_SUFFIX "64" #else -#define REX_PREFIX +#define REX_SUFFIX #endif -/* These macros all use (%edi)/(%rdi) as the single memory argument. */ -#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" -#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" -#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27" -#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" -#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" -#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" +#define XSAVE "xsave" REX_SUFFIX " %[xa]" +#define XSAVEOPT "xsaveopt" REX_SUFFIX " %[xa]" +#define XSAVEC "xsavec" REX_SUFFIX " %[xa]" +#define XSAVES "xsaves" REX_SUFFIX " %[xa]" +#define XRSTOR "xrstor" REX_SUFFIX " %[xa]" +#define XRSTORS "xrstors" REX_SUFFIX " %[xa]" /* * After this @err contains 0 on success or the trap number when the * operation raises an exception. + * + * The [xa] input parameter below represents the struct xregs_state pointer + * and the asm symbolic name for the argument used in the XSAVE/XRSTOR insns + * above. */ #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ - "2:\n\t" \ + "2:\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \ : "memory") /* @@ -108,23 +144,19 @@ static inline u64 xfeatures_mask_independent(void) * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. * - * We use XSAVE as a fallback. - * - * The 661 label is defined in the ALTERNATIVE* macros as the address of the - * original instruction which gets replaced. We need to use it here as the - * address of the instruction where we might get an exception at. + * Use XSAVE as a fallback. */ #define XSTATE_XSAVE(st, lmask, hmask, err) \ - asm volatile(ALTERNATIVE_3(XSAVE, \ + asm volatile("1: " ALTERNATIVE_3(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ XSAVEC, X86_FEATURE_XSAVEC, \ XSAVES, X86_FEATURE_XSAVES) \ - "\n" \ + "\n\t" \ "xor %[err], %[err]\n" \ "3:\n" \ - _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ + _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err) \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \ : "memory") /* @@ -132,13 +164,13 @@ static inline u64 xfeatures_mask_independent(void) * XSAVE area format. */ #define XSTATE_XRESTORE(st, lmask, hmask) \ - asm volatile(ALTERNATIVE(XRSTOR, \ + asm volatile("1: " ALTERNATIVE(XRSTOR, \ XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ "3:\n" \ - _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \ + _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \ : \ - : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ + : [xa] "m" (*(st)), "a" (lmask), "d" (hmask) \ : "memory") #if defined(CONFIG_X86_64) && defined(CONFIG_X86_DEBUG_FPU) @@ -150,7 +182,7 @@ static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rs #ifdef CONFIG_X86_64 static inline void xfd_set_state(u64 xfd) { - wrmsrl(MSR_IA32_XFD, xfd); + wrmsrq(MSR_IA32_XFD, xfd); __this_cpu_write(xfd_state, xfd); } @@ -260,14 +292,14 @@ static inline u64 xfeatures_need_sigframe_write(void) * The caller has to zero buf::header before calling this because XSAVE* * does not touch the reserved fields in the header. */ -static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) +static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkru) { /* * Include the features which are not xsaved/rstored by the kernel * internally, e.g. PKRU. That's user space ABI and also required * to allow the signal handler to modify PKRU. */ - struct fpstate *fpstate = current->thread.fpu.fpstate; + struct fpstate *fpstate = x86_task_fpu(current)->fpstate; u64 mask = fpstate->user_xfeatures; u32 lmask; u32 hmask; @@ -285,6 +317,9 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf) XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); + if (!err) + err = update_pkru_in_sigframe(buf, pkru); + return err; } @@ -298,7 +333,7 @@ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 u32 hmask = mask >> 32; int err; - xfd_validate_state(current->thread.fpu.fpstate, mask, true); + xfd_validate_state(x86_task_fpu(current)->fpstate, mask, true); stac(); XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c index 4bcd8791ad96..816187da3a47 100644 --- a/arch/x86/kernel/fred.c +++ b/arch/x86/kernel/fred.c @@ -3,6 +3,7 @@ #include <asm/desc.h> #include <asm/fred.h> +#include <asm/msr.h> #include <asm/tlbflush.h> #include <asm/traps.h> @@ -21,32 +22,45 @@ #define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector))) +DEFINE_PER_CPU(unsigned long, fred_rsp0); +EXPORT_PER_CPU_SYMBOL(fred_rsp0); + void cpu_init_fred_exceptions(void) { /* When FRED is enabled by default, remove this log message */ pr_info("Initialize FRED on CPU%d\n", smp_processor_id()); - wrmsrl(MSR_IA32_FRED_CONFIG, + /* + * If a kernel event is delivered before a CPU goes to user level for + * the first time, its SS is NULL thus NULL is pushed into the SS field + * of the FRED stack frame. But before ERETS is executed, the CPU may + * context switch to another task and go to user level. Then when the + * CPU comes back to kernel mode, SS is changed to __KERNEL_DS. Later + * when ERETS is executed to return from the kernel event handler, a #GP + * fault is generated because SS doesn't match the SS saved in the FRED + * stack frame. + * + * Initialize SS to __KERNEL_DS when enabling FRED to avoid such #GPs. + */ + loadsegment(ss, __KERNEL_DS); + + wrmsrq(MSR_IA32_FRED_CONFIG, /* Reserve for CALL emulation */ FRED_CONFIG_REDZONE | FRED_CONFIG_INT_STKLVL(0) | FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user)); + wrmsrq(MSR_IA32_FRED_STKLVLS, 0); + /* - * The purpose of separate stacks for NMI, #DB and #MC *in the kernel* - * (remember that user space faults are always taken on stack level 0) - * is to avoid overflowing the kernel stack. + * Ater a CPU offline/online cycle, the FRED RSP0 MSR should be + * resynchronized with its per-CPU cache. */ - wrmsrl(MSR_IA32_FRED_STKLVLS, - FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) | - FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) | - FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) | - FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL)); + wrmsrq(MSR_IA32_FRED_RSP0, __this_cpu_read(fred_rsp0)); - /* The FRED equivalents to IST stacks... */ - wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); - wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); - wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); + wrmsrq(MSR_IA32_FRED_RSP1, 0); + wrmsrq(MSR_IA32_FRED_RSP2, 0); + wrmsrq(MSR_IA32_FRED_RSP3, 0); /* Enable FRED */ cr4_set_bits(X86_CR4_FRED); @@ -57,3 +71,23 @@ void cpu_init_fred_exceptions(void) setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); } + +/* Must be called after setup_cpu_entry_areas() */ +void cpu_init_fred_rsps(void) +{ + /* + * The purpose of separate stacks for NMI, #DB and #MC *in the kernel* + * (remember that user space faults are always taken on stack level 0) + * is to avoid overflowing the kernel stack. + */ + wrmsrq(MSR_IA32_FRED_STKLVLS, + FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) | + FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL)); + + /* The FRED equivalents to IST stacks... */ + wrmsrq(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); + wrmsrq(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); + wrmsrq(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); +} diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 70139d9d2e01..252e82bcfd2f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -25,6 +25,7 @@ #include <linux/memory.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> +#include <linux/execmem.h> #include <trace/syscall.h> @@ -54,10 +55,10 @@ void ftrace_arch_code_modify_post_process(void) { /* * ftrace_make_{call,nop}() may be called during - * module load, and we need to finish the text_poke_queue() + * module load, and we need to finish the smp_text_poke_batch_add() * that they do, here. */ - text_poke_finish(); + smp_text_poke_batch_finish(); ftrace_poke_late = 0; mutex_unlock(&text_mutex); } @@ -118,7 +119,7 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, /* replace the text with the new text */ if (ftrace_poke_late) - text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_batch_add((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); else text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; @@ -185,11 +186,11 @@ int ftrace_update_ftrace_func(ftrace_func_t func) ip = (unsigned long)(&ftrace_call); new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); ip = (unsigned long)(&ftrace_regs_call); new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } @@ -246,10 +247,10 @@ void ftrace_replace_code(int enable) break; } - text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_batch_add((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); ftrace_update_record(rec, enable); } - text_poke_finish(); + smp_text_poke_batch_finish(); } void arch_ftrace_update_code(int command) @@ -260,25 +261,14 @@ void arch_ftrace_update_code(int command) /* Currently only x86_64 supports dynamic trampolines */ #ifdef CONFIG_X86_64 -#ifdef CONFIG_MODULES -#include <linux/moduleloader.h> -/* Module allocation simplifies allocating memory for code */ static inline void *alloc_tramp(unsigned long size) { - return module_alloc(size); + return execmem_alloc(EXECMEM_FTRACE, size); } static inline void tramp_free(void *tramp) { - module_memfree(tramp); + execmem_free(tramp); } -#else -/* Trampolines can only be created if modules are supported */ -static inline void *alloc_tramp(unsigned long size) -{ - return NULL; -} -static inline void tramp_free(void *tramp) { } -#endif /* Defined as markers to the end of the ftrace default trampolines */ extern void ftrace_regs_caller_end(void); @@ -364,7 +354,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) goto fail; ip = trampoline + size; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk_at(ip)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else memcpy(ip, retq, sizeof(retq)); @@ -502,7 +492,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) mutex_lock(&text_mutex); /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); mutex_unlock(&text_mutex); } @@ -596,7 +586,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func) const char *new; new = ftrace_jmp_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } @@ -615,16 +605,8 @@ int ftrace_disable_ftrace_graph_caller(void) } #endif /* CONFIG_DYNAMIC_FTRACE && !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ -/* - * Hook the return address and push it in the stack of return addrs - * in current thread info. - */ -void prepare_ftrace_return(unsigned long ip, unsigned long *parent, - unsigned long frame_pointer) +static inline bool skip_ftrace_return(void) { - unsigned long return_hooker = (unsigned long)&return_to_handler; - int bit; - /* * When resuming from suspend-to-ram, this function can be indirectly * called from early CPU startup code while the CPU is in real mode, @@ -634,33 +616,48 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent, * This check isn't as accurate as virt_addr_valid(), but it should be * good enough for this purpose, and it's fast. */ - if (unlikely((long)__builtin_frame_address(0) >= 0)) - return; + if ((long)__builtin_frame_address(0) >= 0) + return true; - if (unlikely(ftrace_graph_is_dead())) - return; + if (ftrace_graph_is_dead()) + return true; - if (unlikely(atomic_read(¤t->tracing_graph_pause))) - return; + if (atomic_read(¤t->tracing_graph_pause)) + return true; + return false; +} - bit = ftrace_test_recursion_trylock(ip, *parent); - if (bit < 0) +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +void prepare_ftrace_return(unsigned long ip, unsigned long *parent, + unsigned long frame_pointer) +{ + unsigned long return_hooker = (unsigned long)&return_to_handler; + + if (unlikely(skip_ftrace_return())) return; if (!function_graph_enter(*parent, ip, frame_pointer, parent)) *parent = return_hooker; - - ftrace_test_recursion_unlock(bit); } #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { - struct pt_regs *regs = &fregs->regs; + struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs; unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs); + unsigned long return_hooker = (unsigned long)&return_to_handler; + unsigned long *parent = (unsigned long *)stack; - prepare_ftrace_return(ip, (unsigned long *)stack, 0); + if (unlikely(skip_ftrace_return())) + return; + + + if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs)) + *parent = return_hooker; } #endif diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 58d9ed50fe61..f4e0c3361234 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -187,14 +187,15 @@ SYM_CODE_END(ftrace_graph_caller) .globl return_to_handler return_to_handler: - pushl $0 - pushl %edx - pushl %eax + subl $(PTREGS_SIZE), %esp + movl $0, PT_EBP(%esp) + movl %edx, PT_EDX(%esp) + movl %eax, PT_EAX(%esp) movl %esp, %eax call ftrace_return_to_handler movl %eax, %ecx - popl %eax - popl %edx - addl $4, %esp # skip ebp + movl PT_EAX(%esp), %eax + movl PT_EDX(%esp), %edx + addl $(PTREGS_SIZE), %esp JMP_NOSPEC ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 214f30e9f0c0..367da3638167 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -146,12 +146,14 @@ SYM_FUNC_END(ftrace_stub_graph) #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) + ANNOTATE_NOENDBR CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) SYM_FUNC_START(ftrace_caller) + ANNOTATE_NOENDBR /* save_mcount_regs fills in first two parameters */ save_mcount_regs @@ -197,6 +199,7 @@ SYM_FUNC_END(ftrace_caller); STACK_FRAME_NON_STANDARD_FP(ftrace_caller) SYM_FUNC_START(ftrace_regs_caller) + ANNOTATE_NOENDBR /* Save the current flags before any operations that can change them */ pushfq @@ -310,6 +313,7 @@ SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) SYM_FUNC_START(ftrace_stub_direct_tramp) + ANNOTATE_NOENDBR CALL_DEPTH_ACCOUNT RET SYM_FUNC_END(ftrace_stub_direct_tramp) @@ -317,6 +321,7 @@ SYM_FUNC_END(ftrace_stub_direct_tramp) #else /* ! CONFIG_DYNAMIC_FTRACE */ SYM_FUNC_START(__fentry__) + ANNOTATE_NOENDBR CALL_DEPTH_ACCOUNT cmpq $ftrace_stub, ftrace_trace_function @@ -348,21 +353,22 @@ STACK_FRAME_NON_STANDARD_FP(__fentry__) SYM_CODE_START(return_to_handler) UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR - subq $24, %rsp - /* Save the return values */ - movq %rax, (%rsp) - movq %rdx, 8(%rsp) - movq %rbp, 16(%rsp) + /* Save ftrace_regs for function exit context */ + subq $(FRAME_SIZE), %rsp + + movq %rax, RAX(%rsp) + movq %rdx, RDX(%rsp) + movq %rbp, RBP(%rsp) movq %rsp, %rdi call ftrace_return_to_handler movq %rax, %rdi - movq 8(%rsp), %rdx - movq (%rsp), %rax + movq RDX(%rsp), %rdx + movq RAX(%rsp), %rax - addq $24, %rsp + addq $(FRAME_SIZE), %rsp /* * Jump back to the old return address. This cannot be JMP_NOSPEC rdi * since IBT would demand that contain ENDBR, which simply isn't so for diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index de001b2146ab..375f2d7f1762 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -145,10 +145,6 @@ void __init __no_stack_protector mk_early_pgtbl_32(void) *ptr = (unsigned long)ptep + PAGE_OFFSET; #ifdef CONFIG_MICROCODE_INITRD32 - /* Running on a hypervisor? */ - if (native_cpuid_ecx(1) & BIT(31)) - return; - params = (struct boot_params *)__pa_nodebug(&boot_params); if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image) return; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index a817ed0724d1..533fcf5636fc 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -5,8 +5,6 @@ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE */ -#define DISABLE_BRANCH_PROFILING - /* cpu_feature_enabled() cannot be used this early */ #define USE_EARLY_PGTABLE_L5 @@ -49,226 +47,22 @@ * Manage page tables very early on. */ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; -static unsigned int __initdata next_early_pgt; +unsigned int __initdata next_early_pgt; +SYM_PIC_ALIAS(next_early_pgt); pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); -#ifdef CONFIG_X86_5LEVEL unsigned int __pgtable_l5_enabled __ro_after_init; unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; EXPORT_SYMBOL(ptrs_per_p4d); -#endif -#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; EXPORT_SYMBOL(page_offset_base); unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4; EXPORT_SYMBOL(vmalloc_base); unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; EXPORT_SYMBOL(vmemmap_base); -#endif - -static inline bool check_la57_support(void) -{ - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) - return false; - - /* - * 5-level paging is detected and enabled at kernel decompression - * stage. Only check if it has been enabled there. - */ - if (!(native_read_cr4() & X86_CR4_LA57)) - return false; - - RIP_REL_REF(__pgtable_l5_enabled) = 1; - RIP_REL_REF(pgdir_shift) = 48; - RIP_REL_REF(ptrs_per_p4d) = 512; - RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5; - RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5; - RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5; - - return true; -} - -static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd) -{ - unsigned long vaddr, vaddr_end; - int i; - - /* Encrypt the kernel and related (if SME is active) */ - sme_encrypt_kernel(bp); - - /* - * Clear the memory encryption mask from the .bss..decrypted section. - * The bss section will be memset to zero later in the initialization so - * there is no need to zero it after changing the memory encryption - * attribute. - */ - if (sme_get_me_mask()) { - vaddr = (unsigned long)__start_bss_decrypted; - vaddr_end = (unsigned long)__end_bss_decrypted; - - for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { - /* - * On SNP, transition the page to shared in the RMP table so that - * it is consistent with the page table attribute change. - * - * __start_bss_decrypted has a virtual address in the high range - * mapping (kernel .text). PVALIDATE, by way of - * early_snp_set_memory_shared(), requires a valid virtual - * address but the kernel is currently running off of the identity - * mapping so use __pa() to get a *currently* valid virtual address. - */ - early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD); - - i = pmd_index(vaddr); - pmd[i] -= sme_get_me_mask(); - } - } - - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. - */ - return sme_get_me_mask(); -} - -/* Code in __startup_64() can be relocated during execution, but the compiler - * doesn't have to generate PC-relative relocations when accessing globals from - * that function. Clang actually does not generate them, which leads to - * boot-time crashes. To work around this problem, every global pointer must - * be accessed using RIP_REL_REF(). - */ -unsigned long __head __startup_64(unsigned long physaddr, - struct boot_params *bp) -{ - pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts); - unsigned long pgtable_flags; - unsigned long load_delta; - pgdval_t *pgd; - p4dval_t *p4d; - pudval_t *pud; - pmdval_t *pmd, pmd_entry; - bool la57; - int i; - - la57 = check_la57_support(); - - /* Is the address too large? */ - if (physaddr >> MAX_PHYSMEM_BITS) - for (;;); - - /* - * Compute the delta between the address I am compiled to run at - * and the address I am actually running at. - */ - load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map); - RIP_REL_REF(phys_base) = load_delta; - - /* Is the address not 2M aligned? */ - if (load_delta & ~PMD_MASK) - for (;;); - - /* Include the SME encryption mask in the fixup value */ - load_delta += sme_get_me_mask(); - - /* Fixup the physical addresses in the page table */ - - pgd = &RIP_REL_REF(early_top_pgt)->pgd; - pgd[pgd_index(__START_KERNEL_map)] += load_delta; - - if (la57) { - p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt); - p4d[MAX_PTRS_PER_P4D - 1] += load_delta; - - pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE; - } - - RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta; - RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta; - - for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) - RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta; - - /* - * Set up the identity mapping for the switchover. These - * entries should *NOT* have the global bit set! This also - * creates a bunch of nonsense entries but that is fine -- - * it avoids problems around wraparound. - */ - - pud = &early_pgts[0]->pmd; - pmd = &early_pgts[1]->pmd; - RIP_REL_REF(next_early_pgt) = 2; - - pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); - - if (la57) { - p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd; - - i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; - pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; - - i = physaddr >> P4D_SHIFT; - p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; - p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; - } else { - i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)pud + pgtable_flags; - pgd[i + 1] = (pgdval_t)pud + pgtable_flags; - } - - i = physaddr >> PUD_SHIFT; - pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; - pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; - - pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; - /* Filter out unsupported __PAGE_KERNEL_* bits: */ - pmd_entry &= RIP_REL_REF(__supported_pte_mask); - pmd_entry += sme_get_me_mask(); - pmd_entry += physaddr; - - for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { - int idx = i + (physaddr >> PMD_SHIFT); - - pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; - } - - /* - * Fixup the kernel text+data virtual addresses. Note that - * we might write invalid pmds, when the kernel is relocated - * cleanup_highmap() fixes this up along with the mappings - * beyond _end. - * - * Only the region occupied by the kernel image has so far - * been checked against the table of usable memory regions - * provided by the firmware, so invalidate pages outside that - * region. A page table entry that maps to a reserved area of - * memory would allow processor speculation into that area, - * and on some hardware (particularly the UV platform) even - * speculative access to some reserved areas is caught as an - * error, causing the BIOS to halt the system. - */ - - pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd; - - /* invalidate pages before the kernel image */ - for (i = 0; i < pmd_index((unsigned long)_text); i++) - pmd[i] &= ~_PAGE_PRESENT; - - /* fixup pages that are part of the kernel image */ - for (; i <= pmd_index((unsigned long)_end); i++) - if (pmd[i] & _PAGE_PRESENT) - pmd[i] += load_delta; - - /* invalidate pages after the kernel image */ - for (; i < PTRS_PER_PMD; i++) - pmd[i] &= ~_PAGE_PRESENT; - - return sme_postprocess_startup(bp, pmd); -} /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) @@ -443,6 +237,12 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode /* Kill off the identity-map trampoline */ reset_early_page_tables(); + if (pgtable_l5_enabled()) { + page_offset_base = __PAGE_OFFSET_BASE_L5; + vmalloc_base = __VMALLOC_BASE_L5; + vmemmap_base = __VMEMMAP_BASE_L5; + } + clear_bss(); /* @@ -507,41 +307,6 @@ void __init __noreturn x86_64_start_reservations(char *real_mode_data) start_kernel(); } -/* - * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is - * used until the idt_table takes over. On the boot CPU this happens in - * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases - * this happens in the functions called from head_64.S. - * - * The idt_table can't be used that early because all the code modifying it is - * in idt.c and can be instrumented by tracing or KASAN, which both don't work - * during early CPU bringup. Also the idt_table has the runtime vectors - * configured which require certain CPU state to be setup already (like TSS), - * which also hasn't happened yet in early CPU bringup. - */ -static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; - -/* This may run while still in the direct mapping */ -static void __head startup_64_load_idt(void *vc_handler) -{ - struct desc_ptr desc = { - .address = (unsigned long)&RIP_REL_REF(bringup_idt_table), - .size = sizeof(bringup_idt_table) - 1, - }; - struct idt_data data; - gate_desc idt_desc; - - /* @vc_handler is set only for a VMM Communication Exception */ - if (vc_handler) { - init_idt_data(&data, X86_TRAP_VC, vc_handler); - idt_init_desc(&idt_desc, &data); - native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc); - } - - native_load_idt(&desc); -} - -/* This is used when running on kernel addresses */ void early_setup_idt(void) { void *handler = NULL; @@ -553,29 +318,3 @@ void early_setup_idt(void) startup_64_load_idt(handler); } - -/* - * Setup boot CPU state needed before kernel switches to virtual addresses. - */ -void __head startup_64_setup_gdt_idt(void) -{ - void *handler = NULL; - - struct desc_ptr startup_gdt_descr = { - .address = (unsigned long)&RIP_REL_REF(init_per_cpu_var(gdt_page.gdt)), - .size = GDT_SIZE - 1, - }; - - /* Load GDT */ - native_load_gdt(&startup_gdt_descr); - - /* New GDT is live - reload data segment registers */ - asm volatile("movl %%eax, %%ds\n" - "movl %%eax, %%ss\n" - "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); - - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) - handler = &RIP_REL_REF(vc_no_ghcb); - - startup_64_load_idt(handler); -} diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b50f3641c4d6..76743dfad6ab 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -44,9 +44,6 @@ #define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id - -#define SIZEOF_PTREGS 17*4 - /* * Worst-case size of the kernel mapping we need to make: * a relocatable kernel can live anywhere in lowmem, so we need to be able @@ -89,7 +86,7 @@ SYM_CODE_START(startup_32) movl $pa(__bss_stop),%ecx subl %edi,%ecx shrl $2,%ecx - rep ; stosl + rep stosl /* * Copy bootup parameters out of the way. * Note: %esi still has the pointer to the real-mode data. @@ -101,15 +98,13 @@ SYM_CODE_START(startup_32) movl $pa(boot_params),%edi movl $(PARAM_SIZE/4),%ecx cld - rep - movsl + rep movsl movl pa(boot_params) + NEW_CL_POINTER,%esi andl %esi,%esi jz 1f # No command line movl $pa(boot_command_line),%edi movl $(COMMAND_LINE_SIZE/4),%ecx - rep - movsl + rep movsl 1: #ifdef CONFIG_OLPC @@ -488,19 +483,13 @@ SYM_DATA_END(initial_page_table) .data .balign 4 -/* - * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder - * reliably detect the end of the stack. - */ -SYM_DATA(initial_stack, - .long init_thread_union + THREAD_SIZE - - SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING) +SYM_DATA(initial_stack, .long __top_init_kernel_stack) __INITRODATA int_msg: .asciz "Unknown interrupt or fault at: %p %p %p\n" -#include "../../x86/xen/xen-head.S" +#include "../xen/xen-head.S" /* * The IDT and GDT 'descriptors' are a strange 48-bit object diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d8198fbd70e5..3e9b3a3bd039 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -32,13 +32,6 @@ * We are not able to switch in one step to the final KERNEL ADDRESS SPACE * because we need identity-mapped pages. */ -#define l4_index(x) (((x) >> 39) & 511) -#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) - -L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) -L4_START_KERNEL = l4_index(__START_KERNEL_map) - -L3_START_KERNEL = pud_index(__START_KERNEL_map) __HEAD .code64 @@ -66,13 +59,16 @@ SYM_CODE_START_NOALIGN(startup_64) mov %rsi, %r15 /* Set up the stack for verify_cpu() */ - leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp + leaq __top_init_kernel_stack(%rip), %rsp - /* Setup GSBASE to allow stack canary access for C code */ + /* + * Set up GSBASE. + * Note that on SMP the boot CPU uses the init data section until + * the per-CPU areas are set up. + */ movl $MSR_GS_BASE, %ecx - leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx - movl %edx, %eax - shrq $32, %rdx + xorl %eax, %eax + xorl %edx, %edx wrmsr call startup_64_setup_gdt_idt @@ -84,6 +80,7 @@ SYM_CODE_START_NOALIGN(startup_64) lretq .Lon_kernel_cs: + ANNOTATE_NOENDBR UNWIND_HINT_END_OF_STACK #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -101,12 +98,18 @@ SYM_CODE_START_NOALIGN(startup_64) call verify_cpu /* + * Derive the kernel's physical-to-virtual offset from the physical and + * virtual addresses of common_startup_64(). + */ + leaq common_startup_64(%rip), %rdi + subq .Lcommon_startup_64(%rip), %rdi + + /* * Perform pagetable fixups. Additionally, if SME is active, encrypt * the kernel and retrieve the modifier (SME encryption mask if SME * is active) to be added to the initial pgdir entry that will be * programmed into CR3. */ - leaq _text(%rip), %rdi movq %r15, %rsi call __startup_64 @@ -134,11 +137,11 @@ SYM_CODE_START_NOALIGN(startup_64) /* Branch to the common startup code at its kernel virtual address */ ANNOTATE_RETPOLINE_SAFE - jmp *0f(%rip) + jmp *.Lcommon_startup_64(%rip) SYM_CODE_END(startup_64) __INITRODATA -0: .quad common_startup_64 +SYM_DATA_LOCAL(.Lcommon_startup_64, .quad common_startup_64) .text SYM_CODE_START(secondary_startup_64) @@ -319,7 +322,7 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL) * * RDX contains the per-cpu offset */ - movq pcpu_hot + X86_current_task(%rdx), %rax + movq current_task(%rdx), %rax movq TASK_threadsp(%rax), %rsp /* @@ -359,17 +362,12 @@ SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL) movl %eax,%fs movl %eax,%gs - /* Set up %gs. - * - * The base of %gs always points to fixed_percpu_data. If the - * stack protector canary is enabled, it is located at %gs:40. + /* + * Set up GSBASE. * Note that, on SMP, the boot cpu uses init data section until * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx -#ifndef CONFIG_SMP - leaq INIT_PER_CPU_VAR(fixed_percpu_data)(%rip), %rdx -#endif movl %edx, %eax shrq $32, %rdx wrmsr @@ -435,7 +433,7 @@ SYM_CODE_START(soft_restart_cpu) UNWIND_HINT_END_OF_STACK /* Find the idle task stack */ - movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx + movq PER_CPU_VAR(current_task), %rcx movq TASK_threadsp(%rcx), %rsp jmp .Ljump_to_C_code @@ -575,11 +573,9 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) /* Pure iret required here - don't use INTERRUPT_RETURN */ iretq SYM_CODE_END(vc_no_ghcb) +SYM_PIC_ALIAS(vc_no_ghcb); #endif -#define SYM_DATA_START_PAGE_ALIGNED(name) \ - SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE) - #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Each PGD needs to be 8k long and 8k aligned. We do not @@ -601,14 +597,6 @@ SYM_CODE_END(vc_no_ghcb) #define PTI_USER_PGD_FILL 0 #endif -/* Automate the creation of 1 to 1 mapping pmd entries */ -#define PMDS(START, PERM, COUNT) \ - i = 0 ; \ - .rept (COUNT) ; \ - .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ - i = i + 1 ; \ - .endr - __INITDATA .balign 4 @@ -617,10 +605,12 @@ SYM_DATA_START_PTI_ALIGNED(early_top_pgt) .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 SYM_DATA_END(early_top_pgt) +SYM_PIC_ALIAS(early_top_pgt) SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 SYM_DATA_END(early_dynamic_pgts) +SYM_PIC_ALIAS(early_dynamic_pgts); SYM_DATA(early_recursion_flag, .long 0) @@ -659,12 +649,11 @@ SYM_DATA_START_PTI_ALIGNED(init_top_pgt) SYM_DATA_END(init_top_pgt) #endif -#ifdef CONFIG_X86_5LEVEL SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC SYM_DATA_END(level4_kernel_pgt) -#endif +SYM_PIC_ALIAS(level4_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 @@ -672,6 +661,7 @@ SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC SYM_DATA_END(level3_kernel_pgt) +SYM_PIC_ALIAS(level3_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) /* @@ -689,6 +679,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) */ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) SYM_DATA_END(level2_kernel_pgt) +SYM_PIC_ALIAS(level2_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 @@ -701,6 +692,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) /* 6 MB reserved space + a 2MB hole */ .fill 4,8,0 SYM_DATA_END(level2_fixmap_pgt) +SYM_PIC_ALIAS(level2_fixmap_pgt) SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) .rept (FIXMAP_PMD_NUM) @@ -708,8 +700,6 @@ SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) .endr SYM_DATA_END(level1_fixmap_pgt) -#undef PMDS - .data .align 16 @@ -718,9 +708,10 @@ SYM_DATA(smpboot_control, .long 0) .align 16 /* This must match the first entry in level2_kernel_pgt */ SYM_DATA(phys_base, .quad 0x0) +SYM_PIC_ALIAS(phys_base); EXPORT_SYMBOL(phys_base) -#include "../../x86/xen/xen-head.S" +#include "../xen/xen-head.S" __PAGE_ALIGNED_BSS SYM_DATA_START_PAGE_ALIGNED(empty_zero_page) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index c96ae8fee95e..d6387dde3ff9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -7,10 +7,12 @@ #include <linux/cpu.h> #include <linux/irq.h> +#include <asm/cpuid/api.h> #include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/time.h> #include <asm/mwait.h> +#include <asm/msr.h> #undef pr_fmt #define pr_fmt(fmt) "hpet: " fmt @@ -516,22 +518,14 @@ static int hpet_msi_init(struct irq_domain *domain, struct msi_domain_info *info, unsigned int virq, irq_hw_number_t hwirq, msi_alloc_info_t *arg) { - irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, handle_edge_irq, arg->data, "edge"); return 0; } -static void hpet_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq) -{ - irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); -} - static struct msi_domain_ops hpet_msi_domain_ops = { .msi_init = hpet_msi_init, - .msi_free = hpet_msi_free, }; static struct msi_domain_info hpet_msi_domain_info = { @@ -927,10 +921,7 @@ static bool __init mwait_pc10_supported(void) if (!cpu_feature_enabled(X86_FEATURE_MWAIT)) return false; - if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) - return false; - - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates); + cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &mwait_substates); return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) && (ecx & CPUID5_ECX_INTERRUPT_BREAK) && @@ -980,7 +971,7 @@ static bool __init hpet_is_pc10_damaged(void) return false; /* Check whether PC10 is enabled in PKG C-state limit */ - rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); + rdmsrq(MSR_PKG_CST_CONFIG_CONTROL, pcfg); if ((pcfg & 0xF) < 8) return false; @@ -1392,12 +1383,6 @@ int hpet_set_periodic_freq(unsigned long freq) } EXPORT_SYMBOL_GPL(hpet_set_periodic_freq); -int hpet_rtc_dropped_irq(void) -{ - return is_hpet_enabled(); -} -EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); - static void hpet_rtc_timer_reinit(void) { unsigned int delta; diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 2b7999a1a50a..cb9852ad6098 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -8,6 +8,7 @@ #include <linux/timex.h> #include <linux/i8253.h> +#include <asm/hypervisor.h> #include <asm/apic.h> #include <asm/hpet.h> #include <asm/time.h> @@ -39,9 +40,16 @@ static bool __init use_pit(void) bool __init pit_timer_init(void) { - if (!use_pit()) + if (!use_pit()) { + /* + * Don't just ignore the PIT. Ensure it's stopped, because + * VMMs otherwise steal CPU time just to pointlessly waggle + * the (masked) IRQ. + */ + scoped_guard(irq) + clockevent_i8253_disable(); return false; - + } clockevent_i8253_init(true); global_clock_event = &i8253_clockevent; return true; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index c20d1832c481..2bade73f49e3 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -23,6 +23,7 @@ #include <asm/desc.h> #include <asm/apic.h> #include <asm/i8259.h> +#include <asm/io_apic.h> /* * This is the 'legacy' 8259A Programmable Interrupt Controller, diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index fc37c8d83daf..f445bec516a0 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = { # endif INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt), INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt), +# ifdef CONFIG_X86_POSTED_MSI + INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification), +# endif #endif }; diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index e2fab3ceb09f..ff40f09ad911 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -33,8 +33,9 @@ void io_bitmap_share(struct task_struct *tsk) set_tsk_thread_flag(tsk, TIF_IO_BITMAP); } -static void task_update_io_bitmap(struct task_struct *tsk) +static void task_update_io_bitmap(void) { + struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; if (t->iopl_emul == 3 || t->io_bitmap) { @@ -54,7 +55,12 @@ void io_bitmap_exit(struct task_struct *tsk) struct io_bitmap *iobm = tsk->thread.io_bitmap; tsk->thread.io_bitmap = NULL; - task_update_io_bitmap(tsk); + /* + * Don't touch the TSS when invoked on a failed fork(). TSS + * reflects the state of @current and not the state of @tsk. + */ + if (tsk == current) + task_update_io_bitmap(); if (iobm && refcount_dec_and_test(&iobm->refcnt)) kfree(iobm); } @@ -144,7 +150,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) * Update the sequence number to force a TSS update on return to * user mode. */ - iobm->sequence = atomic64_add_return(1, &io_bitmap_sequence); + iobm->sequence = atomic64_inc_return(&io_bitmap_sequence); return 0; } @@ -192,8 +198,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) } t->iopl_emul = level; - task_update_io_bitmap(current); - + task_update_io_bitmap(); return 0; } diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 35fde0107901..9ed29ff10e59 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -22,13 +22,22 @@ #include <asm/desc.h> #include <asm/traps.h> #include <asm/thermal.h> +#include <asm/posted_intr.h> +#include <asm/irq_remapping.h> +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_THERMAL_VECTOR) #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> +#endif DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); +DEFINE_PER_CPU_CACHE_HOT(u16, __softirq_pending); +EXPORT_PER_CPU_SYMBOL(__softirq_pending); + +DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, hardirq_stack_ptr); + atomic_t irq_err_count; /* @@ -182,6 +191,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->kvm_posted_intr_wakeup_ipis); seq_puts(p, " Posted-interrupt wakeup event\n"); #endif +#ifdef CONFIG_X86_POSTED_MSI + seq_printf(p, "%*s: ", prec, "PMN"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->posted_msi_notification_count); + seq_puts(p, " Posted MSI notification event\n"); +#endif return 0; } @@ -240,24 +256,16 @@ static __always_inline void handle_irq(struct irq_desc *desc, __handle_irq(desc, regs); } -/* - * common_interrupt() handles all normal device IRQ's (the special SMP - * cross-CPU interrupts have their own entry points). - */ -DEFINE_IDTENTRY_IRQ(common_interrupt) +static __always_inline int call_irq_handler(int vector, struct pt_regs *regs) { - struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc; - - /* entry code tells RCU that we're not quiescent. Check it. */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + int ret = 0; desc = __this_cpu_read(vector_irq[vector]); if (likely(!IS_ERR_OR_NULL(desc))) { handle_irq(desc, regs); } else { - apic_eoi(); - + ret = -EINVAL; if (desc == VECTOR_UNUSED) { pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n", __func__, smp_processor_id(), @@ -267,6 +275,23 @@ DEFINE_IDTENTRY_IRQ(common_interrupt) } } + return ret; +} + +/* + * common_interrupt() handles all normal device IRQ's (the special SMP + * cross-CPU interrupts have their own entry points). + */ +DEFINE_IDTENTRY_IRQ(common_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* entry code tells RCU that we're not quiescent. Check it. */ + RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU"); + + if (unlikely(call_irq_handler(vector, regs))) + apic_eoi(); + set_irq_regs(old_regs); } @@ -334,12 +359,96 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi) } #endif +#ifdef CONFIG_X86_POSTED_MSI + +/* Posted Interrupt Descriptors for coalesced MSIs to be posted */ +DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc); + +void intel_posted_msi_init(void) +{ + u32 destination; + u32 apic_id; + + this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR); + + /* + * APIC destination ID is stored in bit 8:15 while in XAPIC mode. + * VT-d spec. CH 9.11 + */ + apic_id = this_cpu_read(x86_cpu_to_apicid); + destination = x2apic_enabled() ? apic_id : apic_id << 8; + this_cpu_write(posted_msi_pi_desc.ndst, destination); +} + +static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs) +{ + unsigned long pir_copy[NR_PIR_WORDS]; + int vec = FIRST_EXTERNAL_VECTOR; + + if (!pi_harvest_pir(pir, pir_copy)) + return false; + + for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) + call_irq_handler(vec, regs); + + return true; +} + +/* + * Performance data shows that 3 is good enough to harvest 90+% of the benefit + * on high IRQ rate workload. + */ +#define MAX_POSTED_MSI_COALESCING_LOOP 3 + +/* + * For MSIs that are delivered as posted interrupts, the CPU notifications + * can be coalesced if the MSIs arrive in high frequency bursts. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + struct pi_desc *pid; + int i = 0; + + pid = this_cpu_ptr(&posted_msi_pi_desc); + + inc_irq_stat(posted_msi_notification_count); + irq_enter(); + + /* + * Max coalescing count includes the extra round of handle_pending_pir + * after clearing the outstanding notification bit. Hence, at most + * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. + */ + while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { + if (!handle_pending_pir(pid->pir, regs)) + break; + } + + /* + * Clear outstanding notification bit to allow new IRQ notifications, + * do this last to maximize the window of interrupt coalescing. + */ + pi_clear_on(pid); + + /* + * There could be a race of PI notification and the clearing of ON bit, + * process PIR bits one last time such that handling the new interrupts + * are not delayed until the next IRQ. + */ + handle_pending_pir(pid->pir, regs); + + apic_eoi(); + irq_exit(); + set_irq_regs(old_regs); +} +#endif /* X86_POSTED_MSI */ #ifdef CONFIG_HOTPLUG_CPU /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ void fixup_irqs(void) { - unsigned int irr, vector; + unsigned int vector; struct irq_desc *desc; struct irq_data *data; struct irq_chip *chip; @@ -366,8 +475,7 @@ void fixup_irqs(void) if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector]))) continue; - irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); - if (irr & (1 << (vector % 32))) { + if (is_vector_pending(vector)) { desc = __this_cpu_read(vector_irq[vector]); raw_spin_lock(&desc->lock); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index dc1049c01f9b..c7a5d2960d57 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -29,12 +29,9 @@ int sysctl_panic_on_stackoverflow __read_mostly; /* Debugging check for stack overflow: is there less than 1KB free? */ -static int check_stack_overflow(void) +static bool check_stack_overflow(void) { - long sp; - - __asm__ __volatile__("andl %%esp,%0" : - "=r" (sp) : "0" (THREAD_SIZE - 1)); + unsigned long sp = current_stack_pointer & (THREAD_SIZE - 1); return sp < (sizeof(struct thread_info) + STACK_WARN); } @@ -48,18 +45,19 @@ static void print_stack_overflow(void) } #else -static inline int check_stack_overflow(void) { return 0; } +static inline bool check_stack_overflow(void) { return false; } static inline void print_stack_overflow(void) { } #endif +DEFINE_PER_CPU_CACHE_HOT(struct irq_stack *, softirq_stack_ptr); + static void call_on_stack(void *func, void *stack) { - asm volatile("xchgl %%ebx,%%esp \n" + asm volatile("xchgl %[sp], %%esp\n" CALL_NOSPEC - "movl %%ebx,%%esp \n" - : "=b" (stack) - : "0" (stack), - [thunk_target] "D"(func) + "movl %[sp], %%esp" + : [sp] "+b" (stack) + : [thunk_target] "D" (func) : "memory", "cc", "edx", "ecx", "eax"); } @@ -68,13 +66,13 @@ static inline void *current_stack(void) return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); } -static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) +static inline bool execute_on_irq_stack(bool overflow, struct irq_desc *desc) { struct irq_stack *curstk, *irqstk; - u32 *isp, *prev_esp, arg1; + u32 *isp, *prev_esp; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(pcpu_hot.hardirq_stack_ptr); + irqstk = __this_cpu_read(hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. However, if we are @@ -83,7 +81,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) * current stack (which is the irq stack already after all) */ if (unlikely(curstk == irqstk)) - return 0; + return false; isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); @@ -94,14 +92,13 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); - asm volatile("xchgl %%ebx,%%esp \n" + asm volatile("xchgl %[sp], %%esp\n" CALL_NOSPEC - "movl %%ebx,%%esp \n" - : "=a" (arg1), "=b" (isp) - : "0" (desc), "1" (isp), - [thunk_target] "D" (desc->handle_irq) - : "memory", "cc", "ecx"); - return 1; + "movl %[sp], %%esp" + : "+a" (desc), [sp] "+b" (isp) + : [thunk_target] "D" (desc->handle_irq) + : "memory", "cc", "edx", "ecx"); + return true; } /* @@ -112,7 +109,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) int node = cpu_to_node(cpu); struct page *ph, *ps; - if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) + if (per_cpu(hardirq_stack_ptr, cpu)) return 0; ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); @@ -124,8 +121,8 @@ int irq_init_percpu_irqstack(unsigned int cpu) return -ENOMEM; } - per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = page_address(ph); - per_cpu(pcpu_hot.softirq_stack_ptr, cpu) = page_address(ps); + per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(softirq_stack_ptr, cpu) = page_address(ps); return 0; } @@ -135,7 +132,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(pcpu_hot.softirq_stack_ptr); + irqstk = __this_cpu_read(softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); @@ -150,7 +147,7 @@ void do_softirq_own_stack(void) void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { - int overflow = check_stack_overflow(); + bool overflow = check_stack_overflow(); if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) { if (unlikely(overflow)) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index fe0c859873d1..ca78dce39361 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,6 +18,7 @@ #include <linux/uaccess.h> #include <linux/smp.h> #include <linux/sched/task_stack.h> +#include <linux/vmalloc.h> #include <asm/cpu_entry_area.h> #include <asm/softirq_stack.h> @@ -25,8 +26,8 @@ #include <asm/io_apic.h> #include <asm/apic.h> +DEFINE_PER_CPU_CACHE_HOT(bool, hardirq_stack_inuse); DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; -DECLARE_INIT_PER_CPU(irq_stack_backing_store); #ifdef CONFIG_VMAP_STACK /* @@ -50,7 +51,7 @@ static int map_irq_stack(unsigned int cpu) return -ENOMEM; /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #else @@ -63,14 +64,14 @@ static int map_irq_stack(unsigned int cpu) void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); /* Store actual TOS to avoid adjustment in the hotpath */ - per_cpu(pcpu_hot.hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8; return 0; } #endif int irq_init_percpu_irqstack(unsigned int cpu) { - if (per_cpu(pcpu_hot.hardirq_stack_ptr, cpu)) + if (per_cpu(hardirq_stack_ptr, cpu)) return 0; return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/irqflags.S b/arch/x86/kernel/irqflags.S index 7f542a7799cb..fdabd5dda154 100644 --- a/arch/x86/kernel/irqflags.S +++ b/arch/x86/kernel/irqflags.S @@ -9,6 +9,7 @@ */ .pushsection .noinstr.text, "ax" SYM_FUNC_START(native_save_fl) + ENDBR pushf pop %_ASM_AX RET diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 9a7c03d47861..9cea1fc36c18 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -19,6 +19,7 @@ #include <linux/sched.h> #include <linux/cpumask.h> #include <linux/cpuset.h> +#include <linux/debugfs.h> #include <linux/mutex.h> #include <linux/sysctl.h> #include <linux/nodemask.h> @@ -34,49 +35,38 @@ static bool __read_mostly sched_itmt_capable; * of higher turbo frequency for cpus supporting Intel Turbo Boost Max * Technology 3.0. * - * It can be set via /proc/sys/kernel/sched_itmt_enabled + * It can be set via /sys/kernel/debug/x86/sched_itmt_enabled */ -unsigned int __read_mostly sysctl_sched_itmt_enabled; +bool __read_mostly sysctl_sched_itmt_enabled; -static int sched_itmt_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static ssize_t sched_itmt_enabled_write(struct file *filp, + const char __user *ubuf, + size_t cnt, loff_t *ppos) { - unsigned int old_sysctl; - int ret; + ssize_t result; + bool orig; - mutex_lock(&itmt_update_mutex); + guard(mutex)(&itmt_update_mutex); - if (!sched_itmt_capable) { - mutex_unlock(&itmt_update_mutex); - return -EINVAL; - } - - old_sysctl = sysctl_sched_itmt_enabled; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + orig = sysctl_sched_itmt_enabled; + result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); - if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) { + if (sysctl_sched_itmt_enabled != orig) { x86_topology_update = true; rebuild_sched_domains(); } - mutex_unlock(&itmt_update_mutex); - - return ret; + return result; } -static struct ctl_table itmt_kern_table[] = { - { - .procname = "sched_itmt_enabled", - .data = &sysctl_sched_itmt_enabled, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_itmt_update_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, +static const struct file_operations dfs_sched_itmt_fops = { + .read = debugfs_read_file_bool, + .write = sched_itmt_enabled_write, + .open = simple_open, + .llseek = default_llseek, }; -static struct ctl_table_header *itmt_sysctl_header; +static struct dentry *dfs_sched_itmt; /** * sched_set_itmt_support() - Indicate platform supports ITMT @@ -97,16 +87,18 @@ static struct ctl_table_header *itmt_sysctl_header; */ int sched_set_itmt_support(void) { - mutex_lock(&itmt_update_mutex); + guard(mutex)(&itmt_update_mutex); - if (sched_itmt_capable) { - mutex_unlock(&itmt_update_mutex); + if (sched_itmt_capable) return 0; - } - itmt_sysctl_header = register_sysctl("kernel", itmt_kern_table); - if (!itmt_sysctl_header) { - mutex_unlock(&itmt_update_mutex); + dfs_sched_itmt = debugfs_create_file_unsafe("sched_itmt_enabled", + 0644, + arch_debugfs_dir, + &sysctl_sched_itmt_enabled, + &dfs_sched_itmt_fops); + if (IS_ERR_OR_NULL(dfs_sched_itmt)) { + dfs_sched_itmt = NULL; return -ENOMEM; } @@ -117,8 +109,6 @@ int sched_set_itmt_support(void) x86_topology_update = true; rebuild_sched_domains(); - mutex_unlock(&itmt_update_mutex); - return 0; } @@ -134,18 +124,15 @@ int sched_set_itmt_support(void) */ void sched_clear_itmt_support(void) { - mutex_lock(&itmt_update_mutex); + guard(mutex)(&itmt_update_mutex); - if (!sched_itmt_capable) { - mutex_unlock(&itmt_update_mutex); + if (!sched_itmt_capable) return; - } + sched_itmt_capable = false; - if (itmt_sysctl_header) { - unregister_sysctl_table(itmt_sysctl_header); - itmt_sysctl_header = NULL; - } + debugfs_remove(dfs_sched_itmt); + dfs_sched_itmt = NULL; if (sysctl_sched_itmt_enabled) { /* disable sched_itmt if we are no longer ITMT capable */ @@ -153,8 +140,6 @@ void sched_clear_itmt_support(void) x86_topology_update = true; rebuild_sched_domains(); } - - mutex_unlock(&itmt_update_mutex); } int arch_asym_cpu_priority(int cpu) diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index df337860612d..9e9a591a5fec 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -12,6 +12,7 @@ #include <linux/kernel.h> #include <linux/reboot.h> #include <linux/serial_8250.h> +#include <linux/acpi.h> #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/acpi.h> @@ -48,7 +49,7 @@ static uint32_t jailhouse_cpuid_base(void) !boot_cpu_has(X86_FEATURE_HYPERVISOR)) return 0; - return hypervisor_cpuid_base("Jailhouse\0\0\0", 0); + return cpuid_base_hypervisor("Jailhouse\0\0\0", 0); } static uint32_t __init jailhouse_detect(void) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index f5b8ef02d172..a7949a54a0ff 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -102,7 +102,7 @@ __jump_label_transform(struct jump_entry *entry, return; } - text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + smp_text_poke_single((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, @@ -135,7 +135,7 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, mutex_lock(&text_mutex); jlp = __jump_label_patch(entry, type); - text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + smp_text_poke_batch_add((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return true; } @@ -143,6 +143,6 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, void arch_jump_label_transform_apply(void) { mutex_lock(&text_mutex); - text_poke_finish(); + smp_text_poke_batch_finish(); mutex_unlock(&text_mutex); } diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 68530fad05f7..24a41f0e0cf1 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -27,6 +27,8 @@ #include <asm/kexec-bzimage64.h> #define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ +#define MAX_DMCRYPTKEYS_STR_LEN 31 /* dmcryptkeys=0x<64bit-value> */ + /* * Defines lowest physical address for various segments. Not sure where @@ -76,6 +78,10 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params, if (image->type == KEXEC_TYPE_CRASH) { len = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ", image->elf_load_addr); + + if (image->dm_crypt_keys_addr != 0) + len += sprintf(cmdline_ptr + len, + "dmcryptkeys=0x%lx ", image->dm_crypt_keys_addr); } memcpy(cmdline_ptr + len, cmdline, cmdline_len); cmdline_len += len; @@ -233,6 +239,32 @@ setup_ima_state(const struct kimage *image, struct boot_params *params, #endif /* CONFIG_IMA_KEXEC */ } +static void setup_kho(const struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int setup_data_offset) +{ + struct setup_data *sd = (void *)params + setup_data_offset; + struct kho_data *kho = (void *)sd + sizeof(*sd); + + if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) + return; + + sd->type = SETUP_KEXEC_KHO; + sd->len = sizeof(struct kho_data); + + /* Only add if we have all KHO images in place */ + if (!image->kho.fdt || !image->kho.scratch) + return; + + /* Add setup data */ + kho->fdt_addr = image->kho.fdt; + kho->fdt_size = PAGE_SIZE; + kho->scratch_addr = image->kho.scratch->mem; + kho->scratch_size = image->kho.scratch->bufsz; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = params_load_addr + setup_data_offset; +} + static int setup_boot_parameters(struct kimage *image, struct boot_params *params, unsigned long params_load_addr, @@ -312,6 +344,13 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, sizeof(struct ima_setup_data); } + if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) { + /* Setup space to store preservation metadata */ + setup_kho(image, params, params_load_addr, setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + sizeof(struct kho_data); + } + /* Setup RNG seed */ setup_rng_seed(params, params_load_addr, setup_data_offset); @@ -441,6 +480,19 @@ static void *bzImage64_load(struct kimage *image, char *kernel, ret = crash_load_segments(image); if (ret) return ERR_PTR(ret); + ret = crash_load_dm_crypt_keys(image); + if (ret == -ENOENT) { + kexec_dprintk("No dm crypt key to load\n"); + } else if (ret) { + pr_err("Failed to load dm crypt keys\n"); + return ERR_PTR(ret); + } + if (image->dm_crypt_keys_addr && + cmdline_len + MAX_ELFCOREHDR_STR_LEN + MAX_DMCRYPTKEYS_STR_LEN > + header->cmdline_size) { + pr_err("Appending dmcryptkeys=<addr> to command line exceeds maximum allowed length\n"); + return ERR_PTR(-EINVAL); + } } #endif @@ -468,6 +520,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel, efi_map_sz = efi_get_runtime_map_size(); params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + MAX_ELFCOREHDR_STR_LEN; + if (image->dm_crypt_keys_addr) + params_cmdline_sz += MAX_DMCRYPTKEYS_STR_LEN; params_cmdline_sz = ALIGN(params_cmdline_sz, 16); kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) + sizeof(struct setup_data) + @@ -479,6 +533,10 @@ static void *bzImage64_load(struct kimage *image, char *kernel, kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct ima_setup_data); + if (IS_ENABLED(CONFIG_KEXEC_HANDOVER)) + kbuf.bufsz += sizeof(struct setup_data) + + sizeof(struct kho_data); + params = kzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 9c9faa1634fb..8b1a9733d13e 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -385,7 +385,7 @@ static void kgdb_disable_hw_debug(struct pt_regs *regs) struct perf_event *bp; /* Disable hardware debugging while we are in kgdb: */ - set_debugreg(0UL, 7); + set_debugreg(DR7_FIXED_1, 7); for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; @@ -655,7 +655,7 @@ void kgdb_arch_late(void) if (breakinfo[i].pev) continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL); - if (IS_ERR((void * __force)breakinfo[i].pev)) { + if (IS_ERR_PCPU(breakinfo[i].pev)) { printk(KERN_ERR "kgdb: Could not allocate hw" "breakpoints\nDisabling the kernel debugger\n"); breakinfo[i].pev = NULL; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index d0e49bd7c6f3..47cb8eb138ba 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -40,12 +40,12 @@ #include <linux/kgdb.h> #include <linux/ftrace.h> #include <linux/kasan.h> -#include <linux/moduleloader.h> #include <linux/objtool.h> #include <linux/vmalloc.h> #include <linux/pgtable.h> #include <linux/set_memory.h> #include <linux/cfi.h> +#include <linux/execmem.h> #include <asm/text-patching.h> #include <asm/cacheflush.h> @@ -373,16 +373,7 @@ out: kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, bool *on_func_entry) { - u32 insn; - - /* - * Since 'addr' is not guaranteed to be safe to access, use - * copy_from_kernel_nofault() to read the instruction: - */ - if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32))) - return NULL; - - if (is_endbr(insn)) { + if (is_endbr((u32 *)addr)) { *on_func_entry = !offset || offset == 4; if (*on_func_entry) offset = 4; @@ -495,7 +486,7 @@ void *alloc_insn_page(void) { void *page; - page = module_alloc(PAGE_SIZE); + page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE); if (!page) return NULL; @@ -817,7 +808,7 @@ void arch_arm_kprobe(struct kprobe *p) u8 int3 = INT3_INSN_OPCODE; text_poke(p->addr, &int3, 1); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); } @@ -827,7 +818,7 @@ void arch_disarm_kprobe(struct kprobe *p) perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); text_poke(p->addr, &p->opcode, 1); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } void arch_remove_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index dd2ec14adb77..2be55ec3f392 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -9,6 +9,7 @@ #include <linux/hardirq.h> #include <linux/preempt.h> #include <linux/ftrace.h> +#include <asm/text-patching.h> #include "common.h" @@ -21,6 +22,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; @@ -33,23 +37,25 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (kprobe_running()) { kprobes_inc_nmissed_count(p); } else { - unsigned long orig_ip = regs->ip; + unsigned long orig_ip = instruction_pointer(regs); + /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ - regs->ip = ip + sizeof(kprobe_opcode_t); + instruction_pointer_set(regs, ip + INT3_INSN_SIZE); __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) { - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; if (unlikely(p->post_handler)) { + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE); kcb->kprobe_status = KPROBE_HIT_SSDONE; p->post_handler(p, regs, 0); } - regs->ip = orig_ip; + /* Recover IP address */ + instruction_pointer_set(regs, orig_ip); } /* * If pre_handler returns !0, it changes regs->ip. We have to diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 36d6809c6c9e..0aabd4c4e2c4 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -488,7 +488,7 @@ void arch_optimize_kprobes(struct list_head *oplist) insn_buff[0] = JMP32_INSN_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); + smp_text_poke_single(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); list_del_init(&op->list); } @@ -513,11 +513,11 @@ void arch_unoptimize_kprobe(struct optimized_kprobe *op) JMP32_INSN_SIZE - INT3_INSN_SIZE); text_poke(addr, new, INT3_INSN_SIZE); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); text_poke(addr + INT3_INSN_SIZE, new + INT3_INSN_SIZE, JMP32_INSN_SIZE - INT3_INSN_SIZE); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); } diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 257892fcefa7..b68d4be9464e 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -28,19 +28,19 @@ static ssize_t version_show(struct kobject *kobj, static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version); static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { memcpy(buf, (void *)&boot_params + off, count); return count; } -static struct bin_attribute boot_params_data_attr = { +static const struct bin_attribute boot_params_data_attr = { .attr = { .name = "data", .mode = S_IRUGO, }, - .read = boot_params_data_read, + .read_new = boot_params_data_read, .size = sizeof(boot_params), }; @@ -49,14 +49,14 @@ static struct attribute *boot_params_version_attrs[] = { NULL, }; -static struct bin_attribute *boot_params_data_attrs[] = { +static const struct bin_attribute *const boot_params_data_attrs[] = { &boot_params_data_attr, NULL, }; static const struct attribute_group boot_params_attr_group = { .attrs = boot_params_version_attrs, - .bin_attrs = boot_params_data_attrs, + .bin_attrs_new = boot_params_data_attrs, }; static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr) @@ -172,7 +172,7 @@ static ssize_t type_show(struct kobject *kobj, static ssize_t setup_data_data_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { @@ -250,7 +250,7 @@ static struct bin_attribute data_attr __ro_after_init = { .name = "data", .mode = S_IRUGO, }, - .read = setup_data_data_read, + .read_new = setup_data_data_read, }; static struct attribute *setup_data_type_attrs[] = { @@ -258,14 +258,14 @@ static struct attribute *setup_data_type_attrs[] = { NULL, }; -static struct bin_attribute *setup_data_data_attrs[] = { +static const struct bin_attribute *const setup_data_data_attrs[] = { &data_attr, NULL, }; static const struct attribute_group setup_data_attr_group = { .attrs = setup_data_type_attrs, - .bin_attrs = setup_data_data_attrs, + .bin_attrs_new = setup_data_data_attrs, }; static int __init create_setup_data_node(struct kobject *parent, diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4cadfd606e8e..921c1c783bc1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -37,14 +37,16 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> +#include <asm/mtrr.h> #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> +#include <asm/msr.h> #include <asm/ptrace.h> #include <asm/reboot.h> #include <asm/svm.h> #include <asm/e820/api.h> -DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); +DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled); static int kvmapf = 1; @@ -65,6 +67,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); +static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled); static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; @@ -244,7 +247,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void) { u32 flags = 0; - if (__this_cpu_read(apf_reason.enabled)) { + if (__this_cpu_read(async_pf_enabled)) { flags = __this_cpu_read(apf_reason.flags); __this_cpu_write(apf_reason.flags, 0); } @@ -295,11 +298,11 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) inc_irq_stat(irq_hv_callback_count); - if (__this_cpu_read(apf_reason.enabled)) { + if (__this_cpu_read(async_pf_enabled)) { token = __this_cpu_read(apf_reason.token); kvm_async_pf_task_wake(token); __this_cpu_write(apf_reason.token, 0); - wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1); + wrmsrq(MSR_KVM_ASYNC_PF_ACK, 1); } set_irq_regs(old_regs); @@ -325,7 +328,7 @@ static void kvm_register_steal_time(void) if (!has_steal_clock) return; - wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); + wrmsrq(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); pr_debug("stealtime: cpu %d, msr %llx\n", cpu, (unsigned long long) slow_virt_to_phys(st)); } @@ -359,10 +362,10 @@ static void kvm_guest_cpu_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; - wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); + wrmsrq(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); - wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); - __this_cpu_write(apf_reason.enabled, 1); + wrmsrq(MSR_KVM_ASYNC_PF_EN, pa); + __this_cpu_write(async_pf_enabled, true); pr_debug("setup async PF for cpu %d\n", smp_processor_id()); } @@ -374,7 +377,7 @@ static void kvm_guest_cpu_init(void) __this_cpu_write(kvm_apic_eoi, 0); pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi)) | KVM_MSR_ENABLED; - wrmsrl(MSR_KVM_PV_EOI_EN, pa); + wrmsrq(MSR_KVM_PV_EOI_EN, pa); } if (has_steal_clock) @@ -383,11 +386,11 @@ static void kvm_guest_cpu_init(void) static void kvm_pv_disable_apf(void) { - if (!__this_cpu_read(apf_reason.enabled)) + if (!__this_cpu_read(async_pf_enabled)) return; - wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); - __this_cpu_write(apf_reason.enabled, 0); + wrmsrq(MSR_KVM_ASYNC_PF_EN, 0); + __this_cpu_write(async_pf_enabled, false); pr_debug("disable async PF for cpu %d\n", smp_processor_id()); } @@ -397,7 +400,7 @@ static void kvm_disable_steal_time(void) if (!has_steal_clock) return; - wrmsr(MSR_KVM_STEAL_TIME, 0, 0); + wrmsrq(MSR_KVM_STEAL_TIME, 0); } static u64 kvm_steal_clock(int cpu) @@ -449,9 +452,9 @@ static void kvm_guest_cpu_offline(bool shutdown) { kvm_disable_steal_time(); if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - wrmsrl(MSR_KVM_PV_EOI_EN, 0); + wrmsrq(MSR_KVM_PV_EOI_EN, 0); if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) - wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0); + wrmsrq(MSR_KVM_MIGRATION_CONTROL, 0); kvm_pv_disable_apf(); if (!shutdown) apf_task_wake_all(); @@ -613,7 +616,7 @@ static int __init setup_efi_kvm_sev_migration(void) } pr_info("%s : live migration enabled in EFI\n", __func__); - wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); + wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); return 1; } @@ -726,7 +729,7 @@ static int kvm_suspend(void) #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) - rdmsrl(MSR_KVM_POLL_CONTROL, val); + rdmsrq(MSR_KVM_POLL_CONTROL, val); has_guest_poll = !(val & 1); #endif return 0; @@ -738,7 +741,7 @@ static void kvm_resume(void) #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) - wrmsrl(MSR_KVM_POLL_CONTROL, 0); + wrmsrq(MSR_KVM_POLL_CONTROL, 0); #endif } @@ -836,7 +839,6 @@ static void __init kvm_guest_init(void) #ifdef CONFIG_SMP if (pv_tlb_flush_supported()) { pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; pr_info("KVM setup pv remote TLB flush\n"); } @@ -873,7 +875,7 @@ static noinline uint32_t __kvm_cpuid_base(void) return 0; /* So we don't blow up on old processors */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return hypervisor_cpuid_base(KVM_SIGNATURE, 0); + return cpuid_base_hypervisor(KVM_SIGNATURE, 0); return 0; } @@ -974,11 +976,14 @@ static void __init kvm_init_platform(void) * If not booted using EFI, enable Live migration support. */ if (!efi_enabled(EFI_BOOT)) - wrmsrl(MSR_KVM_MIGRATION_CONTROL, + wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); } kvmclock_init(); x86_platform.apic_post_init = kvm_apic_init; + + /* Set WB as the default cache mode for SEV-SNP and TDX */ + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); } #if defined(CONFIG_AMD_MEM_ENCRYPT) @@ -1120,12 +1125,12 @@ out: static void kvm_disable_host_haltpoll(void *i) { - wrmsrl(MSR_KVM_POLL_CONTROL, 0); + wrmsrq(MSR_KVM_POLL_CONTROL, 0); } static void kvm_enable_host_haltpoll(void *i) { - wrmsrl(MSR_KVM_POLL_CONTROL, 1); + wrmsrq(MSR_KVM_POLL_CONTROL, 1); } void arch_haltpoll_enable(unsigned int cpu) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 5b2c15214a6b..ca0a49eeac4a 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -60,7 +60,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu); */ static void kvm_get_wallclock(struct timespec64 *now) { - wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); + wrmsrq(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); preempt_disable(); pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); preempt_enable(); @@ -173,7 +173,7 @@ static void kvm_register_clock(char *txt) return; pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; - wrmsrl(msr_kvm_system_time, pa); + wrmsrq(msr_kvm_system_time, pa); pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); } @@ -196,7 +196,7 @@ static void kvm_setup_secondary_clock(void) void kvmclock_disable(void) { if (msr_kvm_system_time) - native_write_msr(msr_kvm_system_time, 0, 0); + native_write_msr(msr_kvm_system_time, 0); } static void __init kvmclock_init_mem(void) diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 1b373d79cedc..1f325304c4a8 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -42,7 +42,7 @@ static void load_segments(void) static void machine_kexec_free_page_tables(struct kimage *image) { - free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER); + free_pages((unsigned long)image->arch.pgd, pgd_allocation_order()); image->arch.pgd = NULL; #ifdef CONFIG_X86_PAE free_page((unsigned long)image->arch.pmd0); @@ -59,7 +59,7 @@ static void machine_kexec_free_page_tables(struct kimage *image) static int machine_kexec_alloc_page_tables(struct kimage *image) { image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - PGD_ALLOCATION_ORDER); + pgd_allocation_order()); #ifdef CONFIG_X86_PAE image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); @@ -160,15 +160,10 @@ void machine_kexec_cleanup(struct kimage *image) */ void machine_kexec(struct kimage *image) { + relocate_kernel_fn *relocate_kernel_ptr; unsigned long page_list[PAGES_NR]; void *control_page; int save_ftrace_enabled; - asmlinkage unsigned long - (*relocate_kernel_ptr)(unsigned long indirection_page, - unsigned long control_page, - unsigned long start_address, - unsigned int has_pae, - unsigned int preserve_context); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index b180d8e497c3..697fb99406e6 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -28,6 +28,7 @@ #include <asm/setup.h> #include <asm/set_memory.h> #include <asm/cpu.h> +#include <asm/efi.h> #ifdef CONFIG_ACPI /* @@ -75,6 +76,19 @@ map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; } #endif +static int map_mmio_serial(struct x86_mapping_info *info, pgd_t *level4p) +{ + unsigned long mstart, mend; + + if (!kexec_debug_8250_mmio32) + return 0; + + mstart = kexec_debug_8250_mmio32 & PAGE_MASK; + mend = (kexec_debug_8250_mmio32 + PAGE_SIZE + 23) & PAGE_MASK; + pr_info("Map PCI serial at %lx - %lx\n", mstart, mend); + return kernel_ident_mapping_init(info, level4p, mstart, mend); +} + #ifdef CONFIG_KEXEC_FILE const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_bzImage64_ops, @@ -87,6 +101,8 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) { #ifdef CONFIG_EFI unsigned long mstart, mend; + void *kaddr; + int ret; if (!efi_enabled(EFI_BOOT)) return 0; @@ -102,6 +118,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) if (!mstart) return 0; + ret = kernel_ident_mapping_init(info, level4p, mstart, mend); + if (ret) + return ret; + + kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB); + if (!kaddr) { + pr_err("Could not map UEFI system table\n"); + return -ENOMEM; + } + + mstart = efi_config_table; + + if (efi_enabled(EFI_64BIT)) { + efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr; + + mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables; + } else { + efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr; + + mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables; + } + + memunmap(kaddr); + return kernel_ident_mapping_init(info, level4p, mstart, mend); #endif return 0; @@ -119,7 +159,8 @@ static void free_transition_pgtable(struct kimage *image) image->arch.pte = NULL; } -static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) +static int init_transition_pgtable(struct kimage *image, pgd_t *pgd, + unsigned long control_page) { pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; unsigned long vaddr, paddr; @@ -129,8 +170,13 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) pmd_t *pmd; pte_t *pte; - vaddr = (unsigned long)relocate_kernel; - paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); + /* + * For the transition to the identity mapped page tables, the control + * code page also needs to be mapped at the virtual address it starts + * off running from. + */ + vaddr = (unsigned long)__va(control_page); + paddr = control_page; pgd += pgd_index(vaddr); if (!pgd_present(*pgd)) { p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); @@ -189,7 +235,7 @@ static void *alloc_pgt_page(void *data) return p; } -static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +static int init_pgtable(struct kimage *image, unsigned long control_page) { struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, @@ -198,12 +244,12 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) .kernpg_flag = _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; - pgd_t *level4p; int result; int i; - level4p = (pgd_t *)__va(start_pgtable); - clear_page(level4p); + image->arch.pgd = alloc_pgt_page(image); + if (!image->arch.pgd) + return -ENOMEM; if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { info.page_flag |= _PAGE_ENC; @@ -217,8 +263,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; - result = kernel_ident_mapping_init(&info, - level4p, mstart, mend); + result = kernel_ident_mapping_init(&info, image->arch.pgd, + mstart, mend); if (result) return result; } @@ -233,8 +279,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; - result = kernel_ident_mapping_init(&info, - level4p, mstart, mend); + result = kernel_ident_mapping_init(&info, image->arch.pgd, + mstart, mend); if (result) return result; @@ -244,15 +290,23 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) * Prepare EFI systab and ACPI tables for kexec kernel since they are * not covered by pfn_mapped. */ - result = map_efi_systab(&info, level4p); + result = map_efi_systab(&info, image->arch.pgd); if (result) return result; - result = map_acpi_tables(&info, level4p); + result = map_acpi_tables(&info, image->arch.pgd); if (result) return result; - return init_transition_pgtable(image, level4p); + result = map_mmio_serial(&info, image->arch.pgd); + if (result) + return result; + + /* + * This must be last because the intermediate page table pages it + * allocates will not be control pages and may overlap the image. + */ + return init_transition_pgtable(image, image->arch.pgd, control_page); } static void load_segments(void) @@ -267,24 +321,58 @@ static void load_segments(void) ); } +static void prepare_debug_idt(unsigned long control_page, unsigned long vec_ofs) +{ + gate_desc idtentry = { 0 }; + int i; + + idtentry.bits.p = 1; + idtentry.bits.type = GATE_TRAP; + idtentry.segment = __KERNEL_CS; + idtentry.offset_low = (control_page & 0xFFFF) + vec_ofs; + idtentry.offset_middle = (control_page >> 16) & 0xFFFF; + idtentry.offset_high = control_page >> 32; + + for (i = 0; i < 16; i++) { + kexec_debug_idt[i] = idtentry; + idtentry.offset_low += KEXEC_DEBUG_EXC_HANDLER_SIZE; + } +} + int machine_kexec_prepare(struct kimage *image) { - unsigned long start_pgtable; + void *control_page = page_address(image->control_code_page); + unsigned long reloc_start = (unsigned long)__relocate_kernel_start; + unsigned long reloc_end = (unsigned long)__relocate_kernel_end; int result; - /* Calculate the offsets */ - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - /* Setup the identity mapped 64bit page table */ - result = init_pgtable(image, start_pgtable); + result = init_pgtable(image, __pa(control_page)); if (result) return result; + kexec_va_control_page = (unsigned long)control_page; + kexec_pa_table_page = (unsigned long)__pa(image->arch.pgd); + + if (image->type == KEXEC_TYPE_DEFAULT) + kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT; + + prepare_debug_idt((unsigned long)__pa(control_page), + (unsigned long)kexec_debug_exc_vectors - reloc_start); + + __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start); + + set_memory_rox((unsigned long)control_page, 1); return 0; } void machine_kexec_cleanup(struct kimage *image) { + void *control_page = page_address(image->control_code_page); + + set_memory_nx((unsigned long)control_page, 1); + set_memory_rw((unsigned long)control_page, 1); + free_transition_pgtable(image); } @@ -292,11 +380,19 @@ void machine_kexec_cleanup(struct kimage *image) * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -void machine_kexec(struct kimage *image) +void __nocfi machine_kexec(struct kimage *image) { - unsigned long page_list[PAGES_NR]; - void *control_page; + unsigned long reloc_start = (unsigned long)__relocate_kernel_start; + relocate_kernel_fn *relocate_kernel_ptr; + unsigned int host_mem_enc_active; int save_ftrace_enabled; + void *control_page; + + /* + * This must be done before load_segments() since if call depth tracking + * is used then GS must be valid to make any function calls. + */ + host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -323,17 +419,13 @@ void machine_kexec(struct kimage *image) #endif } - control_page = page_address(image->control_code_page) + PAGE_SIZE; - __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + control_page = page_address(image->control_code_page); - page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); - page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; - page_list[PA_TABLE_PAGE] = - (unsigned long)__pa(page_address(image->control_code_page)); - - if (image->type == KEXEC_TYPE_DEFAULT) - page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) - << PAGE_SHIFT); + /* + * Allow for the possibility that relocate_kernel might not be at + * the very start of the page. + */ + relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start; /* * The segment registers are funny things, they have both a @@ -342,23 +434,17 @@ void machine_kexec(struct kimage *image) * with from a table in memory. At no other time is the * descriptor table in memory accessed. * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. + * Take advantage of this here by force loading the segments, + * before the GDT is zapped with an invalid value. */ load_segments(); - /* - * The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - native_idt_invalidate(); - native_gdt_invalidate(); /* now call it */ - image->start = relocate_kernel((unsigned long)image->head, - (unsigned long)page_list, - image->start, - image->preserve_context, - cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)); + image->start = relocate_kernel_ptr((unsigned long)image->head, + virt_to_phys(control_page), + image->start, + image->preserve_context, + host_mem_enc_active); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) @@ -539,19 +625,40 @@ static void kexec_mark_crashkres(bool protect) /* Don't touch the control code page used in crash_kexec().*/ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); - /* Control code page is located in the 2nd page. */ - kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect); + kexec_mark_range(crashk_res.start, control - 1, protect); control += KEXEC_CONTROL_PAGE_SIZE; kexec_mark_range(control, crashk_res.end, protect); } +/* make the memory storing dm crypt keys in/accessible */ +static void kexec_mark_dm_crypt_keys(bool protect) +{ + unsigned long start_paddr, end_paddr; + unsigned int nr_pages; + + if (kexec_crash_image->dm_crypt_keys_addr) { + start_paddr = kexec_crash_image->dm_crypt_keys_addr; + end_paddr = start_paddr + kexec_crash_image->dm_crypt_keys_sz - 1; + nr_pages = (PAGE_ALIGN(end_paddr) - PAGE_ALIGN_DOWN(start_paddr))/PAGE_SIZE; + if (protect) + set_memory_np((unsigned long)phys_to_virt(start_paddr), nr_pages); + else + __set_memory_prot( + (unsigned long)phys_to_virt(start_paddr), + nr_pages, + __pgprot(_PAGE_PRESENT | _PAGE_NX | _PAGE_RW)); + } +} + void arch_kexec_protect_crashkres(void) { kexec_mark_crashkres(true); + kexec_mark_dm_crypt_keys(true); } void arch_kexec_unprotect_crashkres(void) { + kexec_mark_dm_crypt_keys(false); kexec_mark_crashkres(false); } #endif diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index c94dec6a1834..ef6104e7cc72 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -9,6 +9,7 @@ #include <linux/pci.h> #include <linux/dmi.h> #include <linux/range.h> +#include <linux/acpi.h> #include <asm/pci-direct.h> #include <linux/sort.h> @@ -96,7 +97,7 @@ static void get_fam10h_pci_mmconf_base(void) /* SYS_CFG */ address = MSR_AMD64_SYSCFG; - rdmsrl(address, val); + rdmsrq(address, val); /* TOP_MEM2 is not enabled? */ if (!(val & (1<<21))) { @@ -104,7 +105,7 @@ static void get_fam10h_pci_mmconf_base(void) } else { /* TOP_MEM2 */ address = MSR_K8_TOP_MEM2; - rdmsrl(address, val); + rdmsrq(address, val); tom2 = max(val & 0xffffff800000ULL, 1ULL << 32); } @@ -176,7 +177,7 @@ void fam10h_check_enable_mmcfg(void) return; address = MSR_FAM10H_MMIO_CONF_BASE; - rdmsrl(address, val); + rdmsrq(address, val); /* try to make sure that AP's setting is identical to BSP setting */ if (val & FAM10H_MMIO_CONF_ENABLE) { @@ -211,7 +212,7 @@ void fam10h_check_enable_mmcfg(void) (FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT)); val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | FAM10H_MMIO_CONF_ENABLE; - wrmsrl(address, val); + wrmsrq(address, val); } static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e18914c0e38a..0ffbae902e2f 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -19,6 +19,7 @@ #include <linux/jump_label.h> #include <linux/random.h> #include <linux/memory.h> +#include <linux/stackprotector.h> #include <asm/text-patching.h> #include <asm/page.h> @@ -36,57 +37,6 @@ do { \ } while (0) #endif -#ifdef CONFIG_RANDOMIZE_BASE -static unsigned long module_load_offset; - -/* Mutex protects the module_load_offset. */ -static DEFINE_MUTEX(module_kaslr_mutex); - -static unsigned long int get_module_load_offset(void) -{ - if (kaslr_enabled()) { - mutex_lock(&module_kaslr_mutex); - /* - * Calculate the module_load_offset the first time this - * code is called. Once calculated it stays the same until - * reboot. - */ - if (module_load_offset == 0) - module_load_offset = - get_random_u32_inclusive(1, 1024) * PAGE_SIZE; - mutex_unlock(&module_kaslr_mutex); - } - return module_load_offset; -} -#else -static unsigned long int get_module_load_offset(void) -{ - return 0; -} -#endif - -void *module_alloc(unsigned long size) -{ - gfp_t gfp_mask = GFP_KERNEL; - void *p; - - if (PAGE_ALIGN(size) > MODULES_LEN) - return NULL; - - p = __vmalloc_node_range(size, MODULE_ALIGN, - MODULES_VADDR + get_module_load_offset(), - MODULES_END, gfp_mask, PAGE_KERNEL, - VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, - NUMA_NO_NODE, __builtin_return_address(0)); - - if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { - vfree(p); - return NULL; - } - - return p; -} - #ifdef CONFIG_X86_32 int apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, @@ -181,6 +131,20 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, goto overflow; size = 4; break; +#if defined(CONFIG_STACKPROTECTOR) && \ + defined(CONFIG_CC_IS_CLANG) && CONFIG_CLANG_VERSION < 170000 + case R_X86_64_REX_GOTPCRELX: { + static unsigned long __percpu *const addr = &__stack_chk_guard; + + if (sym->st_value != (u64)addr) { + pr_err("%s: Unsupported GOTPCREL relocation\n", me->name); + return -ENOEXEC; + } + + val = (u64)&addr + rel[i].r_addend; + fallthrough; + } +#endif case R_X86_64_PC32: case R_X86_64_PLT32: val -= (u64)loc; @@ -242,7 +206,7 @@ static int write_relocate_add(Elf64_Shdr *sechdrs, write, apply); if (!early) { - text_poke_sync(); + smp_text_poke_sync_each_cpu(); mutex_unlock(&text_mutex); } @@ -302,6 +266,8 @@ int module_finalize(const Elf_Ehdr *hdr, ibt_endbr = s; } + its_init_mod(me); + if (retpolines || cfi) { void *rseg = NULL, *cseg = NULL; unsigned int rsize = 0, csize = 0; @@ -322,30 +288,26 @@ int module_finalize(const Elf_Ehdr *hdr, void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); } + + its_fini_mod(me); + if (returns) { void *rseg = (void *)returns->sh_addr; apply_returns(rseg, rseg + returns->sh_size); } - if (alt) { - /* patch .altinstructions */ - void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); - } - if (calls || alt) { + if (calls) { struct callthunk_sites cs = {}; - if (calls) { - cs.call_start = (void *)calls->sh_addr; - cs.call_end = (void *)calls->sh_addr + calls->sh_size; - } - - if (alt) { - cs.alt_start = (void *)alt->sh_addr; - cs.alt_end = (void *)alt->sh_addr + alt->sh_size; - } + cs.call_start = (void *)calls->sh_addr; + cs.call_end = (void *)calls->sh_addr + calls->sh_size; callthunks_patch_module_calls(&cs, me); } + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); @@ -369,4 +331,5 @@ int module_finalize(const Elf_Ehdr *hdr, void module_arch_cleanup(struct module *mod) { alternatives_smp_module_del(mod); + its_free_mod(mod); } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e89171b0347a..4a1b1b28abf9 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -68,7 +68,7 @@ static void __init mpc_oem_bus_info(struct mpc_bus *m, char *str) { memcpy(str, m->bustype, 6); str[6] = 0; - apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); + apic_pr_verbose("Bus #%d is %s\n", m->busid, str); } static void __init MP_bus_info(struct mpc_bus *m) @@ -417,7 +417,7 @@ static unsigned long __init get_mpc_size(unsigned long physptr) mpc = early_memremap(physptr, PAGE_SIZE); size = mpc->length; early_memunmap(mpc, PAGE_SIZE); - apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + apic_pr_verbose(" mpc: %lx-%lx\n", physptr, physptr + size); return size; } @@ -560,8 +560,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) struct mpf_intel *mpf; int ret = 0; - apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", - base, base + length - 1); + apic_pr_verbose("Scan for SMP in [mem %#010lx-%#010lx]\n", base, base + length - 1); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { @@ -683,13 +682,13 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) { int i; - apic_printk(APIC_VERBOSE, "OLD "); + apic_pr_verbose("OLD "); print_mp_irq_info(m); i = get_MP_intsrc_index(m); if (i > 0) { memcpy(m, &mp_irqs[i], sizeof(*m)); - apic_printk(APIC_VERBOSE, "NEW "); + apic_pr_verbose("NEW "); print_mp_irq_info(&mp_irqs[i]); return; } @@ -772,7 +771,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, continue; if (nr_m_spare > 0) { - apic_printk(APIC_VERBOSE, "*NEW* found\n"); + apic_pr_verbose("*NEW* found\n"); nr_m_spare--; memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i])); m_spare[nr_m_spare] = NULL; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 9a5b372c706f..be93ec7255bf 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -40,32 +40,29 @@ #define CREATE_TRACE_POINTS #include <trace/events/nmi.h> +/* + * An emergency handler can be set in any context including NMI + */ struct nmi_desc { raw_spinlock_t lock; + nmi_handler_t emerg_handler; struct list_head head; }; -static struct nmi_desc nmi_desc[NMI_MAX] = -{ - { - .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), - .head = LIST_HEAD_INIT(nmi_desc[0].head), - }, - { - .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), - .head = LIST_HEAD_INIT(nmi_desc[1].head), - }, - { - .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), - .head = LIST_HEAD_INIT(nmi_desc[2].head), - }, - { - .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), - .head = LIST_HEAD_INIT(nmi_desc[3].head), - }, +#define NMI_DESC_INIT(type) { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[type].lock), \ + .head = LIST_HEAD_INIT(nmi_desc[type].head), \ +} +static struct nmi_desc nmi_desc[NMI_MAX] = { + NMI_DESC_INIT(NMI_LOCAL), + NMI_DESC_INIT(NMI_UNKNOWN), + NMI_DESC_INIT(NMI_SERR), + NMI_DESC_INIT(NMI_IO_CHECK), }; +#define nmi_to_desc(type) (&nmi_desc[type]) + struct nmi_stats { unsigned int normal; unsigned int unknown; @@ -87,6 +84,9 @@ static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); static int ignore_nmis __read_mostly; int unknown_nmi_panic; +int panic_on_unrecovered_nmi; +int panic_on_io_nmi; + /* * Prevent NMI reason port (0x61) being accessed simultaneously, can * only be used in NMI handler. @@ -100,8 +100,6 @@ static int __init setup_unknown_nmi_panic(char *str) } __setup("unknown_nmi_panic", setup_unknown_nmi_panic); -#define nmi_to_desc(type) (&nmi_desc[type]) - static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC; static int __init nmi_warning_debugfs(void) @@ -121,20 +119,33 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration) action->max_duration = duration; - remainder_ns = do_div(duration, (1000 * 1000)); - decimal_msecs = remainder_ns / 1000; + /* Convert duration from nsec to msec */ + remainder_ns = do_div(duration, NSEC_PER_MSEC); + decimal_msecs = remainder_ns / NSEC_PER_USEC; - printk_ratelimited(KERN_INFO - "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", - action->handler, duration, decimal_msecs); + pr_info_ratelimited("INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", + action->handler, duration, decimal_msecs); } static int nmi_handle(unsigned int type, struct pt_regs *regs) { struct nmi_desc *desc = nmi_to_desc(type); + nmi_handler_t ehandler; struct nmiaction *a; int handled=0; + /* + * Call the emergency handler, if set + * + * In the case of crash_nmi_callback() emergency handler, it will + * return in the case of the crashing CPU to enable it to complete + * other necessary crashing actions ASAP. Other handlers in the + * linked list won't need to be run. + */ + ehandler = desc->emerg_handler; + if (ehandler) + return ehandler(type, regs); + rcu_read_lock(); /* @@ -224,6 +235,31 @@ void unregister_nmi_handler(unsigned int type, const char *name) } EXPORT_SYMBOL_GPL(unregister_nmi_handler); +/** + * set_emergency_nmi_handler - Set emergency handler + * @type: NMI type + * @handler: the emergency handler to be stored + * + * Set an emergency NMI handler which, if set, will preempt all the other + * handlers in the linked list. If a NULL handler is passed in, it will clear + * it. It is expected that concurrent calls to this function will not happen + * or the system is screwed beyond repair. + */ +void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler) +{ + struct nmi_desc *desc = nmi_to_desc(type); + + if (WARN_ON_ONCE(desc->emerg_handler == handler)) + return; + desc->emerg_handler = handler; + + /* + * Ensure the emergency handler is visible to other CPUs before + * function return + */ + smp_wmb(); +} + static void pci_serr_error(unsigned char reason, struct pt_regs *regs) { @@ -291,10 +327,9 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) int handled; /* - * Use 'false' as back-to-back NMIs are dealt with one level up. - * Of course this makes having multiple 'unknown' handlers useless - * as only the first one is ever run (unless it can actually determine - * if it caused the NMI) + * As a last resort, let the "unknown" handlers make a + * best-effort attempt to figure out if they can claim + * responsibility for this Unknown NMI. */ handled = nmi_handle(NMI_UNKNOWN, regs); if (handled) { @@ -324,17 +359,18 @@ static noinstr void default_do_nmi(struct pt_regs *regs) bool b2b = false; /* - * CPU-specific NMI must be processed before non-CPU-specific - * NMI, otherwise we may lose it, because the CPU-specific - * NMI can not be detected/processed on other CPUs. - */ - - /* - * Back-to-back NMIs are interesting because they can either - * be two NMI or more than two NMIs (any thing over two is dropped - * due to NMI being edge-triggered). If this is the second half - * of the back-to-back NMI, assume we dropped things and process - * more handlers. Otherwise reset the 'swallow' NMI behaviour + * Back-to-back NMIs are detected by comparing the RIP of the + * current NMI with that of the previous NMI. If it is the same, + * it is assumed that the CPU did not have a chance to jump back + * into a non-NMI context and execute code in between the two + * NMIs. + * + * They are interesting because even if there are more than two, + * only a maximum of two can be detected (anything over two is + * dropped due to NMI being edge-triggered). If this is the + * second half of the back-to-back NMI, assume we dropped things + * and process more handlers. Otherwise, reset the 'swallow' NMI + * behavior. */ if (regs->ip == __this_cpu_read(last_nmi_rip)) b2b = true; @@ -348,6 +384,11 @@ static noinstr void default_do_nmi(struct pt_regs *regs) if (microcode_nmi_handler_enabled() && microcode_nmi_handler()) goto out; + /* + * CPU-specific NMI must be processed before non-CPU-specific + * NMI, otherwise we may lose it, because the CPU-specific + * NMI can not be detected/processed on other CPUs. + */ handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); if (handled) { @@ -384,13 +425,14 @@ static noinstr void default_do_nmi(struct pt_regs *regs) pci_serr_error(reason, regs); else if (reason & NMI_REASON_IOCHK) io_check_error(reason, regs); -#ifdef CONFIG_X86_32 + /* * Reassert NMI in case it became active * meanwhile as it's edge-triggered: */ - reassert_nmi(); -#endif + if (IS_ENABLED(CONFIG_X86_32)) + reassert_nmi(); + __this_cpu_add(nmi_stats.external, 1); raw_spin_unlock(&nmi_reason_lock); goto out; @@ -580,7 +622,7 @@ EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx); static char *nmi_check_stall_msg[] = { /* */ -/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler. */ +/* +--------- nmi_seq & 0x1: CPU is currently in NMI handler. */ /* | +------ cpu_is_offline(cpu) */ /* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */ /* | | | NMI handler has been invoked. */ @@ -628,22 +670,26 @@ void nmi_backtrace_stall_check(const struct cpumask *btp) nmi_seq = READ_ONCE(nsp->idt_nmi_seq); if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) { msgp = "CPU entered NMI handler function, but has not exited"; - } else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) { - msgp = "CPU is handling NMIs"; - } else { - idx = ((nsp->idt_seq_snap & 0x1) << 2) | + } else if (nsp->idt_nmi_seq_snap == nmi_seq || + nsp->idt_nmi_seq_snap + 1 == nmi_seq) { + idx = ((nmi_seq & 0x1) << 2) | (cpu_is_offline(cpu) << 1) | (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls)); msgp = nmi_check_stall_msg[idx]; if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1)) modp = ", but OK because ignore_nmis was set"; - if (nmi_seq & 0x1) - msghp = " (CPU currently in NMI handler function)"; - else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq) + if (nsp->idt_nmi_seq_snap + 1 == nmi_seq) msghp = " (CPU exited one NMI handler function)"; + else if (nmi_seq & 0x1) + msghp = " (CPU currently in NMI handler function)"; + else + msghp = " (CPU was never in an NMI handler function)"; + } else { + msgp = "CPU is handling NMIs"; } - pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n", - __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies)); + pr_alert("%s: CPU %d: %s%s%s\n", __func__, cpu, msgp, modp, msghp); + pr_alert("%s: last activity: %lu jiffies ago.\n", + __func__, j - READ_ONCE(nsp->recv_jiffies)); } } @@ -705,4 +751,3 @@ void local_touch_nmi(void) { __this_cpu_write(last_nmi_rip, 0); } -EXPORT_SYMBOL_GPL(local_touch_nmi); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index e93a8545c74d..a010e9d062bf 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * arch/x86/kernel/nmi-selftest.c - * * Testsuite for NMI: IPIs * * Started by Don Zickus: @@ -30,7 +28,6 @@ static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata; static int __initdata testcase_total; static int __initdata testcase_successes; -static int __initdata expected_testcase_failures; static int __initdata unexpected_testcase_failures; static int __initdata unexpected_testcase_unknowns; @@ -120,26 +117,22 @@ static void __init dotest(void (*testcase_fn)(void), int expected) unexpected_testcase_failures++; if (nmi_fail == FAILURE) - printk(KERN_CONT "FAILED |"); + pr_cont("FAILED |"); else if (nmi_fail == TIMEOUT) - printk(KERN_CONT "TIMEOUT|"); + pr_cont("TIMEOUT|"); else - printk(KERN_CONT "ERROR |"); + pr_cont("ERROR |"); dump_stack(); } else { testcase_successes++; - printk(KERN_CONT " ok |"); + pr_cont(" ok |"); } - testcase_total++; + pr_cont("\n"); + testcase_total++; reset_nmi(); } -static inline void __init print_testname(const char *testname) -{ - printk("%12s:", testname); -} - void __init nmi_selftest(void) { init_nmi_testsuite(); @@ -147,38 +140,25 @@ void __init nmi_selftest(void) /* * Run the testsuite: */ - printk("----------------\n"); - printk("| NMI testsuite:\n"); - printk("--------------------\n"); + pr_info("----------------\n"); + pr_info("| NMI testsuite:\n"); + pr_info("--------------------\n"); - print_testname("remote IPI"); + pr_info("%12s:", "remote IPI"); dotest(remote_ipi, SUCCESS); - printk(KERN_CONT "\n"); - print_testname("local IPI"); + + pr_info("%12s:", "local IPI"); dotest(local_ipi, SUCCESS); - printk(KERN_CONT "\n"); cleanup_nmi_testsuite(); + pr_info("--------------------\n"); if (unexpected_testcase_failures) { - printk("--------------------\n"); - printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", + pr_info("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", unexpected_testcase_failures, testcase_total); - printk("-----------------------------------------------------------------\n"); - } else if (expected_testcase_failures && testcase_successes) { - printk("--------------------\n"); - printk("%3d out of %3d testcases failed, as expected. |\n", - expected_testcase_failures, testcase_total); - printk("----------------------------------------------------\n"); - } else if (expected_testcase_failures && !testcase_successes) { - printk("--------------------\n"); - printk("All %3d testcases failed, as expected. |\n", - expected_testcase_failures); - printk("----------------------------------------\n"); } else { - printk("--------------------\n"); - printk("Good, all %3d testcases passed! |\n", + pr_info("Good, all %3d testcases passed! |\n", testcase_successes); - printk("---------------------------------\n"); } + pr_info("-----------------------------------------------------------------\n"); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 5358d43886ad..ab3e172dcc69 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -33,6 +33,7 @@ #include <asm/tlb.h> #include <asm/io_bitmap.h> #include <asm/gsseg.h> +#include <asm/msr.h> /* stub always returning 0. */ DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text); @@ -51,18 +52,12 @@ DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text); DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif -DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); +DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); void __init native_pv_lock_init(void) { - if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) && - !boot_cpu_has(X86_FEATURE_HYPERVISOR)) - static_branch_disable(&virt_spin_lock_key); -} - -static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - tlb_remove_page(tlb, table); + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + static_branch_enable(&virt_spin_lock_key); } struct static_key paravirt_steal_enabled; @@ -81,24 +76,9 @@ void paravirt_set_sched_clock(u64 (*func)(void)) static_call_update(pv_sched_clock, func); } -/* These are in entry.S */ -static struct resource reserve_ioports = { - .start = 0, - .end = IO_SPACE_LIMIT, - .name = "paravirt-ioport", - .flags = IORESOURCE_IO | IORESOURCE_BUSY, -}; - -/* - * Reserve the whole legacy IO space to prevent any legacy drivers - * from wasting time probing for their hardware. This is a fairly - * brute-force approach to disabling all non-virtual drivers. - * - * Note that this must be called very early to have any effect. - */ -int paravirt_disable_iospace(void) +static noinstr void pv_native_safe_halt(void) { - return request_resource(&ioport_resource, &reserve_ioports); + native_safe_halt(); } #ifdef CONFIG_PARAVIRT_XXL @@ -107,24 +87,24 @@ static noinstr void pv_native_write_cr2(unsigned long val) native_write_cr2(val); } -static noinstr unsigned long pv_native_get_debugreg(int regno) +static noinstr unsigned long pv_native_read_cr3(void) { - return native_get_debugreg(regno); + return __native_read_cr3(); } -static noinstr void pv_native_set_debugreg(int regno, unsigned long val) +static noinstr void pv_native_write_cr3(unsigned long cr3) { - native_set_debugreg(regno, val); + native_write_cr3(cr3); } -noinstr void pv_native_wbinvd(void) +static noinstr unsigned long pv_native_get_debugreg(int regno) { - native_wbinvd(); + return native_get_debugreg(regno); } -static noinstr void pv_native_safe_halt(void) +static noinstr void pv_native_set_debugreg(int regno, unsigned long val) { - native_safe_halt(); + native_set_debugreg(regno, val); } #endif @@ -149,7 +129,6 @@ struct paravirt_patch_template pv_ops = { .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, - .cpu.wbinvd = pv_native_wbinvd, .cpu.read_msr = native_read_msr, .cpu.write_msr = native_write_msr, .cpu.read_msr_safe = native_read_msr_safe, @@ -183,16 +162,17 @@ struct paravirt_patch_template pv_ops = { .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), +#endif /* CONFIG_PARAVIRT_XXL */ + + /* Irq HLT ops. */ .irq.safe_halt = pv_native_safe_halt, .irq.halt = native_halt, -#endif /* CONFIG_PARAVIRT_XXL */ /* Mmu ops. */ .mmu.flush_tlb_user = native_flush_tlb_local, .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, - .mmu.tlb_remove_table = native_tlb_remove_table, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, @@ -200,8 +180,8 @@ struct paravirt_patch_template pv_ops = { #ifdef CONFIG_PARAVIRT_XXL .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), .mmu.write_cr2 = pv_native_write_cr2, - .mmu.read_cr3 = __native_read_cr3, - .mmu.write_cr3 = native_write_cr3, + .mmu.read_cr3 = pv_native_read_cr3, + .mmu.write_cr3 = pv_native_write_cr3, .mmu.pgd_alloc = __paravirt_pgd_alloc, .mmu.pgd_free = paravirt_nop, @@ -231,12 +211,10 @@ struct paravirt_patch_template pv_ops = { .mmu.set_p4d = native_set_p4d, -#if CONFIG_PGTABLE_LEVELS >= 5 .mmu.p4d_val = PTE_IDENT, .mmu.make_p4d = PTE_IDENT, .mmu.set_pgd = native_set_pgd, -#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ .mmu.pte_val = PTE_IDENT, .mmu.pgd_val = PTE_IDENT, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f323d83e40a7..6267363e0189 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -108,10 +108,6 @@ void __init pci_iommu_alloc(void) swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags); } -/* - * See <Documentation/arch/x86/x86_64/boot-options.rst> for the iommu kernel - * parameter documentation. - */ static __init int iommu_setup(char *p) { iommu_merge = 1; diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 319fef37d9dc..cc2c34ba7228 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -203,16 +203,6 @@ void __init probe_roms(void) unsigned char c; int i; - /* - * The ROM memory range is not part of the e820 table and is therefore not - * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted - * memory, and SNP requires encrypted memory to be validated before access. - * Do that here. - */ - snp_prep_memory(video_rom_resource.start, - ((system_rom_resource.end + 1) - video_rom_resource.start), - SNP_PAGE_STATE_PRIVATE); - /* video rom */ upper = adapter_rom_resources[0].start; for (start = video_rom_resource.start; start < upper; start += 2048) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b8441147eb5e..704883c21f3a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -30,6 +30,7 @@ #include <linux/hw_breakpoint.h> #include <linux/entry-common.h> #include <asm/cpu.h> +#include <asm/cpuid/api.h> #include <asm/apic.h> #include <linux/uaccess.h> #include <asm/mwait.h> @@ -51,6 +52,7 @@ #include <asm/unwind.h> #include <asm/tdx.h> #include <asm/mmu_context.h> +#include <asm/msr.h> #include <asm/shstk.h> #include "process.h" @@ -92,12 +94,12 @@ EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - memcpy(dst, src, arch_task_struct_size); + /* fpu_clone() will initialize the "dst_fpu" memory */ + memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(*dst), 0); + #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif - /* Drop the copied pointer to current's fpstate */ - dst->thread.fpu.fpstate = NULL; return 0; } @@ -105,8 +107,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_X86_64 void arch_release_task_struct(struct task_struct *tsk) { - if (fpu_state_size_dynamic()) - fpstate_free(&tsk->thread.fpu); + if (fpu_state_size_dynamic() && !(tsk->flags & (PF_KTHREAD | PF_USER_WORKER))) + fpstate_free(x86_task_fpu(tsk)); } #endif @@ -116,7 +118,6 @@ void arch_release_task_struct(struct task_struct *tsk) void exit_thread(struct task_struct *tsk) { struct thread_struct *t = &tsk->thread; - struct fpu *fpu = &t->fpu; if (test_thread_flag(TIF_IO_BITMAP)) io_bitmap_exit(tsk); @@ -124,7 +125,7 @@ void exit_thread(struct task_struct *tsk) free_vm86(t); shstk_free(tsk); - fpu__drop(fpu); + fpu__drop(tsk); } static int set_new_tls(struct task_struct *p, unsigned long tls) @@ -175,6 +176,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame->ret_addr = (unsigned long) ret_from_fork_asm; p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap = NULL; + clear_tsk_thread_flag(p, TIF_IO_BITMAP); p->thread.iopl_warn = 0; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); @@ -338,7 +340,7 @@ static void set_cpuid_faulting(bool on) msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); this_cpu_write(msr_misc_features_shadow, msrval); - wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); + wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval); } static void disable_cpuid(void) @@ -463,6 +465,11 @@ void native_tss_update_io_bitmap(void) } else { struct io_bitmap *iobm = t->io_bitmap; + if (WARN_ON_ONCE(!iobm)) { + clear_thread_flag(TIF_IO_BITMAP); + native_tss_invalidate_io_bitmap(); + } + /* * Only copy bitmap data when the sequence number differs. The * update time is accounted to the incoming task. @@ -555,7 +562,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) if (!static_cpu_has(X86_FEATURE_ZEN)) { msr |= ssbd_tif_to_amd_ls_cfg(tifn); - wrmsrl(MSR_AMD64_LS_CFG, msr); + wrmsrq(MSR_AMD64_LS_CFG, msr); return; } @@ -572,7 +579,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) raw_spin_lock(&st->shared_state->lock); /* First sibling enables SSBD: */ if (!st->shared_state->disable_state) - wrmsrl(MSR_AMD64_LS_CFG, msr); + wrmsrq(MSR_AMD64_LS_CFG, msr); st->shared_state->disable_state++; raw_spin_unlock(&st->shared_state->lock); } else { @@ -582,7 +589,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) raw_spin_lock(&st->shared_state->lock); st->shared_state->disable_state--; if (!st->shared_state->disable_state) - wrmsrl(MSR_AMD64_LS_CFG, msr); + wrmsrq(MSR_AMD64_LS_CFG, msr); raw_spin_unlock(&st->shared_state->lock); } } @@ -591,7 +598,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) { u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); - wrmsrl(MSR_AMD64_LS_CFG, msr); + wrmsrq(MSR_AMD64_LS_CFG, msr); } #endif @@ -601,7 +608,7 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, * so ssbd_tif_to_spec_ctrl() just works. */ - wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); + wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); } /* @@ -704,11 +711,11 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) arch_has_block_step()) { unsigned long debugctl, msk; - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~DEBUGCTLMSR_BTF; msk = tifn & _TIF_BLOCKSTEP; debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } if ((tifp ^ tifn) & _TIF_NOTSC) @@ -825,7 +832,7 @@ void __noreturn stop_this_cpu(void *dummy) * X86_FEATURE_SME due to cmdline options. */ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) - native_wbinvd(); + wbinvd(); /* * This brings a cache line back and dirties it, but @@ -835,11 +842,18 @@ void __noreturn stop_this_cpu(void *dummy) */ cpumask_clear_cpu(cpu, &cpus_stop_mask); +#ifdef CONFIG_SMP + if (smp_ops.stop_this_cpu) { + smp_ops.stop_this_cpu(); + BUG(); + } +#endif + for (;;) { /* * Use native_halt() so that memory contents don't change * (stack usage and variables) after possibly issuing the - * native_wbinvd() above. + * wbinvd() above. */ native_halt(); } @@ -870,7 +884,7 @@ static __init bool prefer_mwait_c1_over_halt(void) if (boot_cpu_has_bug(X86_BUG_MONITOR) || boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) return false; - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx); /* * If MWAIT extensions are not available, it is safe to use MWAIT @@ -894,13 +908,10 @@ static __init bool prefer_mwait_c1_over_halt(void) static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { - if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { - mb(); /* quirk */ - clflush((void *)¤t_thread_info()->flags); - mb(); /* quirk */ - } + const void *addr = ¤t_thread_info()->flags; - __monitor((void *)¤t_thread_info()->flags, 0, 0); + alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr)); + __monitor(addr, 0, 0); if (!need_resched()) { __sti_mwait(0, 0); raw_local_irq_disable(); @@ -926,7 +937,7 @@ void __init select_idle_routine(void) static_call_update(x86_idle, mwait_idle); } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { pr_info("using TDX aware idle routine\n"); - static_call_update(x86_idle, tdx_safe_halt); + static_call_update(x86_idle, tdx_halt); } else { static_call_update(x86_idle, default_idle); } @@ -1035,7 +1046,7 @@ unsigned long __get_wchan(struct task_struct *p) return addr; } -long do_arch_prctl_common(int option, unsigned long arg2) +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { switch (option) { case ARCH_GET_CPUID: @@ -1050,5 +1061,13 @@ long do_arch_prctl_common(int option, unsigned long arg2) return fpu_xstate_prctl(option, arg2); } + if (!in_ia32_syscall()) + return do_arch_prctl_64(current, option, arg2); + return -EINVAL; } + +SYSCALL_DEFINE0(ni_syscall) +{ + return -ENOSYS; +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0917c7f25720..3ef15c2f152f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, /* Only print out debug registers if they are in their non-default state. */ if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && - (d6 == DR6_RESERVED) && (d7 == 0x400)) + (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1)) return; printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", @@ -160,8 +160,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_p, cpu); + switch_fpu(prev_p, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -190,13 +189,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0 and pcpu_hot.top_of_stack. This changes + * Reload esp0 and cpu_current_top_of_stack. This changes * current_thread_info(). Refresh the SYSENTER configuration in * case prev or next is vm86. */ update_task_stack(next_p); refresh_sysenter_cs(next); - this_cpu_write(pcpu_hot.top_of_stack, + this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); @@ -206,17 +205,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) loadsegment(gs, next->gs); - raw_cpu_write(pcpu_hot.current_task, next_p); - - switch_fpu_finish(next_p); + raw_cpu_write(current_task, next_p); /* Load the Intel cache allocation PQR MSR. */ - resctrl_sched_in(next_p); + resctrl_arch_sched_in(next_p); return prev_p; } - -SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) -{ - return do_arch_prctl_common(option, arg2); -} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7062b84dd467..b972bf72fb8b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -57,6 +57,7 @@ #include <asm/unistd.h> #include <asm/fsgsbase.h> #include <asm/fred.h> +#include <asm/msr.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include <asm/unistd_32_ia32.h> @@ -95,8 +96,8 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, return; if (mode == SHOW_REGS_USER) { - rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + rdmsrq(MSR_FS_BASE, fs); + rdmsrq(MSR_KERNEL_GS_BASE, shadowgs); printk("%sFS: %016lx GS: %016lx\n", log_lvl, fs, shadowgs); return; @@ -107,9 +108,9 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); - rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_GS_BASE, gs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + rdmsrq(MSR_FS_BASE, fs); + rdmsrq(MSR_GS_BASE, gs); + rdmsrq(MSR_KERNEL_GS_BASE, shadowgs); cr0 = read_cr0(); cr2 = read_cr2(); @@ -132,14 +133,14 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, /* Only print out debug registers if they are in their non-default state. */ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && - (d6 == DR6_RESERVED) && (d7 == 0x400))) { + (d6 == DR6_RESERVED) && (d7 == DR7_FIXED_1))) { printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", log_lvl, d0, d1, d2); printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", log_lvl, d3, d6, d7); } - if (cpu_feature_enabled(X86_FEATURE_OSPKE)) + if (cr4 & X86_CR4_PKE) printk("%sPKRU: %08x\n", log_lvl, read_pkru()); } @@ -195,7 +196,7 @@ static noinstr unsigned long __rdgsbase_inactive(void) native_swapgs(); } else { instrumentation_begin(); - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + rdmsrq(MSR_KERNEL_GS_BASE, gsbase); instrumentation_end(); } @@ -221,7 +222,7 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase) native_swapgs(); } else { instrumentation_begin(); - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + wrmsrq(MSR_KERNEL_GS_BASE, gsbase); instrumentation_end(); } } @@ -353,7 +354,7 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, } else { if (prev_index != next_index) loadseg(which, next_index); - wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + wrmsrq(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, next_base); } } else { @@ -463,7 +464,7 @@ unsigned long x86_gsbase_read_cpu_inactive(void) gsbase = __rdgsbase_inactive(); local_irq_restore(flags); } else { - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + rdmsrq(MSR_KERNEL_GS_BASE, gsbase); } return gsbase; @@ -478,7 +479,7 @@ void x86_gsbase_write_cpu_inactive(unsigned long gsbase) __wrgsbase_inactive(gsbase); local_irq_restore(flags); } else { - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + wrmsrq(MSR_KERNEL_GS_BASE, gsbase); } } @@ -614,10 +615,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && - this_cpu_read(pcpu_hot.hardirq_stack_inuse)); + this_cpu_read(hardirq_stack_inuse)); - if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_p, cpu); + switch_fpu(prev_p, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -668,10 +668,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - raw_cpu_write(pcpu_hot.current_task, next_p); - raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); - - switch_fpu_finish(next_p); + raw_cpu_write(current_task, next_p); + raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); /* Reload sp0. */ update_task_stack(next_p); @@ -707,7 +705,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) } /* Load the Intel cache allocation PQR MSR. */ - resctrl_sched_in(next_p); + resctrl_arch_sched_in(next_p); return prev_p; } @@ -798,6 +796,32 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) #define LAM_U57_BITS 6 +static void enable_lam_func(void *__mm) +{ + struct mm_struct *mm = __mm; + unsigned long lam; + + if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) { + lam = mm_lam_cr3_mask(mm); + write_cr3(__read_cr3() | lam); + cpu_tlbstate_update_lam(lam, mm_untag_mask(mm)); + } +} + +static void mm_enable_lam(struct mm_struct *mm) +{ + mm->context.lam_cr3_mask = X86_CR3_LAM_U57; + mm->context.untag_mask = ~GENMASK(62, 57); + + /* + * Even though the process must still be single-threaded at this + * point, kernel threads may be using the mm. IPI those kernel + * threads if they exist. + */ + on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true); + set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); +} + static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) { if (!cpu_feature_enabled(X86_FEATURE_LAM)) @@ -814,25 +838,21 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) if (mmap_write_lock_killable(mm)) return -EINTR; + /* + * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from + * being enabled unless the process is single threaded: + */ if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { mmap_write_unlock(mm); return -EBUSY; } - if (!nr_bits) { - mmap_write_unlock(mm); - return -EINVAL; - } else if (nr_bits <= LAM_U57_BITS) { - mm->context.lam_cr3_mask = X86_CR3_LAM_U57; - mm->context.untag_mask = ~GENMASK(62, 57); - } else { + if (!nr_bits || nr_bits > LAM_U57_BITS) { mmap_write_unlock(mm); return -EINVAL; } - write_cr3(__read_cr3() | mm->context.lam_cr3_mask); - set_tlbstate_lam_mode(mm); - set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); + mm_enable_lam(mm); mmap_write_unlock(mm); @@ -920,7 +940,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) case ARCH_MAP_VDSO_X32: return prctl_map_vdso(&vdso_image_x32, arg2); # endif -# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +# ifdef CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: return prctl_map_vdso(&vdso_image_32, arg2); # endif @@ -957,26 +977,3 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) return ret; } - -SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) -{ - long ret; - - ret = do_arch_prctl_64(current, option, arg2); - if (ret == -EINVAL) - ret = do_arch_prctl_common(option, arg2); - - return ret; -} - -#ifdef CONFIG_IA32_EMULATION -COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) -{ - return do_arch_prctl_common(option, arg2); -} -#endif - -unsigned long KSTK_ESP(struct task_struct *task) -{ - return task_pt_regs(task)->sp; -} diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 6d0df6a58873..a92f18db9610 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -10,6 +10,8 @@ #include <asm/setup.h> #include <asm/mce.h> +#include <linux/platform_data/x86/apple.h> + #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) static void quirk_intel_irqbalance(struct pci_dev *dev) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f3130f762784..964f6b0a3d68 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -12,6 +12,7 @@ #include <linux/delay.h> #include <linux/objtool.h> #include <linux/pgtable.h> +#include <linux/kexec.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -529,7 +530,7 @@ static inline void kb_wait(void) static inline void nmi_shootdown_cpus_on_restart(void); -#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD) +#if IS_ENABLED(CONFIG_KVM_X86) /* RCU-protected callback to disable virtualization prior to reboot. */ static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback; @@ -599,7 +600,7 @@ static void emergency_reboot_disable_virtualization(void) } #else static void emergency_reboot_disable_virtualization(void) { } -#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */ +#endif /* CONFIG_KVM_X86 */ void __attribute__((weak)) mach_reboot_fixups(void) { @@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void) void native_machine_shutdown(void) { + /* + * Call enc_kexec_begin() while all CPUs are still active and + * interrupts are enabled. This will allow all in-flight memory + * conversions to finish cleanly. + */ + if (kexec_in_progress) + x86_platform.guest.enc_kexec_begin(); + /* Stop the cpus and apics */ #ifdef CONFIG_X86_IO_APIC /* @@ -752,6 +761,9 @@ void native_machine_shutdown(void) #ifdef CONFIG_X86_64 x86_platform.iommu_shutdown(); #endif + + if (kexec_in_progress) + x86_platform.guest.enc_kexec_finish(); } static void __machine_emergency_restart(int emergency) @@ -868,6 +880,12 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) cpu_emergency_disable_virtualization(); atomic_dec(&waiting_for_crash_ipi); + + if (smp_ops.stop_this_cpu) { + smp_ops.stop_this_cpu(); + BUG(); + } + /* Assume hlt works */ halt(); for (;;) @@ -903,20 +921,16 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) return; /* Make a note of crashing cpu. Will be used in NMI callback. */ - crashing_cpu = safe_smp_processor_id(); + crashing_cpu = smp_processor_id(); shootdown_callback = callback; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - /* Would it be better to replace the trap vector here? */ - if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, - NMI_FLAG_FIRST, "crash")) - return; /* Return what? */ + /* - * Ensure the new callback function is set before sending - * out the NMI + * Set emergency handler to preempt other handlers. */ - wmb(); + set_emergency_nmi_handler(NMI_LOCAL, crash_nmi_callback); apic_send_IPI_allbutself(NMI_VECTOR); diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index b7c0f142d026..4679ac0a03eb 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -27,7 +27,7 @@ static void cs5530a_warm_reset(struct pci_dev *dev) static void cs5536_warm_reset(struct pci_dev *dev) { /* writing 1 to the LSB of this MSR causes a hard reset */ - wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL); + wrmsrq(MSR_DIVIL_SOFT_RESET, 1ULL); udelay(50); /* shouldn't get here but be safe and spin a while */ } diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index c7c4b1917336..57276f134d12 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -263,17 +263,17 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movl %edx, %edi movl $1024, %ecx - rep ; movsl + rep movsl movl %ebp, %edi movl %eax, %esi movl $1024, %ecx - rep ; movsl + rep movsl movl %eax, %edi movl %edx, %esi movl $1024, %ecx - rep ; movsl + rep movsl lea PAGE_SIZE(%ebp), %esi jmp 0b diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 56cab1bb25f5..ea604f4d0b52 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -5,12 +5,15 @@ */ #include <linux/linkage.h> +#include <linux/stringify.h> +#include <asm/alternative.h> #include <asm/page_types.h> #include <asm/kexec.h> #include <asm/processor-flags.h> #include <asm/pgtable_types.h> #include <asm/nospec-branch.h> #include <asm/unwind_hints.h> +#include <asm/asm-offsets.h> /* * Must be relocatable PIC code callable as a C function, in particular @@ -21,33 +24,47 @@ #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) /* - * control_page + KEXEC_CONTROL_CODE_MAX_SIZE - * ~ control_page + PAGE_SIZE are used as data storage and stack for - * jumping back + * The .text..relocate_kernel and .data..relocate_kernel sections are copied + * into the control page, and the remainder of the page is used as the stack. */ -#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + .section .data..relocate_kernel,"a"; /* Minimal CPU state */ -#define RSP DATA(0x0) -#define CR0 DATA(0x8) -#define CR3 DATA(0x10) -#define CR4 DATA(0x18) - -/* other data */ -#define CP_PA_TABLE_PAGE DATA(0x20) -#define CP_PA_SWAP_PAGE DATA(0x28) -#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) - - .text - .align PAGE_SIZE +SYM_DATA_LOCAL(saved_rsp, .quad 0) +SYM_DATA_LOCAL(saved_cr0, .quad 0) +SYM_DATA_LOCAL(saved_cr3, .quad 0) +SYM_DATA_LOCAL(saved_cr4, .quad 0) + /* other data */ +SYM_DATA(kexec_va_control_page, .quad 0) +SYM_DATA(kexec_pa_table_page, .quad 0) +SYM_DATA(kexec_pa_swap_page, .quad 0) +SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0) +SYM_DATA(kexec_debug_8250_mmio32, .quad 0) +SYM_DATA(kexec_debug_8250_port, .word 0) + + .balign 16 +SYM_DATA_START_LOCAL(kexec_debug_gdt) + .word kexec_debug_gdt_end - kexec_debug_gdt - 1 + .long 0 + .word 0 + .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ +SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end) + + .balign 8 +SYM_DATA_START(kexec_debug_idt) + .skip 0x100, 0x00 +SYM_DATA_END(kexec_debug_idt) + + .section .text..relocate_kernel,"ax"; .code64 -SYM_CODE_START_NOALIGN(relocate_range) SYM_CODE_START_NOALIGN(relocate_kernel) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR /* * %rdi indirection_page - * %rsi page_list + * %rsi pa_control_page * %rdx start address * %rcx preserve_context * %r8 host_mem_enc_active @@ -62,63 +79,87 @@ SYM_CODE_START_NOALIGN(relocate_kernel) pushq %r15 pushf - movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 - movq %rsp, RSP(%r11) - movq %cr0, %rax - movq %rax, CR0(%r11) - movq %cr3, %rax - movq %rax, CR3(%r11) - movq %cr4, %rax - movq %rax, CR4(%r11) - - /* Save CR4. Required to enable the right paging mode later. */ - movq %rax, %r13 + /* Invalidate GDT/IDT, zero out flags */ + pushq $0 + pushq $0 - /* zero out flags, and disable interrupts */ - pushq $0 + lidt (%rsp) + lgdt (%rsp) + addq $8, %rsp popfq - /* Save SME active flag */ - movq %r8, %r12 + /* Switch to the identity mapped page tables */ + movq %cr3, %rax + movq kexec_pa_table_page(%rip), %r9 + movq %r9, %cr3 - /* - * get physical address of control page now - * this is impossible after page table switch - */ - movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + /* Leave CR4 in %r13 to enable the right paging mode later. */ + movq %cr4, %r13 - /* get physical address of page table now too */ - movq PTR(PA_TABLE_PAGE)(%rsi), %r9 + /* Disable global pages immediately to ensure this mapping is RWX */ + movq %r13, %r12 + andq $~(X86_CR4_PGE), %r12 + movq %r12, %cr4 - /* get physical address of swap page now */ - movq PTR(PA_SWAP_PAGE)(%rsi), %r10 + /* Save %rsp and CRs. */ + movq %r13, saved_cr4(%rip) + movq %rsp, saved_rsp(%rip) + movq %rax, saved_cr3(%rip) + movq %cr0, %rax + movq %rax, saved_cr0(%rip) - /* save some information for jumping back */ - movq %r9, CP_PA_TABLE_PAGE(%r11) - movq %r10, CP_PA_SWAP_PAGE(%r11) - movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) + /* save indirection list for jumping back */ + movq %rdi, pa_backup_pages_map(%rip) - /* Switch to the identity mapped page tables */ - movq %r9, %cr3 + /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */ + movq %rcx, %r11 /* setup a new stack at the end of the physical control page */ - lea PAGE_SIZE(%r8), %rsp + lea PAGE_SIZE(%rsi), %rsp /* jump to identity mapped page */ - addq $(identity_mapped - relocate_kernel), %r8 - pushq %r8 - ANNOTATE_UNRET_SAFE - ret - int3 +0: addq $identity_mapped - 0b, %rsi + subq $__relocate_kernel_start - 0b, %rsi + ANNOTATE_RETPOLINE_SAFE + jmp *%rsi SYM_CODE_END(relocate_kernel) SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) UNWIND_HINT_END_OF_STACK - /* set return address to 0 if not preserving context */ - pushq $0 + /* + * %rdi indirection page + * %rdx start address + * %r8 host_mem_enc_active + * %r9 page table page + * %r11 preserve_context + * %r13 original CR4 when relocate_kernel() was invoked + */ + /* store the start address on the stack */ pushq %rdx + /* Create a GDTR (16 bits limit, 64 bits addr) on stack */ + leaq kexec_debug_gdt(%rip), %rax + pushq %rax + pushw (%rax) + + /* Load the GDT, put the stack back */ + lgdt (%rsp) + addq $10, %rsp + + /* Test that we can load segments */ + movq %ds, %rax + movq %rax, %ds + + /* Now an IDTR on the stack to load the IDT the kernel created */ + leaq kexec_debug_idt(%rip), %rsi + pushq %rsi + pushw $0xff + lidt (%rsp) + addq $10, %rsp + + //int3 + /* * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP * below. @@ -145,16 +186,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * Set cr4 to a known state: * - physical address extension enabled * - 5-level paging, if it was enabled before + * - Machine check exception on TDX guest, if it was enabled before. + * Clearing MCE might not be allowed in TDX guests, depending on setup. + * + * Use R13 that contains the original CR4 value, read in relocate_kernel(). + * PAE is always set in the original CR4. */ - movl $X86_CR4_PAE, %eax - testq $X86_CR4_LA57, %r13 - jz 1f - orl $X86_CR4_LA57, %eax -1: - movq %rax, %cr4 - - jmp 1f -1: + andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d + ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST + movq %r13, %cr4 /* Flush the TLB (needed?) */ movq %r9, %cr3 @@ -164,12 +204,11 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) * entries that will conflict with the now unencrypted memory * used by kexec. Flush the caches before copying the kernel. */ - testq %r12, %r12 - jz 1f + testq %r8, %r8 + jz .Lsme_off wbinvd -1: +.Lsme_off: - movq %rcx, %r11 call swap_pages /* @@ -181,13 +220,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movq %cr3, %rax movq %rax, %cr3 + testq %r11, %r11 /* preserve_context */ + jnz .Lrelocate + /* * set all of the registers to known values * leave %rsp alone */ - testq %r11, %r11 - jnz 1f xorl %eax, %eax xorl %ebx, %ebx xorl %ecx, %ecx @@ -208,22 +248,36 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) ret int3 -1: +.Lrelocate: popq %rdx + + /* Use the swap page for the callee's stack */ + movq kexec_pa_swap_page(%rip), %r10 leaq PAGE_SIZE(%r10), %rsp + + /* push the existing entry point onto the callee's stack */ + pushq %rdx + ANNOTATE_RETPOLINE_SAFE call *%rdx /* get the re-entry point of the peer system */ - movq 0(%rsp), %rbp - leaq relocate_kernel(%rip), %r8 - movq CP_PA_SWAP_PAGE(%r8), %r10 - movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi - movq CP_PA_TABLE_PAGE(%r8), %rax + popq %rbp + movq kexec_pa_swap_page(%rip), %r10 + movq pa_backup_pages_map(%rip), %rdi + movq kexec_pa_table_page(%rip), %rax movq %rax, %cr3 + + /* Find start (and end) of this physical mapping of control page */ + leaq (%rip), %r8 + ANNOTATE_NOENDBR + andq $PAGE_MASK, %r8 lea PAGE_SIZE(%r8), %rsp + movl $1, %r11d /* Ensure preserve_context flag is set */ call swap_pages - movq $virtual_mapped, %rax + movq kexec_va_control_page(%rip), %rax +0: addq $virtual_mapped - 0b, %rax + subq $__relocate_kernel_start - 0b, %rax pushq %rax ANNOTATE_UNRET_SAFE ret @@ -233,13 +287,21 @@ SYM_CODE_END(identity_mapped) SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped) UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR // RET target, above - movq RSP(%r8), %rsp - movq CR4(%r8), %rax + movq saved_rsp(%rip), %rsp + movq saved_cr4(%rip), %rax movq %rax, %cr4 - movq CR3(%r8), %rax - movq CR0(%r8), %r8 + movq saved_cr3(%rip), %rax + movq saved_cr0(%rip), %r8 movq %rax, %cr3 movq %r8, %cr0 + +#ifdef CONFIG_KEXEC_JUMP + /* Saved in save_processor_state. */ + movq $saved_context, %rax + lgdt saved_context_gdt_desc(%rax) +#endif + + /* relocate_kernel() returns the re-entry point for next time */ movq %rbp, %rax popf @@ -257,61 +319,288 @@ SYM_CODE_END(virtual_mapped) /* Do the copies */ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) UNWIND_HINT_END_OF_STACK - movq %rdi, %rcx /* Put the page_list in %rcx */ + /* + * %rdi indirection page + * %r11 preserve_context + */ + movq %rdi, %rcx /* Put the indirection_page in %rcx */ xorl %edi, %edi xorl %esi, %esi - jmp 1f + jmp .Lstart /* Should start with an indirection record */ -0: /* top, read another word for the indirection page */ +.Lloop: /* top, read another word for the indirection page */ movq (%rbx), %rcx addq $8, %rbx -1: +.Lstart: testb $0x1, %cl /* is it a destination page? */ - jz 2f + jz .Lnotdest movq %rcx, %rdi andq $0xfffffffffffff000, %rdi - jmp 0b -2: + jmp .Lloop +.Lnotdest: testb $0x2, %cl /* is it an indirection page? */ - jz 2f + jz .Lnotind movq %rcx, %rbx andq $0xfffffffffffff000, %rbx - jmp 0b -2: + jmp .Lloop +.Lnotind: testb $0x4, %cl /* is it the done indicator? */ - jz 2f - jmp 3f -2: + jz .Lnotdone + jmp .Ldone +.Lnotdone: testb $0x8, %cl /* is it the source indicator? */ - jz 0b /* Ignore it otherwise */ + jz .Lloop /* Ignore it otherwise */ movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi - movq %rdi, %rdx - movq %rsi, %rax + movq %rdi, %rdx /* Save destination page to %rdx */ + movq %rsi, %rax /* Save source page to %rax */ + + testq %r11, %r11 /* Only actually swap for ::preserve_context */ + jz .Lnoswap - movq %r10, %rdi + /* copy source page to swap page */ + movq kexec_pa_swap_page(%rip), %rdi movl $512, %ecx - rep ; movsq + rep movsq + /* copy destination page to source page */ movq %rax, %rdi movq %rdx, %rsi movl $512, %ecx - rep ; movsq + rep movsq + /* copy swap page to destination page */ movq %rdx, %rdi - movq %r10, %rsi + movq kexec_pa_swap_page(%rip), %rsi +.Lnoswap: movl $512, %ecx - rep ; movsq + rep movsq lea PAGE_SIZE(%rax), %rsi - jmp 0b -3: + jmp .Lloop +.Ldone: ANNOTATE_UNRET_SAFE ret int3 SYM_CODE_END(swap_pages) - .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc -SYM_CODE_END(relocate_range); +/* + * Generic 'print character' routine + * - %al: Character to be printed (may clobber %rax) + * - %rdx: MMIO address or port. + */ +#define XMTRDY 0x20 + +#define TXR 0 /* Transmit register (WRITE) */ +#define LSR 5 /* Line Status */ + +SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + addw $LSR, %dx + xchg %al, %ah +.Lxmtrdy_loop: + inb %dx, %al + testb $XMTRDY, %al + jnz .Lready + pause + jmp .Lxmtrdy_loop + +.Lready: + subw $LSR, %dx + xchg %al, %ah + outb %al, %dx +pr_char_null: + ANNOTATE_NOENDBR + + ANNOTATE_UNRET_SAFE + ret +SYM_CODE_END(pr_char_8250) + +SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR +.Lxmtrdy_loop_mmio: + movb (LSR*4)(%rdx), %ah + testb $XMTRDY, %ah + jnz .Lready_mmio + pause + jmp .Lxmtrdy_loop_mmio + +.Lready_mmio: + movb %al, (%rdx) + ANNOTATE_UNRET_SAFE + ret +SYM_CODE_END(pr_char_8250_mmio32) + +/* + * Load pr_char function pointer into %rsi and load %rdx with whatever + * that function wants to see there (typically port/MMIO address). + */ +.macro pr_setup + leaq pr_char_8250(%rip), %rsi + movw kexec_debug_8250_port(%rip), %dx + testw %dx, %dx + jnz 1f + + leaq pr_char_8250_mmio32(%rip), %rsi + movq kexec_debug_8250_mmio32(%rip), %rdx + testq %rdx, %rdx + jnz 1f + + leaq pr_char_null(%rip), %rsi +1: +.endm + +/* Print the nybble in %bl, clobber %rax */ +SYM_CODE_START_LOCAL_NOALIGN(pr_nybble) + UNWIND_HINT_FUNC + movb %bl, %al + nop + andb $0x0f, %al + addb $0x30, %al + cmpb $0x3a, %al + jb 1f + addb $('a' - '0' - 10), %al + ANNOTATE_RETPOLINE_SAFE +1: jmp *%rsi +SYM_CODE_END(pr_nybble) + +SYM_CODE_START_LOCAL_NOALIGN(pr_qword) + UNWIND_HINT_FUNC + movq $16, %rcx +1: rolq $4, %rbx + call pr_nybble + loop 1b + movb $'\n', %al + ANNOTATE_RETPOLINE_SAFE + jmp *%rsi +SYM_CODE_END(pr_qword) + +.macro print_reg a, b, c, d, r + movb $\a, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movb $\b, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movb $\c, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movb $\d, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movq \r, %rbx + call pr_qword +.endm + +SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors) + /* Each of these is 6 bytes. */ +.macro vec_err exc + UNWIND_HINT_ENTRY + . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE) + nop + nop + pushq $\exc + jmp exc_handler +.endm + +.macro vec_noerr exc + UNWIND_HINT_ENTRY + . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE) + pushq $0 + pushq $\exc + jmp exc_handler +.endm + + ANNOTATE_NOENDBR + vec_noerr 0 // #DE + vec_noerr 1 // #DB + vec_noerr 2 // #NMI + vec_noerr 3 // #BP + vec_noerr 4 // #OF + vec_noerr 5 // #BR + vec_noerr 6 // #UD + vec_noerr 7 // #NM + vec_err 8 // #DF + vec_noerr 9 + vec_err 10 // #TS + vec_err 11 // #NP + vec_err 12 // #SS + vec_err 13 // #GP + vec_err 14 // #PF + vec_noerr 15 +SYM_CODE_END(kexec_debug_exc_vectors) + +SYM_CODE_START_LOCAL_NOALIGN(exc_handler) + /* No need for RET mitigations during kexec */ + VALIDATE_UNRET_END + + pushq %rax + pushq %rbx + pushq %rcx + pushq %rdx + pushq %rsi + + /* Stack frame */ +#define EXC_SS 0x58 /* Architectural... */ +#define EXC_RSP 0x50 +#define EXC_EFLAGS 0x48 +#define EXC_CS 0x40 +#define EXC_RIP 0x38 +#define EXC_ERRORCODE 0x30 /* Either architectural or zero pushed by handler */ +#define EXC_EXCEPTION 0x28 /* Pushed by handler entry point */ +#define EXC_RAX 0x20 /* Pushed just above in exc_handler */ +#define EXC_RBX 0x18 +#define EXC_RCX 0x10 +#define EXC_RDX 0x08 +#define EXC_RSI 0x00 + + /* Set up %rdx/%rsi for debug output */ + pr_setup + + /* rip and exception info */ + print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp) + print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp) + print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp) + print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp) + + /* We spilled these to the stack */ + print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp) + print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp) + print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp) + print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp) + print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp) + + /* Other registers untouched */ + print_reg 'r', 'd', 'i', ':', %rdi + print_reg 'r', '8', ' ', ':', %r8 + print_reg 'r', '9', ' ', ':', %r9 + print_reg 'r', '1', '0', ':', %r10 + print_reg 'r', '1', '1', ':', %r11 + print_reg 'r', '1', '2', ':', %r12 + print_reg 'r', '1', '3', ':', %r13 + print_reg 'r', '1', '4', ':', %r14 + print_reg 'r', '1', '5', ':', %r15 + print_reg 'c', 'r', '2', ':', %cr2 + + /* Only return from INT3 */ + cmpq $3, EXC_EXCEPTION(%rsp) + jne .Ldie + + popq %rsi + popq %rdx + popq %rcx + popq %rbx + popq %rax + + addq $16, %rsp + iretq + +.Ldie: + hlt + jmp .Ldie + +SYM_CODE_END(exc_handler) diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 2e7066980f3e..51a849a79c98 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -10,7 +10,6 @@ #include <asm/vsyscall.h> #include <asm/x86_init.h> #include <asm/time.h> -#include <asm/intel-mid.h> #include <asm/setup.h> #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ef206500ed6f..fb27be697128 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -7,10 +7,11 @@ */ #include <linux/acpi.h> #include <linux/console.h> +#include <linux/cpu.h> #include <linux/crash_dump.h> #include <linux/dma-map-ops.h> -#include <linux/dmi.h> #include <linux/efi.h> +#include <linux/hugetlb.h> #include <linux/ima.h> #include <linux/init_ohci1394_dma.h> #include <linux/initrd.h> @@ -18,24 +19,23 @@ #include <linux/memblock.h> #include <linux/panic_notifier.h> #include <linux/pci.h> +#include <linux/random.h> #include <linux/root_dev.h> -#include <linux/hugetlb.h> -#include <linux/tboot.h> -#include <linux/usb/xhci-dbgp.h> #include <linux/static_call.h> #include <linux/swiotlb.h> -#include <linux/random.h> +#include <linux/tboot.h> +#include <linux/usb/xhci-dbgp.h> +#include <linux/vmalloc.h> #include <uapi/linux/mount.h> #include <xen/xen.h> #include <asm/apic.h> -#include <asm/efi.h> -#include <asm/numa.h> #include <asm/bios_ebda.h> #include <asm/bugs.h> #include <asm/cacheinfo.h> +#include <asm/coco.h> #include <asm/cpu.h> #include <asm/efi.h> #include <asm/gart.h> @@ -46,15 +46,16 @@ #include <asm/mce.h> #include <asm/memtype.h> #include <asm/mtrr.h> -#include <asm/realmode.h> +#include <asm/nmi.h> +#include <asm/numa.h> #include <asm/olpc_ofw.h> #include <asm/pci-direct.h> #include <asm/prom.h> #include <asm/proto.h> +#include <asm/realmode.h> #include <asm/thermal.h> #include <asm/unwind.h> #include <asm/vsyscall.h> -#include <linux/vmalloc.h> /* * max_low_pfn_mapped: highest directly mapped pfn < 4 GB @@ -130,6 +131,7 @@ struct ist_info ist_info; struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); +SYM_PIC_ALIAS(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) __visible unsigned long mmu_cr4_features __ro_after_init; @@ -145,6 +147,67 @@ static size_t ima_kexec_buffer_size; /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ int bootloader_type, bootloader_version; +static const struct ctl_table x86_sysctl_table[] = { + { + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_type", + .data = &bootloader_type, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "io_delay_type", + .data = &io_delay_type, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#if defined(CONFIG_ACPI_SLEEP) + { + .procname = "acpi_video_flags", + .data = &acpi_realmode_flags, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif +}; + +static int __init init_x86_sysctl(void) +{ + register_sysctl_init("kernel", x86_sysctl_table); + return 0; +} +arch_initcall(init_x86_sysctl); + /* * Setup options */ @@ -163,7 +226,8 @@ unsigned long saved_video_mode; static char __initdata command_line[COMMAND_LINE_SIZE]; #ifdef CONFIG_CMDLINE_BOOL -static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +char builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +bool builtin_cmdline_added __ro_after_init; #endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) @@ -218,8 +282,8 @@ static void __init cleanup_highmap(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - memblock_reserve(__pa_symbol(_brk_start), - _brk_end - _brk_start); + memblock_reserve_kern(__pa_symbol(_brk_start), + _brk_end - _brk_start); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -257,6 +321,7 @@ static void __init relocate_initrd(void) u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); + int ret = 0; /* We need to move the initrd down into directly mapped mem */ u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0, @@ -270,7 +335,9 @@ static void __init relocate_initrd(void) printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); - copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); + ret = copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); + if (ret) + panic("Copy RAMDISK failed\n"); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", @@ -289,7 +356,7 @@ static void __init early_reserve_initrd(void) !ramdisk_image || !ramdisk_size) return; /* No initrd provided by bootloader */ - memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); + memblock_reserve_kern(ramdisk_image, ramdisk_end - ramdisk_image); } static void __init reserve_initrd(void) @@ -342,7 +409,7 @@ static void __init add_early_ima_buffer(u64 phys_addr) } if (data->size) { - memblock_reserve(data->addr, data->size); + memblock_reserve_kern(data->addr, data->size); ima_kexec_buffer_phys = data->addr; ima_kexec_buffer_size = data->size; } @@ -380,6 +447,29 @@ int __init ima_get_kexec_buffer(void **addr, size_t *size) } #endif +static void __init add_kho(u64 phys_addr, u32 data_len) +{ + struct kho_data *kho; + u64 addr = phys_addr + sizeof(struct setup_data); + u64 size = data_len - sizeof(struct setup_data); + + if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER)) { + pr_warn("Passed KHO data, but CONFIG_KEXEC_HANDOVER not set. Ignoring.\n"); + return; + } + + kho = early_memremap(addr, size); + if (!kho) { + pr_warn("setup: failed to memremap kho data (0x%llx, 0x%llx)\n", + addr, size); + return; + } + + kho_populate(kho->fdt_addr, kho->fdt_size, kho->scratch_addr, kho->scratch_size); + + early_memunmap(kho, size); +} + static void __init parse_setup_data(void) { struct setup_data *data; @@ -408,6 +498,9 @@ static void __init parse_setup_data(void) case SETUP_IMA: add_early_ima_buffer(pa_data); break; + case SETUP_KEXEC_KHO: + add_kho(pa_data, data_len); + break; case SETUP_RNG_SEED: data = early_memremap(pa_data, data_len); add_bootloader_randomness(data->data, data->len); @@ -424,6 +517,46 @@ static void __init parse_setup_data(void) } } +/* + * Translate the fields of 'struct boot_param' into global variables + * representing these parameters. + */ +static void __init parse_boot_params(void) +{ + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); + screen_info = boot_params.screen_info; + edid_info = boot_params.edid_info; +#ifdef CONFIG_X86_32 + apm_info.bios = boot_params.apm_bios_info; + ist_info = boot_params.ist_info; +#endif + saved_video_mode = boot_params.hdr.vid_mode; + bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; +#endif +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + EFI32_LOADER_SIGNATURE, 4)) { + set_bit(EFI_BOOT, &efi.flags); + } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + EFI64_LOADER_SIGNATURE, 4)) { + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); + } +#endif + + if (!boot_params.hdr.root_flags) + root_mountflags &= ~MS_RDONLY; +} + static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_indirect *indirect; @@ -442,7 +575,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) len = sizeof(*data); pa_next = data->next; - memblock_reserve(pa_data, sizeof(*data) + data->len); + memblock_reserve_kern(pa_data, sizeof(*data) + data->len); if (data->type == SETUP_INDIRECT) { len += data->len; @@ -456,7 +589,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) indirect = (struct setup_indirect *)data->data; if (indirect->type != SETUP_INDIRECT) - memblock_reserve(indirect->addr, indirect->len); + memblock_reserve_kern(indirect->addr, indirect->len); } pa_data = pa_next; @@ -467,14 +600,13 @@ static void __init memblock_x86_reserve_range_setup_data(void) static void __init arch_reserve_crashkernel(void) { unsigned long long crash_base, crash_size, low_size = 0; - char *cmdline = boot_command_line; bool high = false; int ret; if (!IS_ENABLED(CONFIG_CRASH_RESERVE)) return; - ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base, &low_size, &high); if (ret) @@ -485,8 +617,7 @@ static void __init arch_reserve_crashkernel(void) return; } - reserve_crashkernel_generic(cmdline, crash_size, crash_base, - low_size, high); + reserve_crashkernel_generic(crash_size, crash_base, low_size, high); } static struct resource standard_io_resources[] = { @@ -522,6 +653,23 @@ void __init reserve_standard_io_resources(void) } +static void __init setup_kernel_resources(void) +{ + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext)-1; + rodata_resource.start = __pa_symbol(__start_rodata); + rodata_resource.end = __pa_symbol(__end_rodata)-1; + data_resource.start = __pa_symbol(_sdata); + data_resource.end = __pa_symbol(_edata)-1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; + + insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &rodata_resource); + insert_resource(&iomem_resource, &data_resource); + insert_resource(&iomem_resource, &bss_resource); +} + static bool __init snb_gfx_workaround_needed(void) { #ifdef CONFIG_PCI @@ -644,8 +792,8 @@ static void __init early_reserve_memory(void) * __end_of_kernel_reserve symbol must be explicitly reserved with a * separate memblock_reserve() or they will be discarded. */ - memblock_reserve(__pa_symbol(_text), - (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); + memblock_reserve_kern(__pa_symbol(_text), + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); /* * The first 4Kb of memory is a BIOS owned area, but generally it is @@ -753,6 +901,23 @@ void __init setup_arch(char **cmdline_p) boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS; #endif +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif + builtin_cmdline_added = true; +#endif + + strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + /* * If we have OLPC OFW, we might end up relocating the fixmap due to * reserve_top(), so do this before touching the ioremap area. @@ -767,35 +932,7 @@ void __init setup_arch(char **cmdline_p) setup_olpc_ofw_pgd(); - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); - screen_info = boot_params.screen_info; - edid_info = boot_params.edid_info; -#ifdef CONFIG_X86_32 - apm_info.bios = boot_params.apm_bios_info; - ist_info = boot_params.ist_info; -#endif - saved_video_mode = boot_params.hdr.vid_mode; - bootloader_type = boot_params.hdr.type_of_loader; - if ((bootloader_type >> 4) == 0xe) { - bootloader_type &= 0xf; - bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; - } - bootloader_version = bootloader_type & 0xf; - bootloader_version |= boot_params.hdr.ext_loader_ver << 4; - -#ifdef CONFIG_BLK_DEV_RAM - rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; -#endif -#ifdef CONFIG_EFI - if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - EFI32_LOADER_SIGNATURE, 4)) { - set_bit(EFI_BOOT, &efi.flags); - } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, - EFI64_LOADER_SIGNATURE, 4)) { - set_bit(EFI_BOOT, &efi.flags); - set_bit(EFI_64BIT, &efi.flags); - } -#endif + parse_boot_params(); x86_init.oem.arch_setup(); @@ -819,35 +956,8 @@ void __init setup_arch(char **cmdline_p) copy_edd(); - if (!boot_params.hdr.root_flags) - root_mountflags &= ~MS_RDONLY; setup_initial_init_mm(_text, _etext, _edata, (void *)_brk_end); - code_resource.start = __pa_symbol(_text); - code_resource.end = __pa_symbol(_etext)-1; - rodata_resource.start = __pa_symbol(__start_rodata); - rodata_resource.end = __pa_symbol(__end_rodata)-1; - data_resource.start = __pa_symbol(_sdata); - data_resource.end = __pa_symbol(_edata)-1; - bss_resource.start = __pa_symbol(__bss_start); - bss_resource.end = __pa_symbol(__bss_stop)-1; - -#ifdef CONFIG_CMDLINE_BOOL -#ifdef CONFIG_CMDLINE_OVERRIDE - strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); -#else - if (builtin_cmdline[0]) { - /* append boot loader cmdline to builtin */ - strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); - strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); - strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); - } -#endif -#endif - - strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - /* * x86_configure_nx() is called before parse_early_param() to detect * whether hardware doesn't support NX (so that the early EHCI debug @@ -860,30 +970,6 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_memblock_x86_reserve_range(); -#ifdef CONFIG_MEMORY_HOTPLUG - /* - * Memory used by the kernel cannot be hot-removed because Linux - * cannot migrate the kernel pages. When memory hotplug is - * enabled, we should prevent memblock from allocating memory - * for the kernel. - * - * ACPI SRAT records all hotpluggable memory ranges. But before - * SRAT is parsed, we don't know about it. - * - * The kernel image is loaded into memory at very early time. We - * cannot prevent this anyway. So on NUMA system, we set any - * node the kernel resides in as un-hotpluggable. - * - * Since on modern servers, one node could have double-digit - * gigabytes memory, we can assume the memory around the kernel - * image is also un-hotpluggable. So before SRAT is parsed, just - * allocate memory near the kernel image to try the best to keep - * the kernel away from hotpluggable memory. - */ - if (movable_node_is_enabled()) - memblock_set_bottom_up(true); -#endif - x86_report_nx(); apic_setup_apic_calls(); @@ -895,14 +981,13 @@ void __init setup_arch(char **cmdline_p) setup_clear_cpu_cap(X86_FEATURE_APIC); } - e820__reserve_setup_data(); e820__finish_early_params(); if (efi_enabled(EFI_BOOT)) efi_init(); reserve_ibft_region(); - dmi_setup(); + x86_init.resources.dmi_setup(); /* * VMware detection requires dmi to be available, so this @@ -915,11 +1000,11 @@ void __init setup_arch(char **cmdline_p) tsc_early_init(); x86_init.resources.probe_roms(); - /* after parse_early_param, so could debug it */ - insert_resource(&iomem_resource, &code_resource); - insert_resource(&iomem_resource, &rodata_resource); - insert_resource(&iomem_resource, &data_resource); - insert_resource(&iomem_resource, &bss_resource); + /* + * Add resources for kernel text and data to the iomem_resource. + * Do it after parse_early_param, so it can be debugged. + */ + setup_kernel_resources(); e820_add_kernel_range(); trim_bios_range(); @@ -966,8 +1051,6 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = e820__end_of_low_ram_pfn(); else max_low_pfn = max_pfn; - - high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif /* Find and reserve MPTABLE area */ @@ -984,7 +1067,6 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); - memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); /* @@ -992,8 +1074,8 @@ void __init setup_arch(char **cmdline_p) * memory size. */ mem_encrypt_setup_arch(); + cc_random_init(); - efi_fake_memmap(); efi_find_mirror(); efi_esrt_init(); efi_mokvar_table_init(); @@ -1036,7 +1118,12 @@ void __init setup_arch(char **cmdline_p) init_mem_mapping(); - idt_setup_early_pf(); + /* + * init_mem_mapping() relies on the early IDT page fault handling. + * Now either enable FRED or install the real page fault handler + * for 64-bit in the IDT. + */ + cpu_init_replace_early_idt(); /* * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) @@ -1097,8 +1184,10 @@ void __init setup_arch(char **cmdline_p) initmem_init(); dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); - if (boot_cpu_has(X86_FEATURE_GBPAGES)) + if (boot_cpu_has(X86_FEATURE_GBPAGES)) { hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); + hugetlb_bootmem_alloc(); + } /* * Reserve memory for crash kernel after SRAT is parsed so that it @@ -1106,8 +1195,6 @@ void __init setup_arch(char **cmdline_p) */ arch_reserve_crashkernel(); - memblock_find_dma_reserve(); - if (!early_xdbc_setup_hardware()) early_xdbc_register_console(); @@ -1217,3 +1304,10 @@ static int __init register_kernel_offset_dumper(void) return 0; } __initcall(register_kernel_offset_dumper); + +#ifdef CONFIG_HOTPLUG_CPU +bool arch_cpu_is_hotpluggable(int cpu) +{ + return cpu > 0; +} +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index b30d6e180df7..bfa48e7a32a2 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -23,18 +23,13 @@ #include <asm/cpumask.h> #include <asm/cpu.h> -#ifdef CONFIG_X86_64 -#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) -#else -#define BOOT_PERCPU_OFFSET 0 -#endif +DEFINE_PER_CPU_CACHE_HOT(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); -DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; +DEFINE_PER_CPU_CACHE_HOT(unsigned long, this_cpu_off); EXPORT_PER_CPU_SYMBOL(this_cpu_off); -unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = { - [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, -}; +unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init; EXPORT_SYMBOL(__per_cpu_offset); /* @@ -169,7 +164,7 @@ void __init setup_per_cpu_areas(void) for_each_possible_cpu(cpu) { per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); - per_cpu(pcpu_hot.cpu_number, cpu) = cpu; + per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); /* * Copy data used in early init routines from the diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c deleted file mode 100644 index 8b04958da5e7..000000000000 --- a/arch/x86/kernel/sev-shared.c +++ /dev/null @@ -1,1269 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * AMD Encrypted Register State Support - * - * Author: Joerg Roedel <jroedel@suse.de> - * - * This file is not compiled stand-alone. It contains code shared - * between the pre-decompression boot code and the running Linux kernel - * and is included directly into both code-bases. - */ - -#include <asm/setup_data.h> - -#ifndef __BOOT_COMPRESSED -#define error(v) pr_err(v) -#define has_cpuflag(f) boot_cpu_has(f) -#define sev_printk(fmt, ...) printk(fmt, ##__VA_ARGS__) -#define sev_printk_rtl(fmt, ...) printk_ratelimited(fmt, ##__VA_ARGS__) -#else -#undef WARN -#define WARN(condition, format...) (!!(condition)) -#define sev_printk(fmt, ...) -#define sev_printk_rtl(fmt, ...) -#endif - -/* I/O parameters for CPUID-related helpers */ -struct cpuid_leaf { - u32 fn; - u32 subfn; - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; -}; - -/* - * Individual entries of the SNP CPUID table, as defined by the SNP - * Firmware ABI, Revision 0.9, Section 7.1, Table 14. - */ -struct snp_cpuid_fn { - u32 eax_in; - u32 ecx_in; - u64 xcr0_in; - u64 xss_in; - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u64 __reserved; -} __packed; - -/* - * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, - * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit - * of 64 entries per CPUID table. - */ -#define SNP_CPUID_COUNT_MAX 64 - -struct snp_cpuid_table { - u32 count; - u32 __reserved1; - u64 __reserved2; - struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; -} __packed; - -/* - * Since feature negotiation related variables are set early in the boot - * process they must reside in the .data section so as not to be zeroed - * out when the .bss section is later cleared. - * - * GHCB protocol version negotiated with the hypervisor. - */ -static u16 ghcb_version __ro_after_init; - -/* Copy of the SNP firmware's CPUID page. */ -static struct snp_cpuid_table cpuid_table_copy __ro_after_init; - -/* - * These will be initialized based on CPUID table so that non-present - * all-zero leaves (for sparse tables) can be differentiated from - * invalid/out-of-range leaves. This is needed since all-zero leaves - * still need to be post-processed. - */ -static u32 cpuid_std_range_max __ro_after_init; -static u32 cpuid_hyp_range_max __ro_after_init; -static u32 cpuid_ext_range_max __ro_after_init; - -static bool __init sev_es_check_cpu_features(void) -{ - if (!has_cpuflag(X86_FEATURE_RDRAND)) { - error("RDRAND instruction not supported - no trusted source of randomness available\n"); - return false; - } - - return true; -} - -static void __head __noreturn -sev_es_terminate(unsigned int set, unsigned int reason) -{ - u64 val = GHCB_MSR_TERM_REQ; - - /* Tell the hypervisor what went wrong. */ - val |= GHCB_SEV_TERM_REASON(set, reason); - - /* Request Guest Termination from Hypervisor */ - sev_es_wr_ghcb_msr(val); - VMGEXIT(); - - while (true) - asm volatile("hlt\n" : : : "memory"); -} - -/* - * The hypervisor features are available from GHCB version 2 onward. - */ -static u64 get_hv_features(void) -{ - u64 val; - - if (ghcb_version < 2) - return 0; - - sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP) - return 0; - - return GHCB_MSR_HV_FT_RESP_VAL(val); -} - -static void snp_register_ghcb_early(unsigned long paddr) -{ - unsigned long pfn = paddr >> PAGE_SHIFT; - u64 val; - - sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - /* If the response GPA is not ours then abort the guest */ - if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || - (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); -} - -static bool sev_es_negotiate_protocol(void) -{ - u64 val; - - /* Do the GHCB protocol version negotiation */ - sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - - if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) - return false; - - if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || - GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) - return false; - - ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); - - return true; -} - -static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb) -{ - ghcb->save.sw_exit_code = 0; - __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); -} - -static bool vc_decoding_needed(unsigned long exit_code) -{ - /* Exceptions don't require to decode the instruction */ - return !(exit_code >= SVM_EXIT_EXCP_BASE && - exit_code <= SVM_EXIT_LAST_EXCP); -} - -static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt, - struct pt_regs *regs, - unsigned long exit_code) -{ - enum es_result ret = ES_OK; - - memset(ctxt, 0, sizeof(*ctxt)); - ctxt->regs = regs; - - if (vc_decoding_needed(exit_code)) - ret = vc_decode_insn(ctxt); - - return ret; -} - -static void vc_finish_insn(struct es_em_ctxt *ctxt) -{ - ctxt->regs->ip += ctxt->insn.length; -} - -static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - u32 ret; - - ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0); - if (!ret) - return ES_OK; - - if (ret == 1) { - u64 info = ghcb->save.sw_exit_info_2; - unsigned long v = info & SVM_EVTINJ_VEC_MASK; - - /* Check if exception information from hypervisor is sane. */ - if ((info & SVM_EVTINJ_VALID) && - ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) && - ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) { - ctxt->fi.vector = v; - - if (info & SVM_EVTINJ_VALID_ERR) - ctxt->fi.error_code = info >> 32; - - return ES_EXCEPTION; - } - } - - return ES_VMM_ERROR; -} - -static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - u64 exit_code, u64 exit_info_1, - u64 exit_info_2) -{ - /* Fill in protocol and format specifiers */ - ghcb->protocol_version = ghcb_version; - ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; - - ghcb_set_sw_exit_code(ghcb, exit_code); - ghcb_set_sw_exit_info_1(ghcb, exit_info_1); - ghcb_set_sw_exit_info_2(ghcb, exit_info_2); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - return verify_exception_info(ghcb, ctxt); -} - -static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg) -{ - u64 val; - - sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx)); - VMGEXIT(); - val = sev_es_rd_ghcb_msr(); - if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP) - return -EIO; - - *reg = (val >> 32); - - return 0; -} - -static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf) -{ - int ret; - - /* - * MSR protocol does not support fetching non-zero subfunctions, but is - * sufficient to handle current early-boot cases. Should that change, - * make sure to report an error rather than ignoring the index and - * grabbing random values. If this issue arises in the future, handling - * can be added here to use GHCB-page protocol for cases that occur late - * enough in boot that GHCB page is available. - */ - if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn) - return -EINVAL; - - ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax); - ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx); - ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx); - ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx); - - return ret; -} - -static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - u32 cr4 = native_read_cr4(); - int ret; - - ghcb_set_rax(ghcb, leaf->fn); - ghcb_set_rcx(ghcb, leaf->subfn); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #UD - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - leaf->eax = ghcb->save.rax; - leaf->ebx = ghcb->save.rbx; - leaf->ecx = ghcb->save.rcx; - leaf->edx = ghcb->save.rdx; - - return ES_OK; -} - -static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf) - : __sev_cpuid_hv_msr(leaf); -} - -/* - * This may be called early while still running on the initial identity - * mapping. Use RIP-relative addressing to obtain the correct address - * while running with the initial identity mapping as well as the - * switch-over to kernel virtual addresses later. - */ -static const struct snp_cpuid_table *snp_cpuid_get_table(void) -{ - return &RIP_REL_REF(cpuid_table_copy); -} - -/* - * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of - * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0 - * and 1 based on the corresponding features enabled by a particular - * combination of XCR0 and XSS registers so that a guest can look up the - * version corresponding to the features currently enabled in its XCR0/XSS - * registers. The only values that differ between these versions/table - * entries is the enabled XSAVE area size advertised via EBX. - * - * While hypervisors may choose to make use of this support, it is more - * robust/secure for a guest to simply find the entry corresponding to the - * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the - * XSAVE area size using subfunctions 2 through 64, as documented in APM - * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here. - * - * Since base/legacy XSAVE area size is documented as 0x240, use that value - * directly rather than relying on the base size in the CPUID table. - * - * Return: XSAVE area size on success, 0 otherwise. - */ -static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) -{ - const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); - u64 xfeatures_found = 0; - u32 xsave_size = 0x240; - int i; - - for (i = 0; i < cpuid_table->count; i++) { - const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; - - if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64)) - continue; - if (!(xfeatures_en & (BIT_ULL(e->ecx_in)))) - continue; - if (xfeatures_found & (BIT_ULL(e->ecx_in))) - continue; - - xfeatures_found |= (BIT_ULL(e->ecx_in)); - - if (compacted) - xsave_size += e->eax; - else - xsave_size = max(xsave_size, e->eax + e->ebx); - } - - /* - * Either the guest set unsupported XCR0/XSS bits, or the corresponding - * entries in the CPUID table were not present. This is not a valid - * state to be in. - */ - if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2))) - return 0; - - return xsave_size; -} - -static bool __head -snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) -{ - const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); - int i; - - for (i = 0; i < cpuid_table->count; i++) { - const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; - - if (e->eax_in != leaf->fn) - continue; - - if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn) - continue; - - /* - * For 0xD subfunctions 0 and 1, only use the entry corresponding - * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0). - * See the comments above snp_cpuid_calc_xsave_size() for more - * details. - */ - if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1)) - if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in) - continue; - - leaf->eax = e->eax; - leaf->ebx = e->ebx; - leaf->ecx = e->ecx; - leaf->edx = e->edx; - - return true; - } - - return false; -} - -static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - if (sev_cpuid_hv(ghcb, ctxt, leaf)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); -} - -static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - struct cpuid_leaf *leaf) -{ - struct cpuid_leaf leaf_hv = *leaf; - - switch (leaf->fn) { - case 0x1: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); - - /* initial APIC ID */ - leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); - /* APIC enabled bit */ - leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9)); - - /* OSXSAVE enabled bit */ - if (native_read_cr4() & X86_CR4_OSXSAVE) - leaf->ecx |= BIT(27); - break; - case 0x7: - /* OSPKE enabled bit */ - leaf->ecx &= ~BIT(4); - if (native_read_cr4() & X86_CR4_PKE) - leaf->ecx |= BIT(4); - break; - case 0xB: - leaf_hv.subfn = 0; - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); - - /* extended APIC ID */ - leaf->edx = leaf_hv.edx; - break; - case 0xD: { - bool compacted = false; - u64 xcr0 = 1, xss = 0; - u32 xsave_size; - - if (leaf->subfn != 0 && leaf->subfn != 1) - return 0; - - if (native_read_cr4() & X86_CR4_OSXSAVE) - xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - if (leaf->subfn == 1) { - /* Get XSS value if XSAVES is enabled. */ - if (leaf->eax & BIT(3)) { - unsigned long lo, hi; - - asm volatile("rdmsr" : "=a" (lo), "=d" (hi) - : "c" (MSR_IA32_XSS)); - xss = (hi << 32) | lo; - } - - /* - * The PPR and APM aren't clear on what size should be - * encoded in 0xD:0x1:EBX when compaction is not enabled - * by either XSAVEC (feature bit 1) or XSAVES (feature - * bit 3) since SNP-capable hardware has these feature - * bits fixed as 1. KVM sets it to 0 in this case, but - * to avoid this becoming an issue it's safer to simply - * treat this as unsupported for SNP guests. - */ - if (!(leaf->eax & (BIT(1) | BIT(3)))) - return -EINVAL; - - compacted = true; - } - - xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted); - if (!xsave_size) - return -EINVAL; - - leaf->ebx = xsave_size; - } - break; - case 0x8000001E: - snp_cpuid_hv(ghcb, ctxt, &leaf_hv); - - /* extended APIC ID */ - leaf->eax = leaf_hv.eax; - /* compute ID */ - leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0)); - /* node ID */ - leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0)); - break; - default: - /* No fix-ups needed, use values as-is. */ - break; - } - - return 0; -} - -/* - * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value - * should be treated as fatal by caller. - */ -static int __head -snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf) -{ - const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); - - if (!cpuid_table->count) - return -EOPNOTSUPP; - - if (!snp_cpuid_get_validated_func(leaf)) { - /* - * Some hypervisors will avoid keeping track of CPUID entries - * where all values are zero, since they can be handled the - * same as out-of-range values (all-zero). This is useful here - * as well as it allows virtually all guest configurations to - * work using a single SNP CPUID table. - * - * To allow for this, there is a need to distinguish between - * out-of-range entries and in-range zero entries, since the - * CPUID table entries are only a template that may need to be - * augmented with additional values for things like - * CPU-specific information during post-processing. So if it's - * not in the table, set the values to zero. Then, if they are - * within a valid CPUID range, proceed with post-processing - * using zeros as the initial values. Otherwise, skip - * post-processing and just return zeros immediately. - */ - leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; - - /* Skip post-processing for out-of-range zero leafs. */ - if (!(leaf->fn <= RIP_REL_REF(cpuid_std_range_max) || - (leaf->fn >= 0x40000000 && leaf->fn <= RIP_REL_REF(cpuid_hyp_range_max)) || - (leaf->fn >= 0x80000000 && leaf->fn <= RIP_REL_REF(cpuid_ext_range_max)))) - return 0; - } - - return snp_cpuid_postprocess(ghcb, ctxt, leaf); -} - -/* - * Boot VC Handler - This is the first VC handler during boot, there is no GHCB - * page yet, so it only supports the MSR based communication with the - * hypervisor and only the CPUID exit-code. - */ -void __head do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) -{ - unsigned int subfn = lower_bits(regs->cx, 32); - unsigned int fn = lower_bits(regs->ax, 32); - u16 opcode = *(unsigned short *)regs->ip; - struct cpuid_leaf leaf; - int ret; - - /* Only CPUID is supported via MSR protocol */ - if (exit_code != SVM_EXIT_CPUID) - goto fail; - - /* Is it really a CPUID insn? */ - if (opcode != 0xa20f) - goto fail; - - leaf.fn = fn; - leaf.subfn = subfn; - - ret = snp_cpuid(NULL, NULL, &leaf); - if (!ret) - goto cpuid_done; - - if (ret != -EOPNOTSUPP) - goto fail; - - if (__sev_cpuid_hv_msr(&leaf)) - goto fail; - -cpuid_done: - regs->ax = leaf.eax; - regs->bx = leaf.ebx; - regs->cx = leaf.ecx; - regs->dx = leaf.edx; - - /* - * This is a VC handler and the #VC is only raised when SEV-ES is - * active, which means SEV must be active too. Do sanity checks on the - * CPUID results to make sure the hypervisor does not trick the kernel - * into the no-sev path. This could map sensitive data unencrypted and - * make it accessible to the hypervisor. - * - * In particular, check for: - * - Availability of CPUID leaf 0x8000001f - * - SEV CPUID bit. - * - * The hypervisor might still report the wrong C-bit position, but this - * can't be checked here. - */ - - if (fn == 0x80000000 && (regs->ax < 0x8000001f)) - /* SEV leaf check */ - goto fail; - else if ((fn == 0x8000001f && !(regs->ax & BIT(1)))) - /* SEV bit */ - goto fail; - - /* Skip over the CPUID two-byte opcode */ - regs->ip += 2; - - return; - -fail: - /* Terminate the guest */ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); -} - -static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt, - unsigned long address, - bool write) -{ - if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_USER; - ctxt->fi.cr2 = address; - if (write) - ctxt->fi.error_code |= X86_PF_WRITE; - - return ES_EXCEPTION; - } - - return ES_OK; -} - -static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, - void *src, char *buf, - unsigned int data_size, - unsigned int count, - bool backwards) -{ - int i, b = backwards ? -1 : 1; - unsigned long address = (unsigned long)src; - enum es_result ret; - - ret = vc_insn_string_check(ctxt, address, false); - if (ret != ES_OK) - return ret; - - for (i = 0; i < count; i++) { - void *s = src + (i * data_size * b); - char *d = buf + (i * data_size); - - ret = vc_read_mem(ctxt, s, d, data_size); - if (ret != ES_OK) - break; - } - - return ret; -} - -static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt, - void *dst, char *buf, - unsigned int data_size, - unsigned int count, - bool backwards) -{ - int i, s = backwards ? -1 : 1; - unsigned long address = (unsigned long)dst; - enum es_result ret; - - ret = vc_insn_string_check(ctxt, address, true); - if (ret != ES_OK) - return ret; - - for (i = 0; i < count; i++) { - void *d = dst + (i * data_size * s); - char *b = buf + (i * data_size); - - ret = vc_write_mem(ctxt, d, b, data_size); - if (ret != ES_OK) - break; - } - - return ret; -} - -#define IOIO_TYPE_STR BIT(2) -#define IOIO_TYPE_IN 1 -#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR) -#define IOIO_TYPE_OUT 0 -#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR) - -#define IOIO_REP BIT(3) - -#define IOIO_ADDR_64 BIT(9) -#define IOIO_ADDR_32 BIT(8) -#define IOIO_ADDR_16 BIT(7) - -#define IOIO_DATA_32 BIT(6) -#define IOIO_DATA_16 BIT(5) -#define IOIO_DATA_8 BIT(4) - -#define IOIO_SEG_ES (0 << 10) -#define IOIO_SEG_DS (3 << 10) - -static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo) -{ - struct insn *insn = &ctxt->insn; - size_t size; - u64 port; - - *exitinfo = 0; - - switch (insn->opcode.bytes[0]) { - /* INS opcodes */ - case 0x6c: - case 0x6d: - *exitinfo |= IOIO_TYPE_INS; - *exitinfo |= IOIO_SEG_ES; - port = ctxt->regs->dx & 0xffff; - break; - - /* OUTS opcodes */ - case 0x6e: - case 0x6f: - *exitinfo |= IOIO_TYPE_OUTS; - *exitinfo |= IOIO_SEG_DS; - port = ctxt->regs->dx & 0xffff; - break; - - /* IN immediate opcodes */ - case 0xe4: - case 0xe5: - *exitinfo |= IOIO_TYPE_IN; - port = (u8)insn->immediate.value & 0xffff; - break; - - /* OUT immediate opcodes */ - case 0xe6: - case 0xe7: - *exitinfo |= IOIO_TYPE_OUT; - port = (u8)insn->immediate.value & 0xffff; - break; - - /* IN register opcodes */ - case 0xec: - case 0xed: - *exitinfo |= IOIO_TYPE_IN; - port = ctxt->regs->dx & 0xffff; - break; - - /* OUT register opcodes */ - case 0xee: - case 0xef: - *exitinfo |= IOIO_TYPE_OUT; - port = ctxt->regs->dx & 0xffff; - break; - - default: - return ES_DECODE_FAILED; - } - - *exitinfo |= port << 16; - - switch (insn->opcode.bytes[0]) { - case 0x6c: - case 0x6e: - case 0xe4: - case 0xe6: - case 0xec: - case 0xee: - /* Single byte opcodes */ - *exitinfo |= IOIO_DATA_8; - size = 1; - break; - default: - /* Length determined by instruction parsing */ - *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16 - : IOIO_DATA_32; - size = (insn->opnd_bytes == 2) ? 2 : 4; - } - - switch (insn->addr_bytes) { - case 2: - *exitinfo |= IOIO_ADDR_16; - break; - case 4: - *exitinfo |= IOIO_ADDR_32; - break; - case 8: - *exitinfo |= IOIO_ADDR_64; - break; - } - - if (insn_has_rep_prefix(insn)) - *exitinfo |= IOIO_REP; - - return vc_ioio_check(ctxt, (u16)port, size); -} - -static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - u64 exit_info_1, exit_info_2; - enum es_result ret; - - ret = vc_ioio_exitinfo(ctxt, &exit_info_1); - if (ret != ES_OK) - return ret; - - if (exit_info_1 & IOIO_TYPE_STR) { - - /* (REP) INS/OUTS */ - - bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF); - unsigned int io_bytes, exit_bytes; - unsigned int ghcb_count, op_count; - unsigned long es_base; - u64 sw_scratch; - - /* - * For the string variants with rep prefix the amount of in/out - * operations per #VC exception is limited so that the kernel - * has a chance to take interrupts and re-schedule while the - * instruction is emulated. - */ - io_bytes = (exit_info_1 >> 4) & 0x7; - ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes; - - op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1; - exit_info_2 = min(op_count, ghcb_count); - exit_bytes = exit_info_2 * io_bytes; - - es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); - - /* Read bytes of OUTS into the shared buffer */ - if (!(exit_info_1 & IOIO_TYPE_IN)) { - ret = vc_insn_string_read(ctxt, - (void *)(es_base + regs->si), - ghcb->shared_buffer, io_bytes, - exit_info_2, df); - if (ret) - return ret; - } - - /* - * Issue an VMGEXIT to the HV to consume the bytes from the - * shared buffer or to have it write them into the shared buffer - * depending on the instruction: OUTS or INS. - */ - sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer); - ghcb_set_sw_scratch(ghcb, sw_scratch); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, - exit_info_1, exit_info_2); - if (ret != ES_OK) - return ret; - - /* Read bytes from shared buffer into the guest's destination. */ - if (exit_info_1 & IOIO_TYPE_IN) { - ret = vc_insn_string_write(ctxt, - (void *)(es_base + regs->di), - ghcb->shared_buffer, io_bytes, - exit_info_2, df); - if (ret) - return ret; - - if (df) - regs->di -= exit_bytes; - else - regs->di += exit_bytes; - } else { - if (df) - regs->si -= exit_bytes; - else - regs->si += exit_bytes; - } - - if (exit_info_1 & IOIO_REP) - regs->cx -= exit_info_2; - - ret = regs->cx ? ES_RETRY : ES_OK; - - } else { - - /* IN/OUT into/from rAX */ - - int bits = (exit_info_1 & 0x70) >> 1; - u64 rax = 0; - - if (!(exit_info_1 & IOIO_TYPE_IN)) - rax = lower_bits(regs->ax, bits); - - ghcb_set_rax(ghcb, rax); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0); - if (ret != ES_OK) - return ret; - - if (exit_info_1 & IOIO_TYPE_IN) { - if (!ghcb_rax_is_valid(ghcb)) - return ES_VMM_ERROR; - regs->ax = lower_bits(ghcb->save.rax, bits); - } - } - - return ret; -} - -static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - struct cpuid_leaf leaf; - int ret; - - leaf.fn = regs->ax; - leaf.subfn = regs->cx; - ret = snp_cpuid(ghcb, ctxt, &leaf); - if (!ret) { - regs->ax = leaf.eax; - regs->bx = leaf.ebx; - regs->cx = leaf.ecx; - regs->dx = leaf.edx; - } - - return ret; -} - -static enum es_result vc_handle_cpuid(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - u32 cr4 = native_read_cr4(); - enum es_result ret; - int snp_cpuid_ret; - - snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt); - if (!snp_cpuid_ret) - return ES_OK; - if (snp_cpuid_ret != -EOPNOTSUPP) - return ES_VMM_ERROR; - - ghcb_set_rax(ghcb, regs->ax); - ghcb_set_rcx(ghcb, regs->cx); - - if (cr4 & X86_CR4_OSXSAVE) - /* Safe to read xcr0 */ - ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK)); - else - /* xgetbv will cause #GP - use reset value for xcr0 */ - ghcb_set_xcr0(ghcb, 1); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && - ghcb_rbx_is_valid(ghcb) && - ghcb_rcx_is_valid(ghcb) && - ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - regs->ax = ghcb->save.rax; - regs->bx = ghcb->save.rbx; - regs->cx = ghcb->save.rcx; - regs->dx = ghcb->save.rdx; - - return ES_OK; -} - -static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, - struct es_em_ctxt *ctxt, - unsigned long exit_code) -{ - bool rdtscp = (exit_code == SVM_EXIT_RDTSCP); - enum es_result ret; - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) && - (!rdtscp || ghcb_rcx_is_valid(ghcb)))) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - ctxt->regs->dx = ghcb->save.rdx; - if (rdtscp) - ctxt->regs->cx = ghcb->save.rcx; - - return ES_OK; -} - -struct cc_setup_data { - struct setup_data header; - u32 cc_blob_address; -}; - -/* - * Search for a Confidential Computing blob passed in as a setup_data entry - * via the Linux Boot Protocol. - */ -static __head -struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) -{ - struct cc_setup_data *sd = NULL; - struct setup_data *hdr; - - hdr = (struct setup_data *)bp->hdr.setup_data; - - while (hdr) { - if (hdr->type == SETUP_CC_BLOB) { - sd = (struct cc_setup_data *)hdr; - return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; - } - hdr = (struct setup_data *)hdr->next; - } - - return NULL; -} - -/* - * Initialize the kernel's copy of the SNP CPUID table, and set up the - * pointer that will be used to access it. - * - * Maintaining a direct mapping of the SNP CPUID table used by firmware would - * be possible as an alternative, but the approach is brittle since the - * mapping needs to be updated in sync with all the changes to virtual memory - * layout and related mapping facilities throughout the boot process. - */ -static void __head setup_cpuid_table(const struct cc_blob_sev_info *cc_info) -{ - const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table; - int i; - - if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); - - cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys; - if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID); - - cpuid_table = snp_cpuid_get_table(); - memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table)); - - /* Initialize CPUID ranges for range-checking. */ - for (i = 0; i < cpuid_table->count; i++) { - const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; - - if (fn->eax_in == 0x0) - RIP_REL_REF(cpuid_std_range_max) = fn->eax; - else if (fn->eax_in == 0x40000000) - RIP_REL_REF(cpuid_hyp_range_max) = fn->eax; - else if (fn->eax_in == 0x80000000) - RIP_REL_REF(cpuid_ext_range_max) = fn->eax; - } -} - -static void pvalidate_pages(struct snp_psc_desc *desc) -{ - struct psc_entry *e; - unsigned long vaddr; - unsigned int size; - unsigned int i; - bool validate; - int rc; - - for (i = 0; i <= desc->hdr.end_entry; i++) { - e = &desc->entries[i]; - - vaddr = (unsigned long)pfn_to_kaddr(e->gfn); - size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K; - validate = e->operation == SNP_PAGE_STATE_PRIVATE; - - rc = pvalidate(vaddr, size, validate); - if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) { - unsigned long vaddr_end = vaddr + PMD_SIZE; - - for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) { - rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); - if (rc) - break; - } - } - - if (rc) { - WARN(1, "Failed to validate address 0x%lx ret %d", vaddr, rc); - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); - } - } -} - -static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc) -{ - int cur_entry, end_entry, ret = 0; - struct snp_psc_desc *data; - struct es_em_ctxt ctxt; - - vc_ghcb_invalidate(ghcb); - - /* Copy the input desc into GHCB shared buffer */ - data = (struct snp_psc_desc *)ghcb->shared_buffer; - memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); - - /* - * As per the GHCB specification, the hypervisor can resume the guest - * before processing all the entries. Check whether all the entries - * are processed. If not, then keep retrying. Note, the hypervisor - * will update the data memory directly to indicate the status, so - * reference the data->hdr everywhere. - * - * The strategy here is to wait for the hypervisor to change the page - * state in the RMP table before guest accesses the memory pages. If the - * page state change was not successful, then later memory access will - * result in a crash. - */ - cur_entry = data->hdr.cur_entry; - end_entry = data->hdr.end_entry; - - while (data->hdr.cur_entry <= data->hdr.end_entry) { - ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); - - /* This will advance the shared buffer data points to. */ - ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0); - - /* - * Page State Change VMGEXIT can pass error code through - * exit_info_2. - */ - if (WARN(ret || ghcb->save.sw_exit_info_2, - "SNP: PSC failed ret=%d exit_info_2=%llx\n", - ret, ghcb->save.sw_exit_info_2)) { - ret = 1; - goto out; - } - - /* Verify that reserved bit is not set */ - if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { - ret = 1; - goto out; - } - - /* - * Sanity check that entry processing is not going backwards. - * This will happen only if hypervisor is tricking us. - */ - if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, -"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", - end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { - ret = 1; - goto out; - } - } - -out: - return ret; -} - -static enum es_result vc_check_opcode_bytes(struct es_em_ctxt *ctxt, - unsigned long exit_code) -{ - unsigned int opcode = (unsigned int)ctxt->insn.opcode.value; - u8 modrm = ctxt->insn.modrm.value; - - switch (exit_code) { - - case SVM_EXIT_IOIO: - case SVM_EXIT_NPF: - /* handled separately */ - return ES_OK; - - case SVM_EXIT_CPUID: - if (opcode == 0xa20f) - return ES_OK; - break; - - case SVM_EXIT_INVD: - if (opcode == 0x080f) - return ES_OK; - break; - - case SVM_EXIT_MONITOR: - if (opcode == 0x010f && modrm == 0xc8) - return ES_OK; - break; - - case SVM_EXIT_MWAIT: - if (opcode == 0x010f && modrm == 0xc9) - return ES_OK; - break; - - case SVM_EXIT_MSR: - /* RDMSR */ - if (opcode == 0x320f || - /* WRMSR */ - opcode == 0x300f) - return ES_OK; - break; - - case SVM_EXIT_RDPMC: - if (opcode == 0x330f) - return ES_OK; - break; - - case SVM_EXIT_RDTSC: - if (opcode == 0x310f) - return ES_OK; - break; - - case SVM_EXIT_RDTSCP: - if (opcode == 0x010f && modrm == 0xf9) - return ES_OK; - break; - - case SVM_EXIT_READ_DR7: - if (opcode == 0x210f && - X86_MODRM_REG(ctxt->insn.modrm.value) == 7) - return ES_OK; - break; - - case SVM_EXIT_VMMCALL: - if (opcode == 0x010f && modrm == 0xd9) - return ES_OK; - - break; - - case SVM_EXIT_WRITE_DR7: - if (opcode == 0x230f && - X86_MODRM_REG(ctxt->insn.modrm.value) == 7) - return ES_OK; - break; - - case SVM_EXIT_WBINVD: - if (opcode == 0x90f) - return ES_OK; - break; - - default: - break; - } - - sev_printk(KERN_ERR "Wrong/unhandled opcode bytes: 0x%x, exit_code: 0x%lx, rIP: 0x%lx\n", - opcode, exit_code, ctxt->regs->ip); - - return ES_UNSUPPORTED; -} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c deleted file mode 100644 index b59b09c2f284..000000000000 --- a/arch/x86/kernel/sev.c +++ /dev/null @@ -1,2314 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * AMD Memory Encryption Support - * - * Copyright (C) 2019 SUSE - * - * Author: Joerg Roedel <jroedel@suse.de> - */ - -#define pr_fmt(fmt) "SEV: " fmt - -#include <linux/sched/debug.h> /* For show_regs() */ -#include <linux/percpu-defs.h> -#include <linux/cc_platform.h> -#include <linux/printk.h> -#include <linux/mm_types.h> -#include <linux/set_memory.h> -#include <linux/memblock.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/cpumask.h> -#include <linux/efi.h> -#include <linux/platform_device.h> -#include <linux/io.h> -#include <linux/psp-sev.h> -#include <uapi/linux/sev-guest.h> - -#include <asm/init.h> -#include <asm/cpu_entry_area.h> -#include <asm/stacktrace.h> -#include <asm/sev.h> -#include <asm/insn-eval.h> -#include <asm/fpu/xcr.h> -#include <asm/processor.h> -#include <asm/realmode.h> -#include <asm/setup.h> -#include <asm/traps.h> -#include <asm/svm.h> -#include <asm/smp.h> -#include <asm/cpu.h> -#include <asm/apic.h> -#include <asm/cpuid.h> -#include <asm/cmdline.h> - -#define DR7_RESET_VALUE 0x400 - -/* AP INIT values as documented in the APM2 section "Processor Initialization State" */ -#define AP_INIT_CS_LIMIT 0xffff -#define AP_INIT_DS_LIMIT 0xffff -#define AP_INIT_LDTR_LIMIT 0xffff -#define AP_INIT_GDTR_LIMIT 0xffff -#define AP_INIT_IDTR_LIMIT 0xffff -#define AP_INIT_TR_LIMIT 0xffff -#define AP_INIT_RFLAGS_DEFAULT 0x2 -#define AP_INIT_DR6_DEFAULT 0xffff0ff0 -#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL -#define AP_INIT_XCR0_DEFAULT 0x1 -#define AP_INIT_X87_FTW_DEFAULT 0x5555 -#define AP_INIT_X87_FCW_DEFAULT 0x0040 -#define AP_INIT_CR0_DEFAULT 0x60000010 -#define AP_INIT_MXCSR_DEFAULT 0x1f80 - -static const char * const sev_status_feat_names[] = { - [MSR_AMD64_SEV_ENABLED_BIT] = "SEV", - [MSR_AMD64_SEV_ES_ENABLED_BIT] = "SEV-ES", - [MSR_AMD64_SEV_SNP_ENABLED_BIT] = "SEV-SNP", - [MSR_AMD64_SNP_VTOM_BIT] = "vTom", - [MSR_AMD64_SNP_REFLECT_VC_BIT] = "ReflectVC", - [MSR_AMD64_SNP_RESTRICTED_INJ_BIT] = "RI", - [MSR_AMD64_SNP_ALT_INJ_BIT] = "AI", - [MSR_AMD64_SNP_DEBUG_SWAP_BIT] = "DebugSwap", - [MSR_AMD64_SNP_PREVENT_HOST_IBS_BIT] = "NoHostIBS", - [MSR_AMD64_SNP_BTB_ISOLATION_BIT] = "BTBIsol", - [MSR_AMD64_SNP_VMPL_SSS_BIT] = "VmplSSS", - [MSR_AMD64_SNP_SECURE_TSC_BIT] = "SecureTSC", - [MSR_AMD64_SNP_VMGEXIT_PARAM_BIT] = "VMGExitParam", - [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt", - [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt", - [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", -}; - -/* For early boot hypervisor communication in SEV-ES enabled guests */ -static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); - -/* - * Needs to be in the .data section because we need it NULL before bss is - * cleared - */ -static struct ghcb *boot_ghcb __section(".data"); - -/* Bitmap of SEV features supported by the hypervisor */ -static u64 sev_hv_features __ro_after_init; - -/* #VC handler runtime per-CPU data */ -struct sev_es_runtime_data { - struct ghcb ghcb_page; - - /* - * Reserve one page per CPU as backup storage for the unencrypted GHCB. - * It is needed when an NMI happens while the #VC handler uses the real - * GHCB, and the NMI handler itself is causing another #VC exception. In - * that case the GHCB content of the first handler needs to be backed up - * and restored. - */ - struct ghcb backup_ghcb; - - /* - * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions. - * There is no need for it to be atomic, because nothing is written to - * the GHCB between the read and the write of ghcb_active. So it is safe - * to use it when a nested #VC exception happens before the write. - * - * This is necessary for example in the #VC->NMI->#VC case when the NMI - * happens while the first #VC handler uses the GHCB. When the NMI code - * raises a second #VC handler it might overwrite the contents of the - * GHCB written by the first handler. To avoid this the content of the - * GHCB is saved and restored when the GHCB is detected to be in use - * already. - */ - bool ghcb_active; - bool backup_ghcb_active; - - /* - * Cached DR7 value - write it on DR7 writes and return it on reads. - * That value will never make it to the real hardware DR7 as debugging - * is currently unsupported in SEV-ES guests. - */ - unsigned long dr7; -}; - -struct ghcb_state { - struct ghcb *ghcb; -}; - -static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); -static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); - -struct sev_config { - __u64 debug : 1, - - /* - * A flag used by __set_pages_state() that indicates when the - * per-CPU GHCB has been created and registered and thus can be - * used by the BSP instead of the early boot GHCB. - * - * For APs, the per-CPU GHCB is created before they are started - * and registered upon startup, so this flag can be used globally - * for the BSP and APs. - */ - ghcbs_initialized : 1, - - __reserved : 62; -}; - -static struct sev_config sev_cfg __read_mostly; - -static __always_inline bool on_vc_stack(struct pt_regs *regs) -{ - unsigned long sp = regs->sp; - - /* User-mode RSP is not trusted */ - if (user_mode(regs)) - return false; - - /* SYSCALL gap still has user-mode RSP */ - if (ip_within_syscall_gap(regs)) - return false; - - return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC))); -} - -/* - * This function handles the case when an NMI is raised in the #VC - * exception handler entry code, before the #VC handler has switched off - * its IST stack. In this case, the IST entry for #VC must be adjusted, - * so that any nested #VC exception will not overwrite the stack - * contents of the interrupted #VC handler. - * - * The IST entry is adjusted unconditionally so that it can be also be - * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a - * nested sev_es_ist_exit() call may adjust back the IST entry too - * early. - * - * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run - * on the NMI IST stack, as they are only called from NMI handling code - * right now. - */ -void noinstr __sev_es_ist_enter(struct pt_regs *regs) -{ - unsigned long old_ist, new_ist; - - /* Read old IST entry */ - new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - - /* - * If NMI happened while on the #VC IST stack, set the new IST - * value below regs->sp, so that the interrupted stack frame is - * not overwritten by subsequent #VC exceptions. - */ - if (on_vc_stack(regs)) - new_ist = regs->sp; - - /* - * Reserve additional 8 bytes and store old IST value so this - * adjustment can be unrolled in __sev_es_ist_exit(). - */ - new_ist -= sizeof(old_ist); - *(unsigned long *)new_ist = old_ist; - - /* Set new IST entry */ - this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist); -} - -void noinstr __sev_es_ist_exit(void) -{ - unsigned long ist; - - /* Read IST entry */ - ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]); - - if (WARN_ON(ist == __this_cpu_ist_top_va(VC))) - return; - - /* Read back old IST entry and write it to the TSS */ - this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist); -} - -/* - * Nothing shall interrupt this code path while holding the per-CPU - * GHCB. The backup GHCB is only for NMIs interrupting this path. - * - * Callers must disable local interrupts around it. - */ -static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (unlikely(data->ghcb_active)) { - /* GHCB is already in use - save its contents */ - - if (unlikely(data->backup_ghcb_active)) { - /* - * Backup-GHCB is also already in use. There is no way - * to continue here so just kill the machine. To make - * panic() work, mark GHCBs inactive so that messages - * can be printed out. - */ - data->ghcb_active = false; - data->backup_ghcb_active = false; - - instrumentation_begin(); - panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use"); - instrumentation_end(); - } - - /* Mark backup_ghcb active before writing to it */ - data->backup_ghcb_active = true; - - state->ghcb = &data->backup_ghcb; - - /* Backup GHCB content */ - *state->ghcb = *ghcb; - } else { - state->ghcb = NULL; - data->ghcb_active = true; - } - - return ghcb; -} - -static inline u64 sev_es_rd_ghcb_msr(void) -{ - return __rdmsr(MSR_AMD64_SEV_ES_GHCB); -} - -static __always_inline void sev_es_wr_ghcb_msr(u64 val) -{ - u32 low, high; - - low = (u32)(val); - high = (u32)(val >> 32); - - native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high); -} - -static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt, - unsigned char *buffer) -{ - return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE); -} - -static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int insn_bytes; - - insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer); - if (insn_bytes == 0) { - /* Nothing could be copied */ - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } else if (insn_bytes == -EINVAL) { - /* Effective RIP could not be calculated */ - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - ctxt->fi.cr2 = 0; - return ES_EXCEPTION; - } - - if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes)) - return ES_DECODE_FAILED; - - if (ctxt->insn.immediate.got) - return ES_OK; - else - return ES_DECODE_FAILED; -} - -static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt) -{ - char buffer[MAX_INSN_SIZE]; - int res, ret; - - res = vc_fetch_insn_kernel(ctxt, buffer); - if (res) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = X86_PF_INSTR; - ctxt->fi.cr2 = ctxt->regs->ip; - return ES_EXCEPTION; - } - - ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64); - if (ret < 0) - return ES_DECODE_FAILED; - else - return ES_OK; -} - -static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt) -{ - if (user_mode(ctxt->regs)) - return __vc_decode_user_insn(ctxt); - else - return __vc_decode_kern_insn(ctxt); -} - -static enum es_result vc_write_mem(struct es_em_ctxt *ctxt, - char *dst, char *buf, size_t size) -{ - unsigned long error_code = X86_PF_PROT | X86_PF_WRITE; - - /* - * This function uses __put_user() independent of whether kernel or user - * memory is accessed. This works fine because __put_user() does no - * sanity checks of the pointer being accessed. All that it does is - * to report when the access failed. - * - * Also, this function runs in atomic context, so __put_user() is not - * allowed to sleep. The page-fault handler detects that it is running - * in atomic context and will not try to take mmap_sem and handle the - * fault, so additional pagefault_enable()/disable() calls are not - * needed. - * - * The access can't be done via copy_to_user() here because - * vc_write_mem() must not use string instructions to access unsafe - * memory. The reason is that MOVS is emulated by the #VC handler by - * splitting the move up into a read and a write and taking a nested #VC - * exception on whatever of them is the MMIO access. Using string - * instructions here would cause infinite nesting. - */ - switch (size) { - case 1: { - u8 d1; - u8 __user *target = (u8 __user *)dst; - - memcpy(&d1, buf, 1); - if (__put_user(d1, target)) - goto fault; - break; - } - case 2: { - u16 d2; - u16 __user *target = (u16 __user *)dst; - - memcpy(&d2, buf, 2); - if (__put_user(d2, target)) - goto fault; - break; - } - case 4: { - u32 d4; - u32 __user *target = (u32 __user *)dst; - - memcpy(&d4, buf, 4); - if (__put_user(d4, target)) - goto fault; - break; - } - case 8: { - u64 d8; - u64 __user *target = (u64 __user *)dst; - - memcpy(&d8, buf, 8); - if (__put_user(d8, target)) - goto fault; - break; - } - default: - WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); - return ES_UNSUPPORTED; - } - - return ES_OK; - -fault: - if (user_mode(ctxt->regs)) - error_code |= X86_PF_USER; - - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = error_code; - ctxt->fi.cr2 = (unsigned long)dst; - - return ES_EXCEPTION; -} - -static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, - char *src, char *buf, size_t size) -{ - unsigned long error_code = X86_PF_PROT; - - /* - * This function uses __get_user() independent of whether kernel or user - * memory is accessed. This works fine because __get_user() does no - * sanity checks of the pointer being accessed. All that it does is - * to report when the access failed. - * - * Also, this function runs in atomic context, so __get_user() is not - * allowed to sleep. The page-fault handler detects that it is running - * in atomic context and will not try to take mmap_sem and handle the - * fault, so additional pagefault_enable()/disable() calls are not - * needed. - * - * The access can't be done via copy_from_user() here because - * vc_read_mem() must not use string instructions to access unsafe - * memory. The reason is that MOVS is emulated by the #VC handler by - * splitting the move up into a read and a write and taking a nested #VC - * exception on whatever of them is the MMIO access. Using string - * instructions here would cause infinite nesting. - */ - switch (size) { - case 1: { - u8 d1; - u8 __user *s = (u8 __user *)src; - - if (__get_user(d1, s)) - goto fault; - memcpy(buf, &d1, 1); - break; - } - case 2: { - u16 d2; - u16 __user *s = (u16 __user *)src; - - if (__get_user(d2, s)) - goto fault; - memcpy(buf, &d2, 2); - break; - } - case 4: { - u32 d4; - u32 __user *s = (u32 __user *)src; - - if (__get_user(d4, s)) - goto fault; - memcpy(buf, &d4, 4); - break; - } - case 8: { - u64 d8; - u64 __user *s = (u64 __user *)src; - if (__get_user(d8, s)) - goto fault; - memcpy(buf, &d8, 8); - break; - } - default: - WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size); - return ES_UNSUPPORTED; - } - - return ES_OK; - -fault: - if (user_mode(ctxt->regs)) - error_code |= X86_PF_USER; - - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.error_code = error_code; - ctxt->fi.cr2 = (unsigned long)src; - - return ES_EXCEPTION; -} - -static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned long vaddr, phys_addr_t *paddr) -{ - unsigned long va = (unsigned long)vaddr; - unsigned int level; - phys_addr_t pa; - pgd_t *pgd; - pte_t *pte; - - pgd = __va(read_cr3_pa()); - pgd = &pgd[pgd_index(va)]; - pte = lookup_address_in_pgd(pgd, va, &level); - if (!pte) { - ctxt->fi.vector = X86_TRAP_PF; - ctxt->fi.cr2 = vaddr; - ctxt->fi.error_code = 0; - - if (user_mode(ctxt->regs)) - ctxt->fi.error_code |= X86_PF_USER; - - return ES_EXCEPTION; - } - - if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC)) - /* Emulated MMIO to/from encrypted memory not supported */ - return ES_UNSUPPORTED; - - pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; - pa |= va & ~page_level_mask(level); - - *paddr = pa; - - return ES_OK; -} - -static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size) -{ - BUG_ON(size > 4); - - if (user_mode(ctxt->regs)) { - struct thread_struct *t = ¤t->thread; - struct io_bitmap *iobm = t->io_bitmap; - size_t idx; - - if (!iobm) - goto fault; - - for (idx = port; idx < port + size; ++idx) { - if (test_bit(idx, iobm->bitmap)) - goto fault; - } - } - - return ES_OK; - -fault: - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - - return ES_EXCEPTION; -} - -/* Include code shared with pre-decompression boot stage */ -#include "sev-shared.c" - -static noinstr void __sev_put_ghcb(struct ghcb_state *state) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - WARN_ON(!irqs_disabled()); - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - if (state->ghcb) { - /* Restore GHCB from Backup */ - *ghcb = *state->ghcb; - data->backup_ghcb_active = false; - state->ghcb = NULL; - } else { - /* - * Invalidate the GHCB so a VMGEXIT instruction issued - * from userspace won't appear to be valid. - */ - vc_ghcb_invalidate(ghcb); - data->ghcb_active = false; - } -} - -void noinstr __sev_es_nmi_complete(void) -{ - struct ghcb_state state; - struct ghcb *ghcb; - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa_nodebug(ghcb)); - VMGEXIT(); - - __sev_put_ghcb(&state); -} - -static u64 __init get_secrets_page(void) -{ - u64 pa_data = boot_params.cc_blob_address; - struct cc_blob_sev_info info; - void *map; - - /* - * The CC blob contains the address of the secrets page, check if the - * blob is present. - */ - if (!pa_data) - return 0; - - map = early_memremap(pa_data, sizeof(info)); - if (!map) { - pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); - return 0; - } - memcpy(&info, map, sizeof(info)); - early_memunmap(map, sizeof(info)); - - /* smoke-test the secrets page passed */ - if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) - return 0; - - return info.secrets_phys; -} - -static u64 __init get_snp_jump_table_addr(void) -{ - struct snp_secrets_page_layout *layout; - void __iomem *mem; - u64 pa, addr; - - pa = get_secrets_page(); - if (!pa) - return 0; - - mem = ioremap_encrypted(pa, PAGE_SIZE); - if (!mem) { - pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); - return 0; - } - - layout = (__force struct snp_secrets_page_layout *)mem; - - addr = layout->os_area.ap_jump_table_pa; - iounmap(mem); - - return addr; -} - -static u64 __init get_jump_table_addr(void) -{ - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - u64 ret = 0; - - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return get_snp_jump_table_addr(); - - local_irq_save(flags); - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE); - ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - if (ghcb_sw_exit_info_1_is_valid(ghcb) && - ghcb_sw_exit_info_2_is_valid(ghcb)) - ret = ghcb->save.sw_exit_info_2; - - __sev_put_ghcb(&state); - - local_irq_restore(flags); - - return ret; -} - -static void __head -early_set_pages_state(unsigned long vaddr, unsigned long paddr, - unsigned long npages, enum psc_op op) -{ - unsigned long paddr_end; - u64 val; - int ret; - - vaddr = vaddr & PAGE_MASK; - - paddr = paddr & PAGE_MASK; - paddr_end = paddr + (npages << PAGE_SHIFT); - - while (paddr < paddr_end) { - if (op == SNP_PAGE_STATE_SHARED) { - /* Page validation must be rescinded before changing to shared */ - ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false); - if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) - goto e_term; - } - - /* - * Use the MSR protocol because this function can be called before - * the GHCB is established. - */ - sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); - VMGEXIT(); - - val = sev_es_rd_ghcb_msr(); - - if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, - "Wrong PSC response code: 0x%x\n", - (unsigned int)GHCB_RESP_CODE(val))) - goto e_term; - - if (WARN(GHCB_MSR_PSC_RESP_VAL(val), - "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", - op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared", - paddr, GHCB_MSR_PSC_RESP_VAL(val))) - goto e_term; - - if (op == SNP_PAGE_STATE_PRIVATE) { - /* Page validation must be performed after changing to private */ - ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true); - if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret)) - goto e_term; - } - - vaddr += PAGE_SIZE; - paddr += PAGE_SIZE; - } - - return; - -e_term: - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); -} - -void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, - unsigned long npages) -{ - /* - * This can be invoked in early boot while running identity mapped, so - * use an open coded check for SNP instead of using cc_platform_has(). - * This eliminates worries about jump tables or checking boot_cpu_data - * in the cc_platform_has() function. - */ - if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED)) - return; - - /* - * Ask the hypervisor to mark the memory pages as private in the RMP - * table. - */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE); -} - -void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, - unsigned long npages) -{ - /* - * This can be invoked in early boot while running identity mapped, so - * use an open coded check for SNP instead of using cc_platform_has(). - * This eliminates worries about jump tables or checking boot_cpu_data - * in the cc_platform_has() function. - */ - if (!(RIP_REL_REF(sev_status) & MSR_AMD64_SEV_SNP_ENABLED)) - return; - - /* Ask hypervisor to mark the memory pages shared in the RMP table. */ - early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED); -} - -void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) -{ - unsigned long vaddr, npages; - - vaddr = (unsigned long)__va(paddr); - npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; - - if (op == SNP_PAGE_STATE_PRIVATE) - early_snp_set_memory_private(vaddr, paddr, npages); - else if (op == SNP_PAGE_STATE_SHARED) - early_snp_set_memory_shared(vaddr, paddr, npages); - else - WARN(1, "invalid memory op %d\n", op); -} - -static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, - unsigned long vaddr_end, int op) -{ - struct ghcb_state state; - bool use_large_entry; - struct psc_hdr *hdr; - struct psc_entry *e; - unsigned long flags; - unsigned long pfn; - struct ghcb *ghcb; - int i; - - hdr = &data->hdr; - e = data->entries; - - memset(data, 0, sizeof(*data)); - i = 0; - - while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) { - hdr->end_entry = i; - - if (is_vmalloc_addr((void *)vaddr)) { - pfn = vmalloc_to_pfn((void *)vaddr); - use_large_entry = false; - } else { - pfn = __pa(vaddr) >> PAGE_SHIFT; - use_large_entry = true; - } - - e->gfn = pfn; - e->operation = op; - - if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) && - (vaddr_end - vaddr) >= PMD_SIZE) { - e->pagesize = RMP_PG_SIZE_2M; - vaddr += PMD_SIZE; - } else { - e->pagesize = RMP_PG_SIZE_4K; - vaddr += PAGE_SIZE; - } - - e++; - i++; - } - - /* Page validation must be rescinded before changing to shared */ - if (op == SNP_PAGE_STATE_SHARED) - pvalidate_pages(data); - - local_irq_save(flags); - - if (sev_cfg.ghcbs_initialized) - ghcb = __sev_get_ghcb(&state); - else - ghcb = boot_ghcb; - - /* Invoke the hypervisor to perform the page state changes */ - if (!ghcb || vmgexit_psc(ghcb, data)) - sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); - - if (sev_cfg.ghcbs_initialized) - __sev_put_ghcb(&state); - - local_irq_restore(flags); - - /* Page validation must be performed after changing to private */ - if (op == SNP_PAGE_STATE_PRIVATE) - pvalidate_pages(data); - - return vaddr; -} - -static void set_pages_state(unsigned long vaddr, unsigned long npages, int op) -{ - struct snp_psc_desc desc; - unsigned long vaddr_end; - - /* Use the MSR protocol when a GHCB is not available. */ - if (!boot_ghcb) - return early_set_pages_state(vaddr, __pa(vaddr), npages, op); - - vaddr = vaddr & PAGE_MASK; - vaddr_end = vaddr + (npages << PAGE_SHIFT); - - while (vaddr < vaddr_end) - vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op); -} - -void snp_set_memory_shared(unsigned long vaddr, unsigned long npages) -{ - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return; - - set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); -} - -void snp_set_memory_private(unsigned long vaddr, unsigned long npages) -{ - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return; - - set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); -} - -void snp_accept_memory(phys_addr_t start, phys_addr_t end) -{ - unsigned long vaddr, npages; - - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return; - - vaddr = (unsigned long)__va(start); - npages = (end - start) >> PAGE_SHIFT; - - set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); -} - -static int snp_set_vmsa(void *va, bool vmsa) -{ - u64 attrs; - - /* - * Running at VMPL0 allows the kernel to change the VMSA bit for a page - * using the RMPADJUST instruction. However, for the instruction to - * succeed it must target the permissions of a lesser privileged - * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST - * instruction in the AMD64 APM Volume 3). - */ - attrs = 1; - if (vmsa) - attrs |= RMPADJUST_VMSA_PAGE_BIT; - - return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); -} - -#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) -#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) -#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) - -#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) -#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) - -static void *snp_alloc_vmsa_page(void) -{ - struct page *p; - - /* - * Allocate VMSA page to work around the SNP erratum where the CPU will - * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB) - * collides with the RMP entry of VMSA page. The recommended workaround - * is to not use a large page. - * - * Allocate an 8k page which is also 8k-aligned. - */ - p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); - if (!p) - return NULL; - - split_page(p, 1); - - /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */ - __free_page(p); - - return page_address(p + 1); -} - -static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) -{ - int err; - - err = snp_set_vmsa(vmsa, false); - if (err) - pr_err("clear VMSA page failed (%u), leaking page\n", err); - else - free_page((unsigned long)vmsa); -} - -static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip) -{ - struct sev_es_save_area *cur_vmsa, *vmsa; - struct ghcb_state state; - unsigned long flags; - struct ghcb *ghcb; - u8 sipi_vector; - int cpu, ret; - u64 cr4; - - /* - * The hypervisor SNP feature support check has happened earlier, just check - * the AP_CREATION one here. - */ - if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION)) - return -EOPNOTSUPP; - - /* - * Verify the desired start IP against the known trampoline start IP - * to catch any future new trampolines that may be introduced that - * would require a new protected guest entry point. - */ - if (WARN_ONCE(start_ip != real_mode_header->trampoline_start, - "Unsupported SNP start_ip: %lx\n", start_ip)) - return -EINVAL; - - /* Override start_ip with known protected guest start IP */ - start_ip = real_mode_header->sev_es_trampoline_start; - - /* Find the logical CPU for the APIC ID */ - for_each_present_cpu(cpu) { - if (arch_match_cpu_phys_id(cpu, apic_id)) - break; - } - if (cpu >= nr_cpu_ids) - return -EINVAL; - - cur_vmsa = per_cpu(sev_vmsa, cpu); - - /* - * A new VMSA is created each time because there is no guarantee that - * the current VMSA is the kernels or that the vCPU is not running. If - * an attempt was done to use the current VMSA with a running vCPU, a - * #VMEXIT of that vCPU would wipe out all of the settings being done - * here. - */ - vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); - if (!vmsa) - return -ENOMEM; - - /* CR4 should maintain the MCE value */ - cr4 = native_read_cr4() & X86_CR4_MCE; - - /* Set the CS value based on the start_ip converted to a SIPI vector */ - sipi_vector = (start_ip >> 12); - vmsa->cs.base = sipi_vector << 12; - vmsa->cs.limit = AP_INIT_CS_LIMIT; - vmsa->cs.attrib = INIT_CS_ATTRIBS; - vmsa->cs.selector = sipi_vector << 8; - - /* Set the RIP value based on start_ip */ - vmsa->rip = start_ip & 0xfff; - - /* Set AP INIT defaults as documented in the APM */ - vmsa->ds.limit = AP_INIT_DS_LIMIT; - vmsa->ds.attrib = INIT_DS_ATTRIBS; - vmsa->es = vmsa->ds; - vmsa->fs = vmsa->ds; - vmsa->gs = vmsa->ds; - vmsa->ss = vmsa->ds; - - vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; - vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; - vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; - vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; - vmsa->tr.limit = AP_INIT_TR_LIMIT; - vmsa->tr.attrib = INIT_TR_ATTRIBS; - - vmsa->cr4 = cr4; - vmsa->cr0 = AP_INIT_CR0_DEFAULT; - vmsa->dr7 = DR7_RESET_VALUE; - vmsa->dr6 = AP_INIT_DR6_DEFAULT; - vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; - vmsa->g_pat = AP_INIT_GPAT_DEFAULT; - vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; - vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; - vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; - vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; - - /* SVME must be set. */ - vmsa->efer = EFER_SVME; - - /* - * Set the SNP-specific fields for this VMSA: - * VMPL level - * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) - */ - vmsa->vmpl = 0; - vmsa->sev_features = sev_status >> 2; - - /* Switch the page over to a VMSA page now that it is initialized */ - ret = snp_set_vmsa(vmsa, true); - if (ret) { - pr_err("set VMSA page failed (%u)\n", ret); - free_page((unsigned long)vmsa); - - return -EINVAL; - } - - /* Issue VMGEXIT AP Creation NAE event */ - local_irq_save(flags); - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - ghcb_set_rax(ghcb, vmsa->sev_features); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); - ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE); - ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - if (!ghcb_sw_exit_info_1_is_valid(ghcb) || - lower_32_bits(ghcb->save.sw_exit_info_1)) { - pr_err("SNP AP Creation error\n"); - ret = -EINVAL; - } - - __sev_put_ghcb(&state); - - local_irq_restore(flags); - - /* Perform cleanup if there was an error */ - if (ret) { - snp_cleanup_vmsa(vmsa); - vmsa = NULL; - } - - /* Free up any previous VMSA page */ - if (cur_vmsa) - snp_cleanup_vmsa(cur_vmsa); - - /* Record the current VMSA page */ - per_cpu(sev_vmsa, cpu) = vmsa; - - return ret; -} - -void __init snp_set_wakeup_secondary_cpu(void) -{ - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return; - - /* - * Always set this override if SNP is enabled. This makes it the - * required method to start APs under SNP. If the hypervisor does - * not support AP creation, then no APs will be started. - */ - apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit); -} - -int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh) -{ - u16 startup_cs, startup_ip; - phys_addr_t jump_table_pa; - u64 jump_table_addr; - u16 __iomem *jump_table; - - jump_table_addr = get_jump_table_addr(); - - /* On UP guests there is no jump table so this is not a failure */ - if (!jump_table_addr) - return 0; - - /* Check if AP Jump Table is page-aligned */ - if (jump_table_addr & ~PAGE_MASK) - return -EINVAL; - - jump_table_pa = jump_table_addr & PAGE_MASK; - - startup_cs = (u16)(rmh->trampoline_start >> 4); - startup_ip = (u16)(rmh->sev_es_trampoline_start - - rmh->trampoline_start); - - jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE); - if (!jump_table) - return -EIO; - - writew(startup_ip, &jump_table[0]); - writew(startup_cs, &jump_table[1]); - - iounmap(jump_table); - - return 0; -} - -/* - * This is needed by the OVMF UEFI firmware which will use whatever it finds in - * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu - * runtime GHCBs used by the kernel are also mapped in the EFI page-table. - */ -int __init sev_es_efi_map_ghcbs(pgd_t *pgd) -{ - struct sev_es_runtime_data *data; - unsigned long address, pflags; - int cpu; - u64 pfn; - - if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) - return 0; - - pflags = _PAGE_NX | _PAGE_RW; - - for_each_possible_cpu(cpu) { - data = per_cpu(runtime_data, cpu); - - address = __pa(&data->ghcb_page); - pfn = address >> PAGE_SHIFT; - - if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags)) - return 1; - } - - return 0; -} - -static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct pt_regs *regs = ctxt->regs; - enum es_result ret; - u64 exit_info_1; - - /* Is it a WRMSR? */ - exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0; - - ghcb_set_rcx(ghcb, regs->cx); - if (exit_info_1) { - ghcb_set_rax(ghcb, regs->ax); - ghcb_set_rdx(ghcb, regs->dx); - } - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0); - - if ((ret == ES_OK) && (!exit_info_1)) { - regs->ax = ghcb->save.rax; - regs->dx = ghcb->save.rdx; - } - - return ret; -} - -static void snp_register_per_cpu_ghcb(void) -{ - struct sev_es_runtime_data *data; - struct ghcb *ghcb; - - data = this_cpu_read(runtime_data); - ghcb = &data->ghcb_page; - - snp_register_ghcb_early(__pa(ghcb)); -} - -void setup_ghcb(void) -{ - if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) - return; - - /* - * Check whether the runtime #VC exception handler is active. It uses - * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). - * - * If SNP is active, register the per-CPU GHCB page so that the runtime - * exception handler can use it. - */ - if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - snp_register_per_cpu_ghcb(); - - sev_cfg.ghcbs_initialized = true; - - return; - } - - /* - * Make sure the hypervisor talks a supported protocol. - * This gets called only in the BSP boot phase. - */ - if (!sev_es_negotiate_protocol()) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - - /* - * Clear the boot_ghcb. The first exception comes in before the bss - * section is cleared. - */ - memset(&boot_ghcb_page, 0, PAGE_SIZE); - - /* Alright - Make the boot-ghcb public */ - boot_ghcb = &boot_ghcb_page; - - /* SNP guest requires that GHCB GPA must be registered. */ - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - snp_register_ghcb_early(__pa(&boot_ghcb_page)); -} - -#ifdef CONFIG_HOTPLUG_CPU -static void sev_es_ap_hlt_loop(void) -{ - struct ghcb_state state; - struct ghcb *ghcb; - - ghcb = __sev_get_ghcb(&state); - - while (true) { - vc_ghcb_invalidate(ghcb); - ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP); - ghcb_set_sw_exit_info_1(ghcb, 0); - ghcb_set_sw_exit_info_2(ghcb, 0); - - sev_es_wr_ghcb_msr(__pa(ghcb)); - VMGEXIT(); - - /* Wakeup signal? */ - if (ghcb_sw_exit_info_2_is_valid(ghcb) && - ghcb->save.sw_exit_info_2) - break; - } - - __sev_put_ghcb(&state); -} - -/* - * Play_dead handler when running under SEV-ES. This is needed because - * the hypervisor can't deliver an SIPI request to restart the AP. - * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the - * hypervisor wakes it up again. - */ -static void sev_es_play_dead(void) -{ - play_dead_common(); - - /* IRQs now disabled */ - - sev_es_ap_hlt_loop(); - - /* - * If we get here, the VCPU was woken up again. Jump to CPU - * startup code to get it back online. - */ - soft_restart_cpu(); -} -#else /* CONFIG_HOTPLUG_CPU */ -#define sev_es_play_dead native_play_dead -#endif /* CONFIG_HOTPLUG_CPU */ - -#ifdef CONFIG_SMP -static void __init sev_es_setup_play_dead(void) -{ - smp_ops.play_dead = sev_es_play_dead; -} -#else -static inline void sev_es_setup_play_dead(void) { } -#endif - -static void __init alloc_runtime_data(int cpu) -{ - struct sev_es_runtime_data *data; - - data = memblock_alloc(sizeof(*data), PAGE_SIZE); - if (!data) - panic("Can't allocate SEV-ES runtime data"); - - per_cpu(runtime_data, cpu) = data; -} - -static void __init init_ghcb(int cpu) -{ - struct sev_es_runtime_data *data; - int err; - - data = per_cpu(runtime_data, cpu); - - err = early_set_memory_decrypted((unsigned long)&data->ghcb_page, - sizeof(data->ghcb_page)); - if (err) - panic("Can't map GHCBs unencrypted"); - - memset(&data->ghcb_page, 0, sizeof(data->ghcb_page)); - - data->ghcb_active = false; - data->backup_ghcb_active = false; -} - -void __init sev_es_init_vc_handling(void) -{ - int cpu; - - BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE); - - if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) - return; - - if (!sev_es_check_cpu_features()) - panic("SEV-ES CPU Features missing"); - - /* - * SNP is supported in v2 of the GHCB spec which mandates support for HV - * features. - */ - if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { - sev_hv_features = get_hv_features(); - - if (!(sev_hv_features & GHCB_HV_FT_SNP)) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); - } - - /* Initialize per-cpu GHCB pages */ - for_each_possible_cpu(cpu) { - alloc_runtime_data(cpu); - init_ghcb(cpu); - } - - sev_es_setup_play_dead(); - - /* Secondary CPUs use the runtime #VC handler */ - initial_vc_handler = (unsigned long)kernel_exc_vmm_communication; -} - -static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt) -{ - int trapnr = ctxt->fi.vector; - - if (trapnr == X86_TRAP_PF) - native_write_cr2(ctxt->fi.cr2); - - ctxt->regs->orig_ax = ctxt->fi.error_code; - do_early_exception(ctxt->regs, trapnr); -} - -static long *vc_insn_get_rm(struct es_em_ctxt *ctxt) -{ - long *reg_array; - int offset; - - reg_array = (long *)ctxt->regs; - offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs); - - if (offset < 0) - return NULL; - - offset /= sizeof(long); - - return reg_array + offset; -} -static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt, - unsigned int bytes, bool read) -{ - u64 exit_code, exit_info_1, exit_info_2; - unsigned long ghcb_pa = __pa(ghcb); - enum es_result res; - phys_addr_t paddr; - void __user *ref; - - ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs); - if (ref == (void __user *)-1L) - return ES_UNSUPPORTED; - - exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE; - - res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr); - if (res != ES_OK) { - if (res == ES_EXCEPTION && !read) - ctxt->fi.error_code |= X86_PF_WRITE; - - return res; - } - - exit_info_1 = paddr; - /* Can never be greater than 8 */ - exit_info_2 = bytes; - - ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer)); - - return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2); -} - -/* - * The MOVS instruction has two memory operands, which raises the - * problem that it is not known whether the access to the source or the - * destination caused the #VC exception (and hence whether an MMIO read - * or write operation needs to be emulated). - * - * Instead of playing games with walking page-tables and trying to guess - * whether the source or destination is an MMIO range, split the move - * into two operations, a read and a write with only one memory operand. - * This will cause a nested #VC exception on the MMIO address which can - * then be handled. - * - * This implementation has the benefit that it also supports MOVS where - * source _and_ destination are MMIO regions. - * - * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a - * rare operation. If it turns out to be a performance problem the split - * operations can be moved to memcpy_fromio() and memcpy_toio(). - */ -static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt, - unsigned int bytes) -{ - unsigned long ds_base, es_base; - unsigned char *src, *dst; - unsigned char buffer[8]; - enum es_result ret; - bool rep; - int off; - - ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS); - es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES); - - if (ds_base == -1L || es_base == -1L) { - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; - } - - src = ds_base + (unsigned char *)ctxt->regs->si; - dst = es_base + (unsigned char *)ctxt->regs->di; - - ret = vc_read_mem(ctxt, src, buffer, bytes); - if (ret != ES_OK) - return ret; - - ret = vc_write_mem(ctxt, dst, buffer, bytes); - if (ret != ES_OK) - return ret; - - if (ctxt->regs->flags & X86_EFLAGS_DF) - off = -bytes; - else - off = bytes; - - ctxt->regs->si += off; - ctxt->regs->di += off; - - rep = insn_has_rep_prefix(&ctxt->insn); - if (rep) - ctxt->regs->cx -= 1; - - if (!rep || ctxt->regs->cx == 0) - return ES_OK; - else - return ES_RETRY; -} - -static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - struct insn *insn = &ctxt->insn; - enum insn_mmio_type mmio; - unsigned int bytes = 0; - enum es_result ret; - u8 sign_byte; - long *reg_data; - - mmio = insn_decode_mmio(insn, &bytes); - if (mmio == INSN_MMIO_DECODE_FAILED) - return ES_DECODE_FAILED; - - if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) { - reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs); - if (!reg_data) - return ES_DECODE_FAILED; - } - - if (user_mode(ctxt->regs)) - return ES_UNSUPPORTED; - - switch (mmio) { - case INSN_MMIO_WRITE: - memcpy(ghcb->shared_buffer, reg_data, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); - break; - case INSN_MMIO_WRITE_IMM: - memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes); - ret = vc_do_mmio(ghcb, ctxt, bytes, false); - break; - case INSN_MMIO_READ: - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero-extend for 32-bit operation */ - if (bytes == 4) - *reg_data = 0; - - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - case INSN_MMIO_READ_ZERO_EXTEND: - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - /* Zero extend based on operand size */ - memset(reg_data, 0, insn->opnd_bytes); - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - case INSN_MMIO_READ_SIGN_EXTEND: - ret = vc_do_mmio(ghcb, ctxt, bytes, true); - if (ret) - break; - - if (bytes == 1) { - u8 *val = (u8 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x80) ? 0xff : 0x00; - } else { - u16 *val = (u16 *)ghcb->shared_buffer; - - sign_byte = (*val & 0x8000) ? 0xff : 0x00; - } - - /* Sign extend based on operand size */ - memset(reg_data, sign_byte, insn->opnd_bytes); - memcpy(reg_data, ghcb->shared_buffer, bytes); - break; - case INSN_MMIO_MOVS: - ret = vc_handle_mmio_movs(ctxt, bytes); - break; - default: - ret = ES_UNSUPPORTED; - break; - } - - return ret; -} - -static enum es_result vc_handle_dr7_write(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - long val, *reg = vc_insn_get_rm(ctxt); - enum es_result ret; - - if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) - return ES_VMM_ERROR; - - if (!reg) - return ES_DECODE_FAILED; - - val = *reg; - - /* Upper 32 bits must be written as zeroes */ - if (val >> 32) { - ctxt->fi.vector = X86_TRAP_GP; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; - } - - /* Clear out other reserved bits and set bit 10 */ - val = (val & 0xffff23ffL) | BIT(10); - - /* Early non-zero writes to DR7 are not supported */ - if (!data && (val & ~DR7_RESET_VALUE)) - return ES_UNSUPPORTED; - - /* Using a value of 0 for ExitInfo1 means RAX holds the value */ - ghcb_set_rax(ghcb, val); - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0); - if (ret != ES_OK) - return ret; - - if (data) - data->dr7 = val; - - return ES_OK; -} - -static enum es_result vc_handle_dr7_read(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - struct sev_es_runtime_data *data = this_cpu_read(runtime_data); - long *reg = vc_insn_get_rm(ctxt); - - if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP) - return ES_VMM_ERROR; - - if (!reg) - return ES_DECODE_FAILED; - - if (data) - *reg = data->dr7; - else - *reg = DR7_RESET_VALUE; - - return ES_OK; -} - -static enum es_result vc_handle_wbinvd(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0); -} - -static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt) -{ - enum es_result ret; - - ghcb_set_rcx(ghcb, ctxt->regs->cx); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0); - if (ret != ES_OK) - return ret; - - if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb))) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - ctxt->regs->dx = ghcb->save.rdx; - - return ES_OK; -} - -static enum es_result vc_handle_monitor(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* - * Treat it as a NOP and do not leak a physical address to the - * hypervisor. - */ - return ES_OK; -} - -static enum es_result vc_handle_mwait(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* Treat the same as MONITOR/MONITORX */ - return ES_OK; -} - -static enum es_result vc_handle_vmmcall(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - enum es_result ret; - - ghcb_set_rax(ghcb, ctxt->regs->ax); - ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0); - - if (x86_platform.hyper.sev_es_hcall_prepare) - x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs); - - ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0); - if (ret != ES_OK) - return ret; - - if (!ghcb_rax_is_valid(ghcb)) - return ES_VMM_ERROR; - - ctxt->regs->ax = ghcb->save.rax; - - /* - * Call sev_es_hcall_finish() after regs->ax is already set. - * This allows the hypervisor handler to overwrite it again if - * necessary. - */ - if (x86_platform.hyper.sev_es_hcall_finish && - !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs)) - return ES_VMM_ERROR; - - return ES_OK; -} - -static enum es_result vc_handle_trap_ac(struct ghcb *ghcb, - struct es_em_ctxt *ctxt) -{ - /* - * Calling ecx_alignment_check() directly does not work, because it - * enables IRQs and the GHCB is active. Forward the exception and call - * it later from vc_forward_exception(). - */ - ctxt->fi.vector = X86_TRAP_AC; - ctxt->fi.error_code = 0; - return ES_EXCEPTION; -} - -static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt, - struct ghcb *ghcb, - unsigned long exit_code) -{ - enum es_result result = vc_check_opcode_bytes(ctxt, exit_code); - - if (result != ES_OK) - return result; - - switch (exit_code) { - case SVM_EXIT_READ_DR7: - result = vc_handle_dr7_read(ghcb, ctxt); - break; - case SVM_EXIT_WRITE_DR7: - result = vc_handle_dr7_write(ghcb, ctxt); - break; - case SVM_EXIT_EXCP_BASE + X86_TRAP_AC: - result = vc_handle_trap_ac(ghcb, ctxt); - break; - case SVM_EXIT_RDTSC: - case SVM_EXIT_RDTSCP: - result = vc_handle_rdtsc(ghcb, ctxt, exit_code); - break; - case SVM_EXIT_RDPMC: - result = vc_handle_rdpmc(ghcb, ctxt); - break; - case SVM_EXIT_INVD: - pr_err_ratelimited("#VC exception for INVD??? Seriously???\n"); - result = ES_UNSUPPORTED; - break; - case SVM_EXIT_CPUID: - result = vc_handle_cpuid(ghcb, ctxt); - break; - case SVM_EXIT_IOIO: - result = vc_handle_ioio(ghcb, ctxt); - break; - case SVM_EXIT_MSR: - result = vc_handle_msr(ghcb, ctxt); - break; - case SVM_EXIT_VMMCALL: - result = vc_handle_vmmcall(ghcb, ctxt); - break; - case SVM_EXIT_WBINVD: - result = vc_handle_wbinvd(ghcb, ctxt); - break; - case SVM_EXIT_MONITOR: - result = vc_handle_monitor(ghcb, ctxt); - break; - case SVM_EXIT_MWAIT: - result = vc_handle_mwait(ghcb, ctxt); - break; - case SVM_EXIT_NPF: - result = vc_handle_mmio(ghcb, ctxt); - break; - default: - /* - * Unexpected #VC exception - */ - result = ES_UNSUPPORTED; - } - - return result; -} - -static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt) -{ - long error_code = ctxt->fi.error_code; - int trapnr = ctxt->fi.vector; - - ctxt->regs->orig_ax = ctxt->fi.error_code; - - switch (trapnr) { - case X86_TRAP_GP: - exc_general_protection(ctxt->regs, error_code); - break; - case X86_TRAP_UD: - exc_invalid_op(ctxt->regs); - break; - case X86_TRAP_PF: - write_cr2(ctxt->fi.cr2); - exc_page_fault(ctxt->regs, error_code); - break; - case X86_TRAP_AC: - exc_alignment_check(ctxt->regs, error_code); - break; - default: - pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n"); - BUG(); - } -} - -static __always_inline bool is_vc2_stack(unsigned long sp) -{ - return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2)); -} - -static __always_inline bool vc_from_invalid_context(struct pt_regs *regs) -{ - unsigned long sp, prev_sp; - - sp = (unsigned long)regs; - prev_sp = regs->sp; - - /* - * If the code was already executing on the VC2 stack when the #VC - * happened, let it proceed to the normal handling routine. This way the - * code executing on the VC2 stack can cause #VC exceptions to get handled. - */ - return is_vc2_stack(sp) && !is_vc2_stack(prev_sp); -} - -static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code) -{ - struct ghcb_state state; - struct es_em_ctxt ctxt; - enum es_result result; - struct ghcb *ghcb; - bool ret = true; - - ghcb = __sev_get_ghcb(&state); - - vc_ghcb_invalidate(ghcb); - result = vc_init_em_ctxt(&ctxt, regs, error_code); - - if (result == ES_OK) - result = vc_handle_exitcode(&ctxt, ghcb, error_code); - - __sev_put_ghcb(&state); - - /* Done - now check the result */ - switch (result) { - case ES_OK: - vc_finish_insn(&ctxt); - break; - case ES_UNSUPPORTED: - pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n", - error_code, regs->ip); - ret = false; - break; - case ES_VMM_ERROR: - pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", - error_code, regs->ip); - ret = false; - break; - case ES_DECODE_FAILED: - pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", - error_code, regs->ip); - ret = false; - break; - case ES_EXCEPTION: - vc_forward_exception(&ctxt); - break; - case ES_RETRY: - /* Nothing to do */ - break; - default: - pr_emerg("Unknown result in %s():%d\n", __func__, result); - /* - * Emulating the instruction which caused the #VC exception - * failed - can't continue so print debug information - */ - BUG(); - } - - return ret; -} - -static __always_inline bool vc_is_db(unsigned long error_code) -{ - return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB; -} - -/* - * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode - * and will panic when an error happens. - */ -DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) -{ - irqentry_state_t irq_state; - - /* - * With the current implementation it is always possible to switch to a - * safe stack because #VC exceptions only happen at known places, like - * intercepted instructions or accesses to MMIO areas/IO ports. They can - * also happen with code instrumentation when the hypervisor intercepts - * #DB, but the critical paths are forbidden to be instrumented, so #DB - * exceptions currently also only happen in safe places. - * - * But keep this here in case the noinstr annotations are violated due - * to bug elsewhere. - */ - if (unlikely(vc_from_invalid_context(regs))) { - instrumentation_begin(); - panic("Can't handle #VC exception from unsupported context\n"); - instrumentation_end(); - } - - /* - * Handle #DB before calling into !noinstr code to avoid recursive #DB. - */ - if (vc_is_db(error_code)) { - exc_debug(regs); - return; - } - - irq_state = irqentry_nmi_enter(regs); - - instrumentation_begin(); - - if (!vc_raw_handle_exception(regs, error_code)) { - /* Show some debug info */ - show_regs(regs); - - /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - - /* If that fails and we get here - just panic */ - panic("Returned from Terminate-Request to Hypervisor\n"); - } - - instrumentation_end(); - irqentry_nmi_exit(regs, irq_state); -} - -/* - * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode - * and will kill the current task with SIGBUS when an error happens. - */ -DEFINE_IDTENTRY_VC_USER(exc_vmm_communication) -{ - /* - * Handle #DB before calling into !noinstr code to avoid recursive #DB. - */ - if (vc_is_db(error_code)) { - noist_exc_debug(regs); - return; - } - - irqentry_enter_from_user_mode(regs); - instrumentation_begin(); - - if (!vc_raw_handle_exception(regs, error_code)) { - /* - * Do not kill the machine if user-space triggered the - * exception. Send SIGBUS instead and let user-space deal with - * it. - */ - force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0); - } - - instrumentation_end(); - irqentry_exit_to_user_mode(regs); -} - -bool __init handle_vc_boot_ghcb(struct pt_regs *regs) -{ - unsigned long exit_code = regs->orig_ax; - struct es_em_ctxt ctxt; - enum es_result result; - - vc_ghcb_invalidate(boot_ghcb); - - result = vc_init_em_ctxt(&ctxt, regs, exit_code); - if (result == ES_OK) - result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code); - - /* Done - now check the result */ - switch (result) { - case ES_OK: - vc_finish_insn(&ctxt); - break; - case ES_UNSUPPORTED: - early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_VMM_ERROR: - early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_DECODE_FAILED: - early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n", - exit_code, regs->ip); - goto fail; - case ES_EXCEPTION: - vc_early_forward_exception(&ctxt); - break; - case ES_RETRY: - /* Nothing to do */ - break; - default: - BUG(); - } - - return true; - -fail: - show_regs(regs); - - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); -} - -/* - * Initial set up of SNP relies on information provided by the - * Confidential Computing blob, which can be passed to the kernel - * in the following ways, depending on how it is booted: - * - * - when booted via the boot/decompress kernel: - * - via boot_params - * - * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): - * - via a setup_data entry, as defined by the Linux Boot Protocol - * - * Scan for the blob in that order. - */ -static __head struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) -{ - struct cc_blob_sev_info *cc_info; - - /* Boot kernel would have passed the CC blob via boot_params. */ - if (bp->cc_blob_address) { - cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; - goto found_cc_info; - } - - /* - * If kernel was booted directly, without the use of the - * boot/decompression kernel, the CC blob may have been passed via - * setup_data instead. - */ - cc_info = find_cc_blob_setup_data(bp); - if (!cc_info) - return NULL; - -found_cc_info: - if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) - snp_abort(); - - return cc_info; -} - -bool __head snp_init(struct boot_params *bp) -{ - struct cc_blob_sev_info *cc_info; - - if (!bp) - return false; - - cc_info = find_cc_blob(bp); - if (!cc_info) - return false; - - setup_cpuid_table(cc_info); - - /* - * The CC blob will be used later to access the secrets page. Cache - * it here like the boot kernel does. - */ - bp->cc_blob_address = (u32)(unsigned long)cc_info; - - return true; -} - -void __head __noreturn snp_abort(void) -{ - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); -} - -static void dump_cpuid_table(void) -{ - const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); - int i = 0; - - pr_info("count=%d reserved=0x%x reserved2=0x%llx\n", - cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2); - - for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) { - const struct snp_cpuid_fn *fn = &cpuid_table->fn[i]; - - pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n", - i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx, - fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved); - } -} - -/* - * It is useful from an auditing/testing perspective to provide an easy way - * for the guest owner to know that the CPUID table has been initialized as - * expected, but that initialization happens too early in boot to print any - * sort of indicator, and there's not really any other good place to do it, - * so do it here. - */ -static int __init report_cpuid_table(void) -{ - const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); - - if (!cpuid_table->count) - return 0; - - pr_info("Using SNP CPUID table, %d entries present.\n", - cpuid_table->count); - - if (sev_cfg.debug) - dump_cpuid_table(); - - return 0; -} -arch_initcall(report_cpuid_table); - -static int __init init_sev_config(char *str) -{ - char *s; - - while ((s = strsep(&str, ","))) { - if (!strcmp(s, "debug")) { - sev_cfg.debug = true; - continue; - } - - pr_info("SEV command-line option '%s' was not recognized\n", s); - } - - return 1; -} -__setup("sev=", init_sev_config); - -int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio) -{ - struct ghcb_state state; - struct es_em_ctxt ctxt; - unsigned long flags; - struct ghcb *ghcb; - int ret; - - rio->exitinfo2 = SEV_RET_NO_FW_CALL; - - /* - * __sev_get_ghcb() needs to run with IRQs disabled because it is using - * a per-CPU GHCB. - */ - local_irq_save(flags); - - ghcb = __sev_get_ghcb(&state); - if (!ghcb) { - ret = -EIO; - goto e_restore_irq; - } - - vc_ghcb_invalidate(ghcb); - - if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { - ghcb_set_rax(ghcb, input->data_gpa); - ghcb_set_rbx(ghcb, input->data_npages); - } - - ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa); - if (ret) - goto e_put; - - rio->exitinfo2 = ghcb->save.sw_exit_info_2; - switch (rio->exitinfo2) { - case 0: - break; - - case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY): - ret = -EAGAIN; - break; - - case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN): - /* Number of expected pages are returned in RBX */ - if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { - input->data_npages = ghcb_get_rbx(ghcb); - ret = -ENOSPC; - break; - } - fallthrough; - default: - ret = -EIO; - break; - } - -e_put: - __sev_put_ghcb(&state); -e_restore_irq: - local_irq_restore(flags); - - return ret; -} -EXPORT_SYMBOL_GPL(snp_issue_guest_request); - -static struct platform_device sev_guest_device = { - .name = "sev-guest", - .id = -1, -}; - -static int __init snp_init_platform_device(void) -{ - struct sev_guest_platform_data data; - u64 gpa; - - if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) - return -ENODEV; - - gpa = get_secrets_page(); - if (!gpa) - return -ENODEV; - - data.secrets_gpa = gpa; - if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) - return -ENODEV; - - if (platform_device_register(&sev_guest_device)) - return -ENODEV; - - pr_info("SNP guest platform device initialized.\n"); - return 0; -} -device_initcall(snp_init_platform_device); - -void kdump_sev_callback(void) -{ - /* - * Do wbinvd() on remote CPUs when SNP is enabled in order to - * safely do SNP_SHUTDOWN on the local CPU. - */ - if (cpu_feature_enabled(X86_FEATURE_SEV_SNP)) - wbinvd(); -} - -void sev_show_status(void) -{ - int i; - - pr_info("Status: "); - for (i = 0; i < MSR_AMD64_SNP_RESV_BIT; i++) { - if (sev_status & BIT_ULL(i)) { - if (!sev_status_feat_names[i]) - continue; - - pr_cont("%s ", sev_status_feat_names[i]); - } - } - pr_cont("\n"); -} diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 59e15dd8d0f8..2ddf23387c7e 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -163,8 +163,8 @@ static int shstk_setup(void) if (features_enabled(ARCH_SHSTK_SHSTK)) return 0; - /* Also not supported for 32 bit and x32 */ - if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall()) + /* Also not supported for 32 bit */ + if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall()) return -EOPNOTSUPP; size = adjust_shstk_size(0); @@ -173,8 +173,8 @@ static int shstk_setup(void) return PTR_ERR((void *)addr); fpregs_lock_and_load(); - wrmsrl(MSR_IA32_PL3_SSP, addr + size); - wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN); + wrmsrq(MSR_IA32_PL3_SSP, addr + size); + wrmsrq(MSR_IA32_U_CET, CET_SHSTK_EN); fpregs_unlock(); shstk->base = addr; @@ -239,7 +239,7 @@ static unsigned long get_user_shstk_addr(void) fpregs_lock_and_load(); - rdmsrl(MSR_IA32_PL3_SSP, ssp); + rdmsrq(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); @@ -372,7 +372,7 @@ int setup_signal_shadow_stack(struct ksignal *ksig) return -EFAULT; fpregs_lock_and_load(); - wrmsrl(MSR_IA32_PL3_SSP, ssp); + wrmsrq(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return 0; @@ -396,7 +396,7 @@ int restore_signal_shadow_stack(void) return err; fpregs_lock_and_load(); - wrmsrl(MSR_IA32_PL3_SSP, ssp); + wrmsrq(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return 0; @@ -460,7 +460,7 @@ static int wrss_control(bool enable) return 0; fpregs_lock_and_load(); - rdmsrl(MSR_IA32_U_CET, msrval); + rdmsrq(MSR_IA32_U_CET, msrval); if (enable) { features_set(ARCH_SHSTK_WRSS); @@ -473,7 +473,7 @@ static int wrss_control(bool enable) msrval &= ~CET_WRSS_EN; } - wrmsrl(MSR_IA32_U_CET, msrval); + wrmsrq(MSR_IA32_U_CET, msrval); unlock: fpregs_unlock(); @@ -492,8 +492,8 @@ static int shstk_disable(void) fpregs_lock_and_load(); /* Disable WRSS too when disabling shadow stack */ - wrmsrl(MSR_IA32_U_CET, 0); - wrmsrl(MSR_IA32_PL3_SSP, 0); + wrmsrq(MSR_IA32_U_CET, 0); + wrmsrq(MSR_IA32_PL3_SSP, 0); fpregs_unlock(); shstk_free(current); @@ -577,3 +577,19 @@ long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) return wrss_control(true); return -EINVAL; } + +int shstk_update_last_frame(unsigned long val) +{ + unsigned long ssp; + + if (!features_enabled(ARCH_SHSTK_SHSTK)) + return 0; + + ssp = get_user_shstk_addr(); + return write_user_shstk_64((u64 __user *)ssp, (u64)val); +} + +bool shstk_is_enabled(void) +{ + return features_enabled(ARCH_SHSTK_SHSTK); +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 31b6f5dddfc2..2404233336ab 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -61,6 +61,24 @@ static inline int is_x32_frame(struct ksignal *ksig) } /* + * Enable all pkeys temporarily, so as to ensure that both the current + * execution stack as well as the alternate signal stack are writeable. + * The application can use any of the available pkeys to protect the + * alternate signal stack, and we don't know which one it is, so enable + * all. The PKRU register will be reset to init_pkru later in the flow, + * in fpu__clear_user_states(), and it is the application's responsibility + * to enable the appropriate pkey as the first step in the signal handler + * so that the handler does not segfault. + */ +static inline u32 sig_prepare_pkru(void) +{ + u32 orig_pkru = read_pkru(); + + write_pkru(0); + return orig_pkru; +} + +/* * Set up a signal frame. */ @@ -84,6 +102,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, unsigned long math_size = 0; unsigned long sp = regs->sp; unsigned long buf_fx = 0; + u32 pkru; /* redzone */ if (!ia32_frame) @@ -138,9 +157,17 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; } + /* Update PKRU to enable access to the alternate signal stack. */ + pkru = sig_prepare_pkru(); /* save i387 and extended state */ - if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size)) + if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) { + /* + * Restore PKRU to the original, user-defined value; disable + * extra pkeys enabled for the alternate signal stack, if any. + */ + write_pkru(pkru); return (void __user *)-1L; + } return (void __user *)sp; } @@ -228,7 +255,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { bool stepping, failed; - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); if (v8086_mode(regs)) save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); @@ -396,14 +423,14 @@ bool sigaltstack_size_valid(size_t ss_size) if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) return true; - fsize += current->group_leader->thread.fpu.perm.__user_state_size; + fsize += x86_task_fpu(current->group_leader)->perm.__user_state_size; if (likely(ss_size > fsize)) return true; if (strict_sigaltstack_size) return ss_size > fsize; - mask = current->group_leader->thread.fpu.perm.__state_perm; + mask = x86_task_fpu(current->group_leader)->perm.__state_perm; if (mask & XFEATURE_MASK_USER_DYNAMIC) return ss_size > fsize; diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index c12624bc82a3..42bbc42bd350 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -33,25 +33,55 @@ #include <asm/smap.h> #include <asm/gsseg.h> +/* + * The first GDT descriptor is reserved as 'NULL descriptor'. As bits 0 + * and 1 of a segment selector, i.e., the RPL bits, are NOT used to index + * GDT, selector values 0~3 all point to the NULL descriptor, thus values + * 0, 1, 2 and 3 are all valid NULL selector values. + * + * However IRET zeros ES, FS, GS, and DS segment registers if any of them + * is found to have any nonzero NULL selector value, which can be used by + * userspace in pre-FRED systems to spot any interrupt/exception by loading + * a nonzero NULL selector and waiting for it to become zero. Before FRED + * there was nothing software could do to prevent such an information leak. + * + * ERETU, the only legit instruction to return to userspace from kernel + * under FRED, by design does NOT zero any segment register to avoid this + * problem behavior. + * + * As such, leave NULL selector values 0~3 unchanged. + */ +static inline u16 fixup_rpl(u16 sel) +{ + return sel <= 3 ? sel : sel | 3; +} + #ifdef CONFIG_IA32_EMULATION -#include <asm/ia32_unistd.h> +#include <asm/unistd_32_ia32.h> static inline void reload_segments(struct sigcontext_32 *sc) { - unsigned int cur; + u16 cur; + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ savesegment(gs, cur); - if ((sc->gs | 0x03) != cur) - load_gs_index(sc->gs | 0x03); + if (fixup_rpl(sc->gs) != cur) + load_gs_index(fixup_rpl(sc->gs)); savesegment(fs, cur); - if ((sc->fs | 0x03) != cur) - loadsegment(fs, sc->fs | 0x03); + if (fixup_rpl(sc->fs) != cur) + loadsegment(fs, fixup_rpl(sc->fs)); + savesegment(ds, cur); - if ((sc->ds | 0x03) != cur) - loadsegment(ds, sc->ds | 0x03); + if (fixup_rpl(sc->ds) != cur) + loadsegment(ds, fixup_rpl(sc->ds)); savesegment(es, cur); - if ((sc->es | 0x03) != cur) - loadsegment(es, sc->es | 0x03); + if (fixup_rpl(sc->es) != cur) + loadsegment(es, fixup_rpl(sc->es)); } #define sigset32_t compat_sigset_t @@ -105,18 +135,12 @@ static bool ia32_restore_sigcontext(struct pt_regs *regs, regs->orig_ax = -1; #ifdef CONFIG_IA32_EMULATION - /* - * Reload fs and gs if they have changed in the signal - * handler. This does not handle long fs/gs base changes in - * the handler, but does not clobber them at least in the - * normal case. - */ reload_segments(&sc); #else - loadsegment(gs, sc.gs); - regs->fs = sc.fs; - regs->es = sc.es; - regs->ds = sc.ds; + loadsegment(gs, fixup_rpl(sc.gs)); + regs->fs = fixup_rpl(sc.fs); + regs->es = fixup_rpl(sc.es); + regs->ds = fixup_rpl(sc.ds); #endif return fpu__restore_sig(compat_ptr(sc.fpstate), 1); @@ -128,6 +152,8 @@ SYSCALL32_DEFINE0(sigreturn) struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; + prevent_single_step_upon_eretu(regs); + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) @@ -151,6 +177,8 @@ SYSCALL32_DEFINE0(rt_sigreturn) struct rt_sigframe_ia32 __user *frame; sigset_t set; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); if (!access_ok(frame, sizeof(*frame))) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 23d8aaf8d9fd..d483b585c6c6 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -250,6 +250,8 @@ SYSCALL_DEFINE0(rt_sigreturn) sigset_t set; unsigned long uc_flags; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; @@ -260,13 +262,13 @@ SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); - if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) + if (restore_altstack(&frame->uc.uc_stack)) goto badframe; - if (restore_signal_shadow_stack()) + if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; - if (restore_altstack(&frame->uc.uc_stack)) + if (restore_signal_shadow_stack()) goto badframe; return regs->ax; @@ -315,6 +317,9 @@ int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) uc_flags = frame_uc_flags(regs); + if (setup_signal_shadow_stack(ksig)) + return -EFAULT; + if (!user_access_begin(frame, sizeof(*frame))) return -EFAULT; @@ -363,6 +368,8 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) sigset_t set; unsigned long uc_flags; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); if (!access_ok(frame, sizeof(*frame))) @@ -377,6 +384,9 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags)) goto badframe; + if (restore_signal_shadow_stack()) + goto badframe; + if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 18266cc3d98c..b014e6d229f9 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -299,3 +299,27 @@ struct smp_ops smp_ops = { .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); + +int arch_cpu_rescan_dead_smt_siblings(void) +{ + enum cpuhp_smt_control old = cpu_smt_control; + int ret; + + /* + * If SMT has been disabled and SMT siblings are in HLT, bring them back + * online and offline them again so that they end up in MWAIT proper. + * + * Called with hotplug enabled. + */ + if (old != CPU_SMT_DISABLED && old != CPU_SMT_FORCE_DISABLED) + return 0; + + ret = cpuhp_smt_enable(); + if (ret) + return ret; + + ret = cpuhp_smt_disable(old); + + return ret; +} +EXPORT_SYMBOL_GPL(arch_cpu_rescan_dead_smt_siblings); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 76bb65045c64..58ede3fa6a75 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -60,9 +60,11 @@ #include <linux/stackprotector.h> #include <linux/cpuhotplug.h> #include <linux/mc146818rtc.h> +#include <linux/acpi.h> #include <asm/acpi.h> #include <asm/cacheinfo.h> +#include <asm/cpuid/api.h> #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> @@ -188,7 +190,7 @@ static void ap_starting(void) apic_ap_setup(); /* Save the processor parameters. */ - smp_store_cpu_info(cpuid); + identify_secondary_cpu(cpuid); /* * The topology information must be up to date before @@ -213,7 +215,7 @@ static void ap_calibrate_delay(void) { /* * Calibrate the delay loop and update loops_per_jiffy in cpu_data. - * smp_store_cpu_info() stored a value that is close but not as + * identify_secondary_cpu() stored a value that is close but not as * accurate as the value just calculated. * * As this is invoked after the TSC synchronization check, @@ -227,7 +229,7 @@ static void ap_calibrate_delay(void) /* * Activate a secondary processor. */ -static void notrace start_secondary(void *unused) +static void notrace __noendbr start_secondary(void *unused) { /* * Don't put *anything* except direct CPU state initialization @@ -246,7 +248,7 @@ static void notrace start_secondary(void *unused) __flush_tlb_all(); } - cpu_init_exception_handling(); + cpu_init_exception_handling(false); /* * Load the microcode before reaching the AP alive synchronization @@ -312,26 +314,7 @@ static void notrace start_secondary(void *unused) wmb(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } - -/* - * The bootstrap kernel entry code has set these up. Save them for - * a given CPU - */ -void smp_store_cpu_info(int id) -{ - struct cpuinfo_x86 *c = &cpu_data(id); - - /* Copy boot_cpu_data only on the first bringup */ - if (!c->initialized) - *c = boot_cpu_data; - c->cpu_index = id; - /* - * During boot time, CPU0 has this setup already. Save the info when - * bringing up an AP. - */ - identify_secondary_cpu(c); - c->initialized = true; -} +ANNOTATE_NOENDBR_SYM(start_secondary); static bool topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) @@ -438,9 +421,9 @@ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) */ static const struct x86_cpu_id intel_cod_cpu[] = { - X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */ - X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */ - X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */ + X86_MATCH_VFM(INTEL_HASWELL_X, 0), /* COD */ + X86_MATCH_VFM(INTEL_BROADWELL_X, 0), /* COD */ + X86_MATCH_VFM(INTEL_ANY, 1), /* SNC */ {} }; @@ -481,12 +464,6 @@ static int x86_core_flags(void) return cpu_core_flags() | x86_sched_itmt_flags(); } #endif -#ifdef CONFIG_SCHED_SMT -static int x86_smt_flags(void) -{ - return cpu_smt_flags(); -} -#endif #ifdef CONFIG_SCHED_CLUSTER static int x86_cluster_flags(void) { @@ -494,14 +471,6 @@ static int x86_cluster_flags(void) } #endif -static int x86_die_flags(void) -{ - if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return x86_sched_itmt_flags(); - - return 0; -} - /* * Set if a package/die has multiple NUMA nodes inside. * AMD Magny-Cours, Intel Cluster-on-Die, and Intel @@ -517,7 +486,7 @@ static void __init build_sched_topology(void) #ifdef CONFIG_SCHED_SMT x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) + cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }; #endif #ifdef CONFIG_SCHED_CLUSTER @@ -537,7 +506,7 @@ static void __init build_sched_topology(void) */ if (!x86_has_numa_in_package) { x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG) + cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG) }; } @@ -666,10 +635,9 @@ static void impress_friends(void) * But that slows boot and resume on modern processors, which include * many cores and don't require that delay. * - * Cmdline "init_cpu_udelay=" is available to over-ride this delay. - * Modern processor families are quirked to remove the delay entirely. + * Cmdline "cpu_init_udelay=" is available to override this delay. */ -#define UDELAY_10MS_DEFAULT 10000 +#define UDELAY_10MS_LEGACY 10000 static unsigned int init_udelay = UINT_MAX; @@ -681,21 +649,21 @@ static int __init cpu_init_udelay(char *str) } early_param("cpu_init_udelay", cpu_init_udelay); -static void __init smp_quirk_init_udelay(void) +static void __init smp_set_init_udelay(void) { /* if cmdline changed it from default, leave it alone */ if (init_udelay != UINT_MAX) return; /* if modern processor, use no delay */ - if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || - ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) || - ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86_vfm >= INTEL_PENTIUM_PRO) || + (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON && boot_cpu_data.x86 >= 0x18) || + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xF)) { init_udelay = 0; return; } /* else, use legacy delay */ - init_udelay = UDELAY_10MS_DEFAULT; + init_udelay = UDELAY_10MS_LEGACY; } /* @@ -727,7 +695,7 @@ static void send_init_sequence(u32 phys_apicid) /* * Wake up AP by INIT, INIT, STARTUP sequence. */ -static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip) +static int wakeup_secondary_cpu_via_init(u32 phys_apicid, unsigned long start_eip, unsigned int cpu) { unsigned long send_status = 0, accept_status = 0; int num_starts, j, maxlvt; @@ -853,7 +821,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); - per_cpu(pcpu_hot.current_task, cpu) = idle; + per_cpu(current_task, cpu) = idle; cpu_init_stack_canary(cpu, idle); /* Initialize the interrupt stack(s) */ @@ -863,7 +831,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle); + per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #endif return 0; } @@ -874,7 +842,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) * Returns zero if startup was successfully sent, else error code from * ->wakeup_secondary_cpu. */ -static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle) +static int do_boot_cpu(u32 apicid, unsigned int cpu, struct task_struct *idle) { unsigned long start_ip = real_mode_header->trampoline_start; int ret; @@ -928,11 +896,11 @@ static int do_boot_cpu(u32 apicid, int cpu, struct task_struct *idle) * - Use an INIT boot APIC message */ if (apic->wakeup_secondary_cpu_64) - ret = apic->wakeup_secondary_cpu_64(apicid, start_ip); + ret = apic->wakeup_secondary_cpu_64(apicid, start_ip, cpu); else if (apic->wakeup_secondary_cpu) - ret = apic->wakeup_secondary_cpu(apicid, start_ip); + ret = apic->wakeup_secondary_cpu(apicid, start_ip, cpu); else - ret = wakeup_secondary_cpu_via_init(apicid, start_ip); + ret = wakeup_secondary_cpu_via_init(apicid, start_ip, cpu); /* If the wakeup mechanism failed, cleanup the warm reset vector */ if (ret) @@ -1033,20 +1001,22 @@ static __init void disable_smp(void) void __init smp_prepare_cpus_common(void) { - unsigned int i; + unsigned int cpu, node; /* Mark all except the boot CPU as hotpluggable */ - for_each_possible_cpu(i) { - if (i) - per_cpu(cpu_info.cpu_index, i) = nr_cpu_ids; + for_each_possible_cpu(cpu) { + if (cpu) + per_cpu(cpu_info.cpu_index, cpu) = nr_cpu_ids; } - for_each_possible_cpu(i) { - zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL); + for_each_possible_cpu(cpu) { + node = cpu_to_node(cpu); + + zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, node); + zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, node); + zalloc_cpumask_var_node(&per_cpu(cpu_die_map, cpu), GFP_KERNEL, node); + zalloc_cpumask_var_node(&per_cpu(cpu_llc_shared_map, cpu), GFP_KERNEL, node); + zalloc_cpumask_var_node(&per_cpu(cpu_l2c_shared_map, cpu), GFP_KERNEL, node); } set_cpu_sibling_map(0); @@ -1104,7 +1074,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) uv_system_init(); - smp_quirk_init_udelay(); + smp_set_init_udelay(); speculative_store_bypass_ht_init(); @@ -1218,6 +1188,12 @@ void cpu_disable_common(void) remove_siblinginfo(cpu); + /* + * Stop allowing kernel-mode FPU. This is needed so that if the CPU is + * brought online again, the initial state is not allowed: + */ + this_cpu_write(kernel_fpu_allowed, false); + /* It's now safe to remove this processor from the online map */ lock_vector_lock(); remove_cpu_from_maps(cpu); @@ -1272,45 +1248,9 @@ void play_dead_common(void) * We need to flush the caches before going to sleep, lest we have * dirty data in our caches when we come back up. */ -static inline void mwait_play_dead(void) +void __noreturn mwait_play_dead(unsigned int eax_hint) { struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); - unsigned int eax, ebx, ecx, edx; - unsigned int highest_cstate = 0; - unsigned int highest_subcstate = 0; - int i; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - return; - if (!this_cpu_has(X86_FEATURE_MWAIT)) - return; - if (!this_cpu_has(X86_FEATURE_CLFLUSH)) - return; - if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) - return; - - eax = CPUID_MWAIT_LEAF; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - /* - * eax will be 0 if EDX enumeration is not valid. - * Initialized below to cstate, sub_cstate value when EDX is valid. - */ - if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { - eax = 0; - } else { - edx >>= MWAIT_SUBSTATE_SIZE; - for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { - if (edx & MWAIT_SUBSTATE_MASK) { - highest_cstate = i; - highest_subcstate = edx & MWAIT_SUBSTATE_MASK; - } - } - eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | - (highest_subcstate - 1); - } /* Set up state for the kexec() hack below */ md->status = CPUDEAD_MWAIT_WAIT; @@ -1331,7 +1271,7 @@ static inline void mwait_play_dead(void) mb(); __monitor(md, 0, 0); mb(); - __mwait(eax, 0); + __mwait(eax_hint, 0); if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) { /* @@ -1403,9 +1343,9 @@ void native_play_dead(void) play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - mwait_play_dead(); - if (cpuidle_play_dead()) - hlt_play_dead(); + /* Below returns only on error. */ + cpuidle_play_dead(); + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 4eefaac64c6c..378c388d1b31 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -81,7 +81,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, break; case RET: - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk_at(insn)) code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk); else code = &retinsn; @@ -90,7 +90,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, case JCC: if (!func) { func = __static_call_return; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk()) func = x86_return_thunk; } @@ -108,7 +108,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, if (system_state == SYSTEM_BOOTING || modinit) return text_poke_early(insn, code, size); - text_poke_bp(insn, code, size, emulate); + smp_text_poke_single(insn, code, size, emulate); } static void __static_call_validate(u8 *insn, bool tail, bool tramp) @@ -158,7 +158,7 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) { mutex_lock(&text_mutex); - if (tramp) { + if (tramp && !site) { __static_call_validate(tramp, true, true); __static_call_transform(tramp, __sc_insn(!func, true), func, false); } @@ -172,6 +172,14 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) } EXPORT_SYMBOL_GPL(arch_static_call_transform); +noinstr void __static_call_update_early(void *tramp, void *func) +{ + BUG_ON(system_state != SYSTEM_BOOTING); + BUG_ON(static_call_initialized); + __text_gen_insn(tramp, JMP32_INSN_OPCODE, tramp, func, JMP32_INSN_SIZE); + sync_core(); +} + #ifdef CONFIG_MITIGATION_RETHUNK /* * This is called by apply_returns() to fix up static call trampolines, diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index cb9fa1d5c66f..776ae6fa7f2d 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -18,6 +18,7 @@ #include <linux/random.h> #include <linux/uaccess.h> #include <linux/elf.h> +#include <linux/hugetlb.h> #include <asm/elf.h> #include <asm/ia32.h> @@ -25,8 +26,10 @@ /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. */ -static unsigned long get_align_mask(void) +static unsigned long get_align_mask(struct file *filp) { + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); /* handle 32- and 64-bit case with a single conditional */ if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) return 0; @@ -49,7 +52,7 @@ static unsigned long get_align_mask(void) */ static unsigned long get_align_bits(void) { - return va_align.bits & get_align_mask(); + return va_align.bits & get_align_mask(NULL); } static int __init control_va_addr_alignment(char *str) @@ -112,13 +115,21 @@ static void find_start_end(unsigned long addr, unsigned long flags, *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); } +static inline unsigned long stack_guard_placement(vm_flags_t vm_flags) +{ + if (vm_flags & VM_SHADOW_STACK) + return PAGE_SIZE; + + return 0; +} + unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; unsigned long begin, end; if (flags & MAP_FIXED) @@ -137,28 +148,30 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, return addr; } - info.flags = 0; info.length = len; info.low_limit = begin; info.high_limit = end; - info.align_mask = 0; - info.align_offset = pgoff << PAGE_SHIFT; + if (!(filp && is_file_hugepages(filp))) { + info.align_offset = pgoff << PAGE_SHIFT; + info.start_gap = stack_guard_placement(vm_flags); + } if (filp) { - info.align_mask = get_align_mask(); + info.align_mask = get_align_mask(filp); info.align_offset += get_align_bits(); } + return vm_unmapped_area(&info); } unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; unsigned long addr = addr0; - struct vm_unmapped_area_info info; + struct vm_unmapped_area_info info = {}; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -192,6 +205,10 @@ get_unmapped_area: info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); + if (!(filp && is_file_hugepages(filp))) { + info.start_gap = stack_guard_placement(vm_flags); + info.align_offset = pgoff << PAGE_SHIFT; + } /* * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area @@ -203,10 +220,8 @@ get_unmapped_area: if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; - info.align_mask = 0; - info.align_offset = pgoff << PAGE_SHIFT; if (filp) { - info.align_mask = get_align_mask(); + info.align_mask = get_align_mask(filp); info.align_offset += get_align_bits(); } addr = vm_unmapped_area(&info); @@ -221,5 +236,5 @@ bottomup: * can happen with large stack limits and large mmap() * allocations. */ - return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); + return arch_get_unmapped_area(filp, addr0, len, pgoff, flags, 0); } diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 4c1bcb6053fc..46b8f1f16676 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -200,8 +200,7 @@ static int tboot_setup_sleep(void) tboot->num_mac_regions = 0; for (i = 0; i < e820_table->nr_entries; i++) { - if ((e820_table->entries[i].type != E820_TYPE_RAM) - && (e820_table->entries[i].type != E820_TYPE_RESERVED_KERN)) + if (e820_table->entries[i].type != E820_TYPE_RAM) continue; add_mac_region(e820_table->entries[i].addr, e820_table->entries[i].size); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index e42faa792c07..52e1f3f0b361 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -27,25 +27,7 @@ unsigned long profile_pc(struct pt_regs *regs) { - unsigned long pc = instruction_pointer(regs); - - if (!user_mode(regs) && in_lock_functions(pc)) { -#ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + sizeof(long)); -#else - unsigned long *sp = (unsigned long *)regs->sp; - /* - * Return address is either directly at stack pointer - * or above a saved flags. Eflags has bits 22-31 zero, - * kernel addresses don't. - */ - if (sp[0] >> 22) - return sp[0]; - if (sp[1] >> 22) - return sp[1]; -#endif - } - return pc; + return instruction_pointer(regs); } EXPORT_SYMBOL(profile_pc); diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c deleted file mode 100644 index d42c28b8bfd8..000000000000 --- a/arch/x86/kernel/topology.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Populate sysfs with topology information - * - * Written by: Matthew Dobson, IBM Corporation - * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to <colpatch@us.ibm.com> - */ -#include <linux/interrupt.h> -#include <linux/nodemask.h> -#include <linux/export.h> -#include <linux/mmzone.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/irq.h> -#include <asm/io_apic.h> -#include <asm/cpu.h> - -#ifdef CONFIG_HOTPLUG_CPU -bool arch_cpu_is_hotpluggable(int cpu) -{ - return cpu > 0; -} -#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c index b8e7abe00b06..708d61743d15 100644 --- a/arch/x86/kernel/trace_clock.c +++ b/arch/x86/kernel/trace_clock.c @@ -4,7 +4,7 @@ */ #include <asm/trace_clock.h> #include <asm/barrier.h> -#include <asm/msr.h> +#include <asm/tsc.h> /* * trace_clock_x86_tsc(): A clock that is just the cycle counter. diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c deleted file mode 100644 index 03ae1caaa878..000000000000 --- a/arch/x86/kernel/tracepoint.c +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com> - */ -#include <linux/jump_label.h> -#include <linux/atomic.h> - -#include <asm/trace/exceptions.h> - -DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); - -int trace_pagefault_reg(void) -{ - static_branch_inc(&trace_pagefault_key); - return 0; -} - -void trace_pagefault_unreg(void) -{ - static_branch_dec(&trace_pagefault_key); -} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4fa0b17e5043..36354b470590 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -42,6 +42,7 @@ #include <linux/hardirq.h> #include <linux/atomic.h> #include <linux/iommu.h> +#include <linux/ubsan.h> #include <asm/stacktrace.h> #include <asm/processor.h> @@ -67,6 +68,7 @@ #include <asm/vdso.h> #include <asm/tdx.h> #include <asm/cfi.h> +#include <asm/msr.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -91,6 +93,96 @@ __always_inline int is_valid_bugaddr(unsigned long addr) return *(unsigned short *)addr == INSN_UD2; } +/* + * Check for UD1 or UD2, accounting for Address Size Override Prefixes. + * If it's a UD1, further decode to determine its use: + * + * FineIBT: ea (bad) + * FineIBT: f0 75 f9 lock jne . - 6 + * UBSan{0}: 67 0f b9 00 ud1 (%eax),%eax + * UBSan{10}: 67 0f b9 40 10 ud1 0x10(%eax),%eax + * static_call: 0f b9 cc ud1 %esp,%ecx + * + * Notably UBSAN uses EAX, static_call uses ECX. + */ +__always_inline int decode_bug(unsigned long addr, s32 *imm, int *len) +{ + unsigned long start = addr; + bool lock = false; + u8 v; + + if (addr < TASK_SIZE_MAX) + return BUG_NONE; + + v = *(u8 *)(addr++); + if (v == INSN_ASOP) + v = *(u8 *)(addr++); + + if (v == INSN_LOCK) { + lock = true; + v = *(u8 *)(addr++); + } + + switch (v) { + case 0x70 ... 0x7f: /* Jcc.d8 */ + addr += 1; /* d8 */ + *len = addr - start; + WARN_ON_ONCE(!lock); + return BUG_LOCK; + + case 0xea: + *len = addr - start; + return BUG_EA; + + case OPCODE_ESCAPE: + break; + + default: + return BUG_NONE; + } + + v = *(u8 *)(addr++); + if (v == SECOND_BYTE_OPCODE_UD2) { + *len = addr - start; + return BUG_UD2; + } + + if (v != SECOND_BYTE_OPCODE_UD1) + return BUG_NONE; + + *imm = 0; + v = *(u8 *)(addr++); /* ModRM */ + + if (X86_MODRM_MOD(v) != 3 && X86_MODRM_RM(v) == 4) + addr++; /* SIB */ + + /* Decode immediate, if present */ + switch (X86_MODRM_MOD(v)) { + case 0: if (X86_MODRM_RM(v) == 5) + addr += 4; /* RIP + disp32 */ + break; + + case 1: *imm = *(s8 *)addr; + addr += 1; + break; + + case 2: *imm = *(s32 *)addr; + addr += 4; + break; + + case 3: break; + } + + /* record instruction length */ + *len = addr - start; + + if (X86_MODRM_REG(v) == 0) /* EAX */ + return BUG_UD1_UBSAN; + + return BUG_UD1; +} + + static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) @@ -215,15 +307,13 @@ static inline void handle_invalid_op(struct pt_regs *regs) static noinstr bool handle_bug(struct pt_regs *regs) { + unsigned long addr = regs->ip; bool handled = false; + int ud_type, ud_len; + s32 ud_imm; - /* - * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() - * is a rare case that uses @regs without passing them to - * irqentry_enter(). - */ - kmsan_unpoison_entry_regs(regs); - if (!is_valid_bugaddr(regs->ip)) + ud_type = decode_bug(addr, &ud_imm, &ud_len); + if (ud_type == BUG_NONE) return handled; /* @@ -231,16 +321,58 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ instrumentation_begin(); /* + * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() + * is a rare case that uses @regs without passing them to + * irqentry_enter(). + */ + kmsan_unpoison_entry_regs(regs); + /* * Since we're emulating a CALL with exceptions, restore the interrupt * state to what it was at the exception site. */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || - handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { - regs->ip += LEN_UD2; - handled = true; + + switch (ud_type) { + case BUG_UD2: + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) { + handled = true; + break; + } + fallthrough; + + case BUG_EA: + case BUG_LOCK: + if (handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { + handled = true; + break; + } + break; + + case BUG_UD1_UBSAN: + if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { + pr_crit("%s at %pS\n", + report_ubsan_failure(ud_imm), + (void *)regs->ip); + } + break; + + default: + break; } + + /* + * When continuing, and regs->ip hasn't changed, move it to the next + * instruction. When not continuing execution, restore the instruction + * pointer. + */ + if (handled) { + if (regs->ip == addr) + regs->ip += ud_len; + } else { + regs->ip = addr; + } + if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); instrumentation_end(); @@ -331,6 +463,21 @@ __visible void __noreturn handle_stack_overflow(struct pt_regs *regs, #endif /* + * Prevent the compiler and/or objtool from marking the !CONFIG_X86_ESPFIX64 + * version of exc_double_fault() as noreturn. Otherwise the noreturn mismatch + * between configs triggers objtool warnings. + * + * This is a temporary hack until we have compiler or plugin support for + * annotating noreturns. + */ +#ifdef CONFIG_X86_ESPFIX64 +#define always_true() true +#else +bool always_true(void); +bool __weak always_true(void) { return true; } +#endif + +/* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * * On x86_64, this is more or less a normal kernel entry. Notwithstanding the @@ -465,7 +612,8 @@ DEFINE_IDTENTRY_DF(exc_double_fault) pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); die("double fault", regs, error_code); - panic("Machine halted."); + if (always_true()) + panic("Machine halted."); instrumentation_end(); } @@ -602,7 +750,7 @@ static bool try_fixup_enqcmd_gp(void) if (current->pasid_activated) return false; - wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); + wrmsrq(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); current->pasid_activated = 1; return true; @@ -735,16 +883,16 @@ static void do_int3_user(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_int3) { /* - * poke_int3_handler() is completely self contained code; it does (and + * smp_text_poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ - if (poke_int3_handler(regs)) + if (smp_text_poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() - * and therefore can trigger INT3, hence poke_int3_handler() must + * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. @@ -874,24 +1022,32 @@ static bool is_sysenter_singlestep(struct pt_regs *regs) #endif } -static __always_inline unsigned long debug_read_clear_dr6(void) +static __always_inline unsigned long debug_read_reset_dr6(void) { unsigned long dr6; + get_debugreg(dr6, 6); + dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ + /* * The Intel SDM says: * - * Certain debug exceptions may clear bits 0-3. The remaining - * contents of the DR6 register are never cleared by the - * processor. To avoid confusion in identifying debug - * exceptions, debug handlers should clear the register before - * returning to the interrupted task. + * Certain debug exceptions may clear bits 0-3 of DR6. + * + * BLD induced #DB clears DR6.BLD and any other debug + * exception doesn't modify DR6.BLD. * - * Keep it simple: clear DR6 immediately. + * RTM induced #DB clears DR6.RTM and any other debug + * exception sets DR6.RTM. + * + * To avoid confusion in identifying debug exceptions, + * debug handlers should set DR6.BLD and DR6.RTM, and + * clear other DR6 bits before returning. + * + * Keep it simple: write DR6 with its architectural reset + * value 0xFFFF0FF0, defined as DR6_RESERVED, immediately. */ - get_debugreg(dr6, 6); set_debugreg(DR6_RESERVED, 6); - dr6 ^= DR6_RESERVED; /* Flip to positive polarity */ return dr6; } @@ -973,9 +1129,9 @@ static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) */ unsigned long debugctl; - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= DEBUGCTLMSR_BTF; - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } /* @@ -1091,13 +1247,13 @@ out: /* IST stack entry */ DEFINE_IDTENTRY_DEBUG(exc_debug) { - exc_debug_kernel(regs, debug_read_clear_dr6()); + exc_debug_kernel(regs, debug_read_reset_dr6()); } /* User entry, runs on regular task stack */ DEFINE_IDTENTRY_DEBUG_USER(exc_debug) { - exc_debug_user(regs, debug_read_clear_dr6()); + exc_debug_user(regs, debug_read_reset_dr6()); } #ifdef CONFIG_X86_FRED @@ -1116,7 +1272,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug) { /* * FRED #DB stores DR6 on the stack in the format which - * debug_read_clear_dr6() returns for the IDT entry points. + * debug_read_reset_dr6() returns for the IDT entry points. */ unsigned long dr6 = fred_event_data(regs); @@ -1131,7 +1287,7 @@ DEFINE_FREDENTRY_DEBUG(exc_debug) /* 32 bit does not have separate entry points. */ DEFINE_IDTENTRY_RAW(exc_debug) { - unsigned long dr6 = debug_read_clear_dr6(); + unsigned long dr6 = debug_read_reset_dr6(); if (user_mode(regs)) exc_debug_user(regs, dr6); @@ -1148,7 +1304,7 @@ DEFINE_IDTENTRY_RAW(exc_debug) static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; - struct fpu *fpu = &task->thread.fpu; + struct fpu *fpu = x86_task_fpu(task); int si_code; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; @@ -1239,11 +1395,11 @@ static bool handle_xfd_event(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) return false; - rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + rdmsrq(MSR_IA32_XFD_ERR, xfd_err); if (!xfd_err) return false; - wrmsrl(MSR_IA32_XFD_ERR, 0); + wrmsrq(MSR_IA32_XFD_ERR, 0); /* Die if that happens in kernel space */ if (WARN_ON(!user_mode(regs))) @@ -1402,34 +1558,8 @@ DEFINE_IDTENTRY_SW(iret_error) } #endif -/* Do not enable FRED by default yet. */ -static bool enable_fred __ro_after_init = false; - -#ifdef CONFIG_X86_FRED -static int __init fred_setup(char *str) -{ - if (!str) - return -EINVAL; - - if (!cpu_feature_enabled(X86_FEATURE_FRED)) - return 0; - - if (!strcmp(str, "on")) - enable_fred = true; - else if (!strcmp(str, "off")) - enable_fred = false; - else - pr_warn("invalid FRED option: 'fred=%s'\n", str); - return 0; -} -early_param("fred", fred_setup); -#endif - void __init trap_init(void) { - if (cpu_feature_enabled(X86_FEATURE_FRED) && !enable_fred) - setup_clear_cpu_cap(X86_FEATURE_FRED); - /* Init cpu_entry_area before IST entries are set up */ setup_cpu_entry_areas(); @@ -1437,7 +1567,7 @@ void __init trap_init(void) sev_es_init_vc_handling(); /* Initialize TSS before setting up traps so ISTs work */ - cpu_init_exception_handling(); + cpu_init_exception_handling(true); /* Setup traps as cpu_init() might #GP */ if (!cpu_feature_enabled(X86_FEATURE_FRED)) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 5a69a49acc96..87e749106dda 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -16,6 +16,7 @@ #include <linux/static_key.h> #include <linux/static_call.h> +#include <asm/cpuid/api.h> #include <asm/hpet.h> #include <asm/timer.h> #include <asm/vgtod.h> @@ -26,9 +27,12 @@ #include <asm/x86_init.h> #include <asm/geode.h> #include <asm/apic.h> -#include <asm/intel-family.h> +#include <asm/cpu_device_id.h> #include <asm/i8259.h> +#include <asm/msr.h> +#include <asm/topology.h> #include <asm/uv/uv.h> +#include <asm/sev.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -44,15 +48,15 @@ EXPORT_SYMBOL(tsc_khz); static int __read_mostly tsc_unstable; static unsigned int __initdata tsc_early_khz; -static DEFINE_STATIC_KEY_FALSE(__use_tsc); +static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc); int tsc_clocksource_reliable; static int __read_mostly tsc_force_recalibrate; -static u32 art_to_tsc_numerator; -static u32 art_to_tsc_denominator; -static u64 art_to_tsc_offset; +static struct clocksource_base art_base_clk = { + .id = CSID_X86_ART, +}; static bool have_art; struct cyc2ns { @@ -173,10 +177,11 @@ static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long ts c2n = per_cpu_ptr(&cyc2ns, cpu); - raw_write_seqcount_latch(&c2n->seq); + write_seqcount_latch_begin(&c2n->seq); c2n->data[0] = data; - raw_write_seqcount_latch(&c2n->seq); + write_seqcount_latch(&c2n->seq); c2n->data[1] = data; + write_seqcount_latch_end(&c2n->seq); } static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) @@ -663,13 +668,13 @@ unsigned long native_calibrate_tsc(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (boot_cpu_data.cpuid_level < 0x15) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return 0; eax_denominator = ebx_numerator = ecx_hz = edx = 0; /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ - cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); if (ebx_numerator == 0 || eax_denominator == 0) return 0; @@ -678,11 +683,11 @@ unsigned long native_calibrate_tsc(void) /* * Denverton SoCs don't report crystal clock, and also don't support - * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal - * clock. + * CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz + * crystal clock. */ if (crystal_khz == 0 && - boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D) + boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D) crystal_khz = 25000; /* @@ -698,10 +703,10 @@ unsigned long native_calibrate_tsc(void) * clock, but we can easily calculate it to a high degree of accuracy * by considering the crystal ratio and the CPU speed. */ - if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) { + if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) { unsigned int eax_base_mhz, ebx, ecx, edx; - cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx); + cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx); crystal_khz = eax_base_mhz * 1000 * eax_denominator / ebx_numerator; } @@ -713,7 +718,7 @@ unsigned long native_calibrate_tsc(void) * For Atom SoCs TSC is the only reliable clocksource. * Mark TSC reliable so no watchdog on it. */ - if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) + if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); #ifdef CONFIG_X86_LOCAL_APIC @@ -736,12 +741,12 @@ static unsigned long cpu_khz_from_cpuid(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; - if (boot_cpu_data.cpuid_level < 0x16) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ) return 0; eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; - cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); + cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); return eax_base_mhz * 1000; } @@ -955,7 +960,7 @@ static unsigned long long cyc2ns_suspend; void tsc_save_sched_clock_state(void) { - if (!sched_clock_stable()) + if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; cyc2ns_suspend = sched_clock(); @@ -975,7 +980,7 @@ void tsc_restore_sched_clock_state(void) unsigned long flags; int cpu; - if (!sched_clock_stable()) + if (!static_branch_likely(&__use_tsc) && !sched_clock_stable()) return; local_irq_save(flags); @@ -1065,18 +1070,16 @@ core_initcall(cpufreq_register_tsc_scaling); #endif /* CONFIG_CPU_FREQ */ -#define ART_CPUID_LEAF (0x15) #define ART_MIN_DENOMINATOR (1) - /* * If ART is present detect the numerator:denominator to convert to TSC */ static void __init detect_art(void) { - unsigned int unused[2]; + unsigned int unused; - if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) + if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC) return; /* @@ -1089,13 +1092,14 @@ static void __init detect_art(void) tsc_async_resets) return; - cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, - &art_to_tsc_numerator, unused, unused+1); + cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator, + &art_base_clk.numerator, &art_base_clk.freq_khz, &unused); - if (art_to_tsc_denominator < ART_MIN_DENOMINATOR) + art_base_clk.freq_khz /= KHZ; + if (art_base_clk.denominator < ART_MIN_DENOMINATOR) return; - rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset); + rdmsrq(MSR_IA32_TSC_ADJUST, art_base_clk.offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); @@ -1252,15 +1256,12 @@ static void __init check_system_tsc_reliable(void) * - TSC which does not stop in C-States * - the TSC_ADJUST register which allows to detect even minimal * modifications - * - not more than two sockets. As the number of sockets cannot be - * evaluated at the early boot stage where this has to be - * invoked, check the number of online memory nodes as a - * fallback solution which is an reasonable estimate. + * - not more than four packages */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) && boot_cpu_has(X86_FEATURE_TSC_ADJUST) && - nr_online_nodes <= 4) + topology_max_packages() <= 4) tsc_disable_clocksource_watchdog(); } @@ -1289,74 +1290,13 @@ int unsynchronized_tsc(void) */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ - if (num_possible_cpus() > 1) + if (topology_max_packages() > 1) return 1; } return 0; } -/* - * Convert ART to TSC given numerator/denominator found in detect_art() - */ -struct system_counterval_t convert_art_to_tsc(u64 art) -{ - u64 tmp, res, rem; - - rem = do_div(art, art_to_tsc_denominator); - - res = art * art_to_tsc_numerator; - tmp = rem * art_to_tsc_numerator; - - do_div(tmp, art_to_tsc_denominator); - res += tmp + art_to_tsc_offset; - - return (struct system_counterval_t) { - .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC, - .cycles = res, - }; -} -EXPORT_SYMBOL(convert_art_to_tsc); - -/** - * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC. - * @art_ns: ART (Always Running Timer) in unit of nanoseconds - * - * PTM requires all timestamps to be in units of nanoseconds. When user - * software requests a cross-timestamp, this function converts system timestamp - * to TSC. - * - * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set - * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check - * that this flag is set before conversion to TSC is attempted. - * - * Return: - * struct system_counterval_t - system counter value with the ID of the - * corresponding clocksource: - * cycles: System counter value - * cs_id: The clocksource ID for validating comparability - */ - -struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns) -{ - u64 tmp, res, rem; - - rem = do_div(art_ns, USEC_PER_SEC); - - res = art_ns * tsc_khz; - tmp = rem * tsc_khz; - - do_div(tmp, USEC_PER_SEC); - res += tmp; - - return (struct system_counterval_t) { - .cs_id = have_art ? CSID_X86_TSC : CSID_GENERIC, - .cycles = res, - }; -} -EXPORT_SYMBOL(convert_art_ns_to_tsc); - - static void tsc_refine_calibration_work(struct work_struct *work); static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); /** @@ -1458,8 +1398,10 @@ out: if (tsc_unstable) goto unreg; - if (boot_cpu_has(X86_FEATURE_ART)) + if (boot_cpu_has(X86_FEATURE_ART)) { have_art = true; + clocksource_tsc.base = &art_base_clk; + } clocksource_register_khz(&clocksource_tsc, tsc_khz); unreg: clocksource_unregister(&clocksource_tsc_early); @@ -1484,8 +1426,10 @@ static int __init init_tsc_clocksource(void) * the refined calibration and directly register it as a clocksource. */ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { - if (boot_cpu_has(X86_FEATURE_ART)) + if (boot_cpu_has(X86_FEATURE_ART)) { have_art = true; + clocksource_tsc.base = &art_base_clk; + } clocksource_register_khz(&clocksource_tsc, tsc_khz); clocksource_unregister(&clocksource_tsc_early); @@ -1509,10 +1453,12 @@ static bool __init determine_cpu_tsc_frequencies(bool early) if (early) { cpu_khz = x86_platform.calibrate_cpu(); - if (tsc_early_khz) + if (tsc_early_khz) { tsc_khz = tsc_early_khz; - else + } else { tsc_khz = x86_platform.calibrate_tsc(); + clocksource_tsc.freq_khz = tsc_khz; + } } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); @@ -1570,6 +1516,9 @@ void __init tsc_early_init(void) /* Don't change UV TSC multi-chassis synchronization */ if (is_early_uv_system()) return; + + snp_secure_tsc_init(); + if (!determine_cpu_tsc_frequencies(true)) return; tsc_enable_sched_clock(); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 6555a857a1e6..48e6cc1cb017 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -147,13 +147,13 @@ static const struct freq_desc freq_desc_lgm = { }; static const struct x86_cpu_id tsc_msr_cpu_ids[] = { - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_MID, &freq_desc_pnw), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_TABLET,&freq_desc_clv), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &freq_desc_byt), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &freq_desc_tng), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &freq_desc_cht), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &freq_desc_ann), - X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_NP, &freq_desc_lgm), + X86_MATCH_VFM(INTEL_ATOM_SALTWELL_MID, &freq_desc_pnw), + X86_MATCH_VFM(INTEL_ATOM_SALTWELL_TABLET, &freq_desc_clv), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &freq_desc_ann), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm), {} }; diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 1123ef3ccf90..ec3aa340d351 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -21,6 +21,7 @@ #include <linux/kernel.h> #include <linux/smp.h> #include <linux/nmi.h> +#include <asm/msr.h> #include <asm/tsc.h> struct tsc_adjust { @@ -65,12 +66,12 @@ void tsc_verify_tsc_adjust(bool resume) adj->nextcheck = jiffies + HZ; - rdmsrl(MSR_IA32_TSC_ADJUST, curval); + rdmsrq(MSR_IA32_TSC_ADJUST, curval); if (adj->adjusted == curval) return; /* Restore the original value */ - wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted); + wrmsrq(MSR_IA32_TSC_ADJUST, adj->adjusted); if (!adj->warned || resume) { pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n", @@ -142,7 +143,7 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, if (likely(!tsc_async_resets)) { pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, bootval); - wrmsrl(MSR_IA32_TSC_ADJUST, 0); + wrmsrq(MSR_IA32_TSC_ADJUST, 0); bootval = 0; } else { pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n", @@ -165,7 +166,7 @@ bool __init tsc_store_and_check_tsc_adjust(bool bootcpu) if (check_tsc_unstable()) return false; - rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + rdmsrq(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; cur->nextcheck = jiffies + HZ; tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu); @@ -187,17 +188,15 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return false; - rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + rdmsrq(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; cur->nextcheck = jiffies + HZ; cur->warned = false; /* - * If a non-zero TSC value for socket 0 may be valid then the default - * adjusted value cannot assumed to be zero either. + * The default adjust value cannot be assumed to be zero on any socket. */ - if (tsc_async_resets) - cur->adjusted = bootval; + cur->adjusted = bootval; /* * Check whether this CPU is the first in a package to come up. In @@ -231,7 +230,7 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) */ if (bootval != ref->adjusted) { cur->adjusted = ref->adjusted; - wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); + wrmsrq(MSR_IA32_TSC_ADJUST, ref->adjusted); } /* * We have the TSCs forced to be in sync on this package. Skip sync @@ -520,7 +519,7 @@ retry: pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n", cpu, cur_max_warp, cur->adjusted); - wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted); + wrmsrq(MSR_IA32_TSC_ADJUST, cur->adjusted); goto retry; } diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index d00c28aaa5be..977ee75e047c 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -476,7 +476,7 @@ bool unwind_next_frame(struct unwind_state *state) return false; /* Don't let modules unload while we're reading their ORC data. */ - preempt_disable(); + guard(rcu)(); /* End-of-stack check for user tasks: */ if (state->regs && user_mode(state->regs)) @@ -669,14 +669,12 @@ bool unwind_next_frame(struct unwind_state *state) goto err; } - preempt_enable(); return true; err: state->error = true; the_end: - preempt_enable(); state->stack_info.type = STACK_TYPE_UNKNOWN; return false; } @@ -723,7 +721,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, state->sp = task->thread.sp + sizeof(*frame); state->bp = READ_ONCE_NOCHECK(frame->bp); state->ip = READ_ONCE_NOCHECK(frame->ret_addr); - state->signal = (void *)state->ip == ret_from_fork; + state->signal = (void *)state->ip == ret_from_fork_asm; } if (get_stack_info((unsigned long *)state->sp, state->task, diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c07f6daaa22..6d383839e839 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -12,6 +12,7 @@ #include <linux/ptrace.h> #include <linux/uprobes.h> #include <linux/uaccess.h> +#include <linux/syscalls.h> #include <linux/kdebug.h> #include <asm/processor.h> @@ -308,6 +309,126 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool } #ifdef CONFIG_X86_64 + +asm ( + ".pushsection .rodata\n" + ".global uretprobe_trampoline_entry\n" + "uretprobe_trampoline_entry:\n" + "pushq %rax\n" + "pushq %rcx\n" + "pushq %r11\n" + "movq $" __stringify(__NR_uretprobe) ", %rax\n" + "syscall\n" + ".global uretprobe_syscall_check\n" + "uretprobe_syscall_check:\n" + "popq %r11\n" + "popq %rcx\n" + + /* The uretprobe syscall replaces stored %rax value with final + * return address, so we don't restore %rax in here and just + * call ret. + */ + "retq\n" + ".global uretprobe_trampoline_end\n" + "uretprobe_trampoline_end:\n" + ".popsection\n" +); + +extern u8 uretprobe_trampoline_entry[]; +extern u8 uretprobe_trampoline_end[]; +extern u8 uretprobe_syscall_check[]; + +void *arch_uprobe_trampoline(unsigned long *psize) +{ + static uprobe_opcode_t insn = UPROBE_SWBP_INSN; + struct pt_regs *regs = task_pt_regs(current); + + /* + * At the moment the uretprobe syscall trampoline is supported + * only for native 64-bit process, the compat process still uses + * standard breakpoint. + */ + if (user_64bit_mode(regs)) { + *psize = uretprobe_trampoline_end - uretprobe_trampoline_entry; + return uretprobe_trampoline_entry; + } + + *psize = UPROBE_SWBP_INSN_SIZE; + return &insn; +} + +static unsigned long trampoline_check_ip(unsigned long tramp) +{ + return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry); +} + +SYSCALL_DEFINE0(uretprobe) +{ + struct pt_regs *regs = task_pt_regs(current); + unsigned long err, ip, sp, r11_cx_ax[3], tramp; + + /* If there's no trampoline, we are called from wrong place. */ + tramp = uprobe_get_trampoline_vaddr(); + if (unlikely(tramp == UPROBE_NO_TRAMPOLINE_VADDR)) + goto sigill; + + /* Make sure the ip matches the only allowed sys_uretprobe caller. */ + if (unlikely(regs->ip != trampoline_check_ip(tramp))) + goto sigill; + + err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax)); + if (err) + goto sigill; + + /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */ + regs->r11 = r11_cx_ax[0]; + regs->cx = r11_cx_ax[1]; + regs->ax = r11_cx_ax[2]; + regs->sp += sizeof(r11_cx_ax); + regs->orig_ax = -1; + + ip = regs->ip; + sp = regs->sp; + + uprobe_handle_trampoline(regs); + + /* + * Some of the uprobe consumers has changed sp, we can do nothing, + * just return via iret. + * .. or shadow stack is enabled, in which case we need to skip + * return through the user space stack address. + */ + if (regs->sp != sp || shstk_is_enabled()) + return regs->ax; + regs->sp -= sizeof(r11_cx_ax); + + /* for the case uprobe_consumer has changed r11/cx */ + r11_cx_ax[0] = regs->r11; + r11_cx_ax[1] = regs->cx; + + /* + * ax register is passed through as return value, so we can use + * its space on stack for ip value and jump to it through the + * trampoline's ret instruction + */ + r11_cx_ax[2] = regs->ip; + regs->ip = ip; + + err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax)); + if (err) + goto sigill; + + /* ensure sysret, see do_syscall_64() */ + regs->r11 = regs->flags; + regs->cx = regs->ip; + + return regs->ax; + +sigill: + force_sig(SIGILL); + return -1; +} + /* * If arch_uprobe->insn doesn't use rip-relative addressing, return * immediately. Otherwise, rewrite the instruction so that it accesses @@ -719,6 +840,11 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) insn_byte_t p; int i; + /* x86_nops[insn->length]; same as jmp with .offs = 0 */ + if (insn->length <= ASM_NOP_MAX && + !memcmp(insn->kaddr, x86_nops[insn->length], insn->length)) + goto setup; + switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ @@ -1076,8 +1202,13 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return orig_ret_vaddr; nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); - if (likely(!nleft)) + if (likely(!nleft)) { + if (shstk_update_last_frame(trampoline_vaddr)) { + force_sig(SIGSEGV); + return -1; + } return orig_ret_vaddr; + } if (nleft != rasize) { pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n", diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 1258a5872d12..37ad43792452 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -29,8 +29,12 @@ */ #include <asm/cpufeatures.h> +#include <asm/cpufeaturemasks.h> #include <asm/msr-index.h> +#define SSE_MASK \ + (REQUIRED_MASK0 & ((1<<(X86_FEATURE_XMM & 31)) | (1<<(X86_FEATURE_XMM2 & 31)))) + SYM_FUNC_START_LOCAL(verify_cpu) pushf # Save caller passed flags push $0 # Kill any dangerous flags diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e9e803a4d44c..e6cc84143f3e 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -246,9 +246,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) /* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */ if (v.flags & VM86_SCREEN_BITMAP) { - char comm[TASK_COMM_LEN]; - - pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current)); + pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", + current->comm); return -EINVAL; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 56451fd2099e..4fa0be732af1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -15,11 +15,7 @@ * put it inside the section definition. */ -#ifdef CONFIG_X86_32 -#define LOAD_OFFSET __PAGE_OFFSET -#else #define LOAD_OFFSET __START_KERNEL_map -#endif #define RUNTIME_DISCARD_EXIT #define EMITS_PT_NOTE @@ -32,6 +28,7 @@ #include <asm/orc_lookup.h> #include <asm/cache.h> #include <asm/boot.h> +#include <asm/kexec.h> #undef i386 /* in case the preprocessor is a 32bit one */ @@ -46,7 +43,8 @@ ENTRY(phys_startup_64) #endif jiffies = jiffies_64; -const_pcpu_hot = pcpu_hot; +const_current_task = current_task; +const_cpu_current_top_of_stack = cpu_current_top_of_stack; #if defined(CONFIG_X86_64) /* @@ -81,11 +79,13 @@ const_pcpu_hot = pcpu_hot; #define BSS_DECRYPTED \ . = ALIGN(PMD_SIZE); \ __start_bss_decrypted = .; \ + __pi___start_bss_decrypted = .; \ *(.bss..decrypted); \ . = ALIGN(PAGE_SIZE); \ __start_bss_decrypted_unused = .; \ . = ALIGN(PMD_SIZE); \ __end_bss_decrypted = .; \ + __pi___end_bss_decrypted = .; \ #else @@ -99,46 +99,39 @@ const_pcpu_hot = pcpu_hot; #define BSS_DECRYPTED #endif - +#if defined(CONFIG_X86_64) && defined(CONFIG_KEXEC_CORE) +#define KEXEC_RELOCATE_KERNEL \ + . = ALIGN(0x100); \ + __relocate_kernel_start = .; \ + *(.text..relocate_kernel); \ + *(.data..relocate_kernel); \ + __relocate_kernel_end = .; + +ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX_SIZE, + "relocate_kernel code too large!") +#else +#define KEXEC_RELOCATE_KERNEL +#endif PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(6); /* RW_ */ -#ifdef CONFIG_X86_64 -#ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(6); /* RW_ */ -#endif - init PT_LOAD FLAGS(7); /* RWE */ -#endif note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { + . = __START_KERNEL; #ifdef CONFIG_X86_32 - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET); #else - . = __START_KERNEL; phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET); #endif /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; + __pi__text = .; _stext = .; - /* bootstrapping code */ - HEAD_TEXT - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - SOFTIRQENTRY_TEXT -#ifdef CONFIG_MITIGATION_RETPOLINE - *(.text..__x86.indirect_thunk) - *(.text..__x86.return_thunk) -#endif - STATIC_CALL_TEXT - ALIGN_ENTRY_TEXT_BEGIN *(.text..__x86.rethunk_untrain) ENTRY_TEXT @@ -152,10 +145,26 @@ SECTIONS *(.text..__x86.rethunk_safe) #endif ALIGN_ENTRY_TEXT_END + + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + SOFTIRQENTRY_TEXT +#ifdef CONFIG_MITIGATION_RETPOLINE + *(.text..__x86.indirect_thunk) + *(.text..__x86.return_thunk) +#endif + STATIC_CALL_TEXT *(.gnu.warning) } :text = 0xcccccccc + /* bootstrapping code */ + .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) { + HEAD_TEXT + } :text = 0xcccccccc + /* End of text section, which should occupy whole number of pages */ _etext = .; . = ALIGN(PAGE_SIZE); @@ -172,6 +181,9 @@ SECTIONS /* init_task */ INIT_TASK_DATA(THREAD_SIZE) + /* equivalent to task_pt_regs(&init_task) */ + __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE; + #ifdef CONFIG_X86_32 /* 32 bit has nosave before _edata */ NOSAVE_DATA @@ -179,10 +191,13 @@ SECTIONS PAGE_ALIGNED_DATA(PAGE_SIZE) + CACHE_HOT_DATA(L1_CACHE_BYTES) + CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES) DATA_DATA CONSTRUCTORS + KEXEC_RELOCATE_KERNEL /* rarely changed data like cpu maps */ READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES) @@ -195,50 +210,13 @@ SECTIONS ORC_UNWIND_TABLE - . = ALIGN(PAGE_SIZE); - __vvar_page = .; - - .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { - /* work around gold bug 13023 */ - __vvar_beginning_hack = .; - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) \ - . = __vvar_beginning_hack + offset; \ - *(.vvar_ ## name) -#include <asm/vvar.h> -#undef EMIT_VVAR - - /* - * Pad the rest of the page with zeros. Otherwise the loader - * can leave garbage here. - */ - . = __vvar_beginning_hack + PAGE_SIZE; - } :data - - . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); - /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { __init_begin = .; /* paired with __init_end */ } -#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - .init.text - should - * start another segment - init. - */ - PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) - ASSERT(SIZEOF(.data..percpu) < CONFIG_PHYSICAL_START, - "per-CPU data too large - increase CONFIG_PHYSICAL_START") -#endif - INIT_TEXT_SECTION(PAGE_SIZE) -#ifdef CONFIG_X86_64 - :init -#endif /* * Section for code used exclusively before alternatives are run. All @@ -355,9 +333,11 @@ SECTIONS EXIT_DATA } -#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU_SECTION(INTERNODE_CACHE_BYTES) -#endif + PERCPU_SECTION(L1_CACHE_BYTES) + ASSERT(__per_cpu_hot_end - __per_cpu_hot_start <= 64, "percpu cache hot data too large") + + RUNTIME_CONST_VARIABLES + RUNTIME_CONST(ptr, USER_PTR_MAX) . = ALIGN(PAGE_SIZE); @@ -414,6 +394,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; + __pi__end = .; #ifdef CONFIG_AMD_MEM_ENCRYPT /* @@ -442,6 +423,10 @@ SECTIONS STABS_DEBUG DWARF_DEBUG +#ifdef CONFIG_PROPELLER_CLANG + .llvm_bb_addr_map : { *(.llvm_bb_addr_map) } +#endif + ELF_DETAILS DISCARDS @@ -485,25 +470,23 @@ SECTIONS } /* - * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + * COMPILE_TEST kernels can be large - CONFIG_KASAN, for example, can cause + * this. Let's assume that nobody will be running a COMPILE_TEST kernel and + * let's assert that fuller build coverage is more valuable than being able to + * run a COMPILE_TEST kernel. + */ +#ifndef CONFIG_COMPILE_TEST +/* + * The ASSERT() sync to . is intentional, for binutils 2.14 compatibility: */ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); +#endif + +/* needed for Clang - see arch/x86/entry/entry.S */ +PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); #ifdef CONFIG_X86_64 -/* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ -#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load -INIT_PER_CPU(gdt_page); -INIT_PER_CPU(fixed_percpu_data); -INIT_PER_CPU(irq_stack_backing_store); - -#ifdef CONFIG_SMP -. = ASSERT((fixed_percpu_data == 0), - "fixed_percpu_data is not at start of per-cpu area"); -#endif #ifdef CONFIG_MITIGATION_UNRET_ENTRY . = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned"); @@ -526,4 +509,29 @@ INIT_PER_CPU(irq_stack_backing_store); "SRSO function pair won't alias"); #endif +#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B) +. = ASSERT(__x86_indirect_its_thunk_rax & 0x20, "__x86_indirect_thunk_rax not in second half of cacheline"); +. = ASSERT(((__x86_indirect_its_thunk_rcx - __x86_indirect_its_thunk_rax) % 64) == 0, "Indirect thunks are not cacheline apart"); +. = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array"); +#endif + +#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B) +. = ASSERT(its_return_thunk & 0x20, "its_return_thunk not in second half of cacheline"); +#endif + #endif /* CONFIG_X86_64 */ + +/* + * The symbols below are referenced using relative relocations in the + * respective ELF notes. This produces build time constants that the + * linker will never mark as relocatable. (Using just ABSOLUTE() is not + * sufficient for that). + */ +#ifdef CONFIG_XEN_PV +xen_elfnote_entry_value = + ABSOLUTE(xen_elfnote_entry) + ABSOLUTE(startup_xen); +#endif +#ifdef CONFIG_PVH +xen_elfnote_phys32_entry_value = + ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET); +#endif diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index a42830dc151b..0a2bbd674a6d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -3,10 +3,12 @@ * * For licencing details see kernel-base/COPYING */ +#include <linux/dmi.h> #include <linux/init.h> #include <linux/ioport.h> #include <linux/export.h> #include <linux/pci.h> +#include <linux/acpi.h> #include <asm/acpi.h> #include <asm/bios_ebda.h> @@ -66,6 +68,7 @@ struct x86_init_ops x86_init __initdata = { .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, .memory_setup = e820__memory_setup_default, + .dmi_setup = dmi_setup, }, .mpparse = { @@ -132,10 +135,12 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; -static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; } -static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; } +static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return 0; } +static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; } static bool enc_tlb_flush_required_noop(bool enc) { return false; } static bool enc_cache_flush_required_noop(void) { return false; } +static void enc_kexec_begin_noop(void) {} +static void enc_kexec_finish_noop(void) {} static bool is_private_mmio_noop(u64 addr) {return false; } struct x86_platform_ops x86_platform __ro_after_init = { @@ -159,6 +164,8 @@ struct x86_platform_ops x86_platform __ro_after_init = { .enc_status_change_finish = enc_status_change_finish_noop, .enc_tlb_flush_required = enc_tlb_flush_required_noop, .enc_cache_flush_required = enc_cache_flush_required_noop, + .enc_kexec_begin = enc_kexec_begin_noop, + .enc_kexec_finish = enc_kexec_finish_noop, }, }; |