Diffstat (limited to 'arch/x86/kernel')
61 files changed, 2223 insertions, 1980 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 79076d75bdbf..05110c1097ae 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -123,6 +123,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o +obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o ifdef CONFIG_FRAME_POINTER obj-y += unwind_frame.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8a5abaa7d453..4764fa56924d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -76,6 +76,7 @@ int acpi_fix_pin2_polarity __initdata; static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #endif +#ifdef CONFIG_X86_IO_APIC /* * Locks related to IOAPIC hotplug * Hotplug side: @@ -88,6 +89,7 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; * ->ioapic_lock */ static DEFINE_MUTEX(acpi_ioapic_lock); +#endif /* -------------------------------------------------------------------------- Boot-time Configuration @@ -454,6 +456,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + acpi_penalize_sci_irq(bus_irq, trigger, polarity); /* * stash over-ride to indicate we've been here diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4fdf6230d93c..458da8509b75 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -13,8 +13,20 @@ #include <linux/spinlock.h> #include <asm/amd_nb.h> +#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 +#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 +#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 + +/* Protect the PCI config register pairs used for SMN and DF indirect access. 
*/ +static DEFINE_MUTEX(smn_mutex); + static u32 *flush_words; +static const struct pci_device_id amd_root_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, + {} +}; + const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, @@ -24,9 +36,10 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; -EXPORT_SYMBOL(amd_nb_misc_ids); +EXPORT_SYMBOL_GPL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, @@ -34,6 +47,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, {} }; @@ -44,8 +58,25 @@ const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { { } }; -struct amd_northbridge_info amd_northbridges; -EXPORT_SYMBOL(amd_northbridges); +static struct amd_northbridge_info amd_northbridges; + +u16 amd_nb_num(void) +{ + return amd_northbridges.num; +} +EXPORT_SYMBOL_GPL(amd_nb_num); + +bool amd_nb_has_feature(unsigned int feature) +{ + return ((amd_northbridges.flags & feature) == feature); +} +EXPORT_SYMBOL_GPL(amd_nb_has_feature); + +struct amd_northbridge *node_to_amd_nb(int node) +{ + return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; +} +EXPORT_SYMBOL_GPL(node_to_amd_nb); static struct pci_dev *next_northbridge(struct pci_dev *dev, const struct pci_device_id *ids) @@ -58,13 +89,106 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev, return dev; } +static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) +{ + struct pci_dev *root; + int err = -ENODEV; + + if (node >= amd_northbridges.num) + goto out; + + root = node_to_amd_nb(node)->root; + if (!root) + goto out; + + mutex_lock(&smn_mutex); + + err = pci_write_config_dword(root, 0x60, address); + if (err) { + pr_warn("Error programming SMN address 0x%x.\n", address); + goto out_unlock; + } + + err = (write ? pci_write_config_dword(root, 0x64, *value) + : pci_read_config_dword(root, 0x64, value)); + if (err) + pr_warn("Error %s SMN address 0x%x.\n", + (write ? "writing to" : "reading from"), address); + +out_unlock: + mutex_unlock(&smn_mutex); + +out: + return err; +} + +int amd_smn_read(u16 node, u32 address, u32 *value) +{ + return __amd_smn_rw(node, address, value, false); +} +EXPORT_SYMBOL_GPL(amd_smn_read); + +int amd_smn_write(u16 node, u32 address, u32 value) +{ + return __amd_smn_rw(node, address, &value, true); +} +EXPORT_SYMBOL_GPL(amd_smn_write); + +/* + * Data Fabric Indirect Access uses FICAA/FICAD. + * + * Fabric Indirect Configuration Access Address (FICAA): Constructed based + * on the device's Instance Id and the PCI function and register offset of + * the desired register. + * + * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO + * and FICAD HI registers but so far we only need the LO register. 
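For context, the two indirect-access helpers introduced above share one index/data idiom: amd_smn_read()/amd_smn_write() go through the root bridge register pair 0x60/0x64, and amd_df_indirect_read() (below) builds a FICAA word and goes through 0x5C/0x98, all serialized by smn_mutex. A minimal sketch of a caller of the new SMN interface; MY_SMN_REG and smn_dump_all_nodes() are placeholders for illustration, not anything defined by this patch:

    #include <linux/pci.h>
    #include <asm/amd_nb.h>

    #define MY_SMN_REG 0x12345678   /* made-up SMN register offset, illustration only */

    static void smn_dump_all_nodes(void)
    {
            u32 val;
            u16 node;

            for (node = 0; node < amd_nb_num(); node++) {
                    /* serialized against other SMN/DF users by smn_mutex */
                    if (amd_smn_read(node, MY_SMN_REG, &val))
                            continue;
                    pr_info("node %u: SMN 0x%x = 0x%08x\n", node, MY_SMN_REG, val);
            }
    }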
+ */ +int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) +{ + struct pci_dev *F4; + u32 ficaa; + int err = -ENODEV; + + if (node >= amd_northbridges.num) + goto out; + + F4 = node_to_amd_nb(node)->link; + if (!F4) + goto out; + + ficaa = 1; + ficaa |= reg & 0x3FC; + ficaa |= (func & 0x7) << 11; + ficaa |= instance_id << 16; + + mutex_lock(&smn_mutex); + + err = pci_write_config_dword(F4, 0x5C, ficaa); + if (err) { + pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); + goto out_unlock; + } + + err = pci_read_config_dword(F4, 0x98, lo); + if (err) + pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); + +out_unlock: + mutex_unlock(&smn_mutex); + +out: + return err; +} +EXPORT_SYMBOL_GPL(amd_df_indirect_read); + int amd_cache_northbridges(void) { u16 i = 0; struct amd_northbridge *nb; - struct pci_dev *misc, *link; + struct pci_dev *root, *misc, *link; - if (amd_nb_num()) + if (amd_northbridges.num) return 0; misc = NULL; @@ -74,15 +198,17 @@ int amd_cache_northbridges(void) if (!i) return -ENODEV; - nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); + nb = kcalloc(i, sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) return -ENOMEM; amd_northbridges.nb = nb; amd_northbridges.num = i; - link = misc = NULL; - for (i = 0; i != amd_nb_num(); i++) { + link = misc = root = NULL; + for (i = 0; i != amd_northbridges.num; i++) { + node_to_amd_nb(i)->root = root = + next_northbridge(root, amd_root_ids); node_to_amd_nb(i)->misc = misc = next_northbridge(misc, amd_nb_misc_ids); node_to_amd_nb(i)->link = link = @@ -139,13 +265,13 @@ struct resource *amd_get_mmconfig_range(struct resource *res) { u32 address; u64 base, msr; - unsigned segn_busn_bits; + unsigned int segn_busn_bits; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return NULL; /* assume all cpus from fam10h have mmconfig */ - if (boot_cpu_data.x86 < 0x10) + if (boot_cpu_data.x86 < 0x10) return NULL; address = MSR_FAM10H_MMIO_CONF_BASE; @@ -226,14 +352,14 @@ static void amd_cache_gart(void) if (!amd_nb_has_feature(AMD_NB_GART)) return; - flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL); if (!flush_words) { amd_northbridges.flags &= ~AMD_NB_GART; pr_notice("Cannot initialize GART flush words, GART support disabled\n"); return; } - for (i = 0; i != amd_nb_num(); i++) + for (i = 0; i != amd_northbridges.num; i++) pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]); } @@ -246,18 +372,20 @@ void amd_flush_garts(void) if (!amd_nb_has_feature(AMD_NB_GART)) return; - /* Avoid races between AGP and IOMMU. In theory it's not needed - but I'm not sure if the hardware won't lose flush requests - when another is pending. This whole thing is so expensive anyways - that it doesn't matter to serialize more. -AK */ + /* + * Avoid races between AGP and IOMMU. In theory it's not needed + * but I'm not sure if the hardware won't lose flush requests + * when another is pending. This whole thing is so expensive anyways + * that it doesn't matter to serialize more. 
-AK + */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < amd_nb_num(); i++) { + for (i = 0; i < amd_northbridges.num; i++) { pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, flush_words[i] | 1); flushed++; } - for (i = 0; i < amd_nb_num(); i++) { + for (i = 0; i < amd_northbridges.num; i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 88c657b057e2..bb47e5eacd44 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -48,7 +48,6 @@ #include <asm/io_apic.h> #include <asm/desc.h> #include <asm/hpet.h> -#include <asm/idle.h> #include <asm/mtrr.h> #include <asm/time.h> #include <asm/smp.h> @@ -894,11 +893,13 @@ void __init setup_boot_APIC_clock(void) /* Setup the lapic or request the broadcast */ setup_APIC_timer(); + amd_e400_c1e_apic_setup(); } void setup_secondary_APIC_clock(void) { setup_APIC_timer(); + amd_e400_c1e_apic_setup(); } /* @@ -2263,6 +2264,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { /* Should happen once for each apic */ WARN_ON((*drv)->eoi_write == eoi_write); + (*drv)->native_eoi_write = (*drv)->eoi_write; (*drv)->eoi_write = eoi_write; } } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 48e6d84f173e..945e512a112a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -48,7 +48,6 @@ #include <linux/bootmem.h> #include <asm/irqdomain.h> -#include <asm/idle.h> #include <asm/io.h> #include <asm/smp.h> #include <asm/cpu.h> diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index aeef53ce93e1..35690a168cf7 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -815,9 +815,9 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) l = li; } addr1 = (base << shift) + - f * (unsigned long)(1 << m_io); + f * (1ULL << m_io); addr2 = (base << shift) + - (l + 1) * (unsigned long)(1 << m_io); + (l + 1) * (1ULL << m_io); pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2); if (max_io < l) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index c7364bd633e1..643818a7688b 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -906,14 +906,14 @@ static int apm_cpu_idle(struct cpuidle_device *dev, static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ static unsigned int last_stime; /* = 0 */ - cputime_t stime; + cputime_t stime, utime; int apm_idle_done = 0; unsigned int jiffies_since_last_check = jiffies - last_jiffies; unsigned int bucket; recalc: - task_cputime(current, NULL, &stime); + task_cputime(current, &utime, &stime); if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; } else if (jiffies_since_last_check > idle_period) { @@ -1042,8 +1042,11 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) if (apm_info.get_power_status_broken) return APM_32_UNSUPPORTED; - if (apm_bios_call(&call)) + if (apm_bios_call(&call)) { + if (!call.err) + return APM_NO_ERROR; return call.err; + } *status = call.ebx; *bat = call.ecx; if (apm_info.get_power_status_swabinminutes) { diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4a8697f7d4ef..33b63670bf09 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ 
-20,13 +20,11 @@ obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += common.o obj-y += rdrand.o obj-y += match.o +obj-y += bugs.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_X86_32) += bugs.o -obj-$(CONFIG_X86_64) += bugs_64.o - obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b81fe2d63e15..71cae73a5076 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -20,6 +20,10 @@ #include "cpu.h" +static const int amd_erratum_383[]; +static const int amd_erratum_400[]; +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); + /* * nodes_per_socket: Stores the number of nodes per socket. * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX @@ -314,11 +318,30 @@ static void amd_get_topology(struct cpuinfo_x86 *c) smp_num_siblings = ((ebx >> 8) & 3) + 1; c->x86_max_cores /= smp_num_siblings; c->cpu_core_id = ebx & 0xff; + + /* + * We may have multiple LLCs if L3 caches exist, so check if we + * have an L3 cache by looking at the L3 cache CPUID leaf. + */ + if (cpuid_edx(0x80000006)) { + if (c->x86 == 0x17) { + /* + * LLC is at the core complex level. + * Core complex id is ApicId[3]. + */ + per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + } else { + /* LLC is at the node level. */ + per_cpu(cpu_llc_id, cpu) = node_id; + } + } } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); node_id = value & 7; + + per_cpu(cpu_llc_id, cpu) = node_id; } else return; @@ -329,9 +352,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_AMD_DCM); cus_per_node = c->x86_max_cores / nodes_per_socket; - /* store NodeID, use llc_shared_map to store sibling info */ - per_cpu(cpu_llc_id, cpu) = node_id; - /* core id has to be in the [0 .. cores_per_node - 1] range */ c->cpu_core_id %= cus_per_node; } @@ -347,7 +367,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); - unsigned int socket_id, core_complex_id; bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ @@ -357,18 +376,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; amd_get_topology(c); - - /* - * Fix percpu cpu_llc_id here as LLC topology is different - * for Fam17h systems. - */ - if (c->x86 != 0x17 || !cpuid_edx(0x80000006)) - return; - - socket_id = (c->apicid >> bits) - 1; - core_complex_id = (c->apicid & ((1 << bits) - 1)) >> 3; - - per_cpu(cpu_llc_id, cpu) = (socket_id << 3) | core_complex_id; #endif } @@ -589,11 +596,16 @@ static void early_init_amd(struct cpuinfo_x86 *c) /* F16h erratum 793, CVE-2013-6885 */ if (c->x86 == 0x16 && c->x86_model <= 0xf) msr_set_bit(MSR_AMD64_LS_CFG, 15); -} -static const int amd_erratum_383[]; -static const int amd_erratum_400[]; -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); + /* + * Check whether the machine is affected by erratum 400. This is + * used to select the proper idle routine and to enable the check + * whether the machine is affected in arch_post_acpi_init(), which + * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. 
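Referring back to the amd_get_topology() hunk above: on Fam17h the last level cache is per core complex rather than per node, and the complex id is ApicId[3], so cpu_llc_id becomes apicid >> 3. A tiny illustrative helper, not part of the patch, with made-up APIC id values:

    /* Mirrors the Fam17h branch in amd_get_topology(); illustrative only. */
    static u16 fam17h_llc_id(u16 apicid)
    {
            /* Core complex id is ApicId[3]; each complex is one L3 domain. */
            return apicid >> 3;
    }

    /*
     * e.g. apicid 0x05 -> LLC id 0 (shares L3 with apicid 0x00..0x07),
     *      apicid 0x0a -> LLC id 1, apicid 0x13 -> LLC id 2.
     * Older families with an L3 use the node id; parts reporting no L3
     * via CPUID 0x80000006 EDX keep the socket id set in amd_detect_cmp().
     */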
+ */ + if (cpu_has_amd_erratum(c, amd_erratum_400)) + set_cpu_bug(c, X86_BUG_AMD_E400); +} static void init_amd_k8(struct cpuinfo_x86 *c) { @@ -774,9 +786,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); - if (cpu_has_amd_erratum(c, amd_erratum_400)) - set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); - rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); /* 3DNow or LM implies PREFETCHW */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bd17db15a2c1..a44ef52184df 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,15 +16,19 @@ #include <asm/msr.h> #include <asm/paravirt.h> #include <asm/alternative.h> +#include <asm/pgtable.h> +#include <asm/cacheflush.h> void __init check_bugs(void) { identify_boot_cpu(); -#ifndef CONFIG_SMP - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif + if (!IS_ENABLED(CONFIG_SMP)) { + pr_info("CPU: "); + print_cpu_info(&boot_cpu_data); + } + +#ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. * @@ -40,4 +44,18 @@ void __init check_bugs(void) alternative_instructions(); fpu__init_check_bugs(); +#else /* CONFIG_X86_64 */ + alternative_instructions(); + + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); +#endif } diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c deleted file mode 100644 index a972ac4c7e7d..000000000000 --- a/arch/x86/kernel/cpu/bugs_64.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2000 SuSE - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <asm/alternative.h> -#include <asm/bugs.h> -#include <asm/processor.h> -#include <asm/mtrr.h> -#include <asm/cacheflush.h> - -void __init check_bugs(void) -{ - identify_boot_cpu(); -#if !defined(CONFIG_SMP) - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif - alternative_instructions(); - - /* - * Make sure the first 2MB area is not mapped by huge pages - * There are typically fixed size MTRRs in there and overlapping - * MTRRs into large pages causes slow downs. - * - * Right now we don't do that with gbpages because there seems - * very little benefit for that case. - */ - if (!direct_gbpages) - set_memory_4k((unsigned long)__va(0), 1); -} diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9bd910a7dd0a..729f92ba8224 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -979,6 +979,35 @@ static void x86_init_cache_qos(struct cpuinfo_x86 *c) } /* + * The physical to logical package id mapping is initialized from the + * acpi/mptables information. Make sure that CPUID actually agrees with + * that. + */ +static void sanitize_package_id(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int pkg, apicid, cpu = smp_processor_id(); + + apicid = apic->cpu_present_to_apicid(cpu); + pkg = apicid >> boot_cpu_data.x86_coreid_bits; + + if (apicid != c->initial_apicid) { + pr_err(FW_BUG "CPU%u: APIC id mismatch. 
Firmware: %x CPUID: %x\n", + cpu, apicid, c->initial_apicid); + c->initial_apicid = apicid; + } + if (pkg != c->phys_proc_id) { + pr_err(FW_BUG "CPU%u: Using firmware package id %u instead of %u\n", + cpu, pkg, c->phys_proc_id); + c->phys_proc_id = pkg; + } + c->logical_proc_id = topology_phys_to_logical_pkg(pkg); +#else + c->logical_proc_id = 0; +#endif +} + +/* * This does the hard work of actually picking apart the CPU stuff... */ static void identify_cpu(struct cpuinfo_x86 *c) @@ -1103,8 +1132,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif - /* The boot/hotplug time assigment got cleared, restore it */ - c->logical_proc_id = topology_phys_to_logical_pkg(c->phys_proc_id); + sanitize_package_id(c); } /* @@ -1144,7 +1172,6 @@ void enable_sep_cpu(void) void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); - init_amd_e400_c1e_mask(); #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); @@ -1162,51 +1189,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) mtrr_ap_init(); } -struct msr_range { - unsigned min; - unsigned max; -}; - -static const struct msr_range msr_range_array[] = { - { 0x00000000, 0x00000418}, - { 0xc0000000, 0xc000040b}, - { 0xc0010000, 0xc0010142}, - { 0xc0011000, 0xc001103b}, -}; - -static void __print_cpu_msr(void) -{ - unsigned index_min, index_max; - unsigned index; - u64 val; - int i; - - for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { - index_min = msr_range_array[i].min; - index_max = msr_range_array[i].max; - - for (index = index_min; index < index_max; index++) { - if (rdmsrl_safe(index, &val)) - continue; - pr_info(" MSR%08x: %016llx\n", index, val); - } - } -} - -static int show_msr; - -static __init int setup_show_msr(char *arg) -{ - int num; - - get_option(&arg, &num); - - if (num > 0) - show_msr = num; - return 1; -} -__setup("show_msr=", setup_show_msr); - static __init int setup_noclflush(char *arg) { setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); @@ -1240,14 +1222,6 @@ void print_cpu_info(struct cpuinfo_x86 *c) pr_cont(", stepping: 0x%x)\n", c->x86_mask); else pr_cont(")\n"); - - print_cpu_msr(c); -} - -void print_cpu_msr(struct cpuinfo_x86 *c) -{ - if (c->cpu_index < show_msr) - __print_cpu_msr(); } static __init int setup_disablecpuid(char *arg) @@ -1462,11 +1436,8 @@ void cpu_init(void) */ cr4_init_shadow(); - /* - * Load microcode on this cpu if a valid microcode is available. - * This is early microcode loading procedure. 
- */ - load_ucode_ap(); + if (cpu) + load_ucode_ap(); t = &per_cpu(cpu_tss, cpu); oist = &per_cpu(orig_ist, cpu); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 631356c8cca4..c7efbcfbeda6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -311,7 +311,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e *msg = s->msg; s->covered = 1; if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { - if (panic_on_oops || tolerant < 1) + if (tolerant < 1) return MCE_PANIC_SEVERITY; } return s->sev; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a7fdf453d895..132e1ec67da0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -43,6 +43,7 @@ #include <linux/export.h> #include <linux/jump_label.h> +#include <asm/intel-family.h> #include <asm/processor.h> #include <asm/traps.h> #include <asm/tlbflush.h> @@ -135,6 +136,9 @@ void mce_setup(struct mce *m) m->socketid = cpu_data(m->extcpu).phys_proc_id; m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); + + if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) + rdmsrl(MSR_PPIN, m->ppin); } DEFINE_PER_CPU(struct mce, injectm); @@ -207,8 +211,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log); static struct notifier_block mce_srao_nb; +static atomic_t num_notifiers; + void mce_register_decode_chain(struct notifier_block *nb) { + atomic_inc(&num_notifiers); + /* Ensure SRAO notifier has the highest priority in the decode chain. */ if (nb != &mce_srao_nb && nb->priority == INT_MAX) nb->priority -= 1; @@ -219,6 +227,8 @@ EXPORT_SYMBOL_GPL(mce_register_decode_chain); void mce_unregister_decode_chain(struct notifier_block *nb) { + atomic_dec(&num_notifiers); + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); @@ -270,17 +280,17 @@ struct mca_msr_regs msr_ops = { .misc = misc_reg }; -static void print_mce(struct mce *m) +static void __print_mce(struct mce *m) { - int ret = 0; - - pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", - m->extcpu, m->mcgstatus, m->bank, m->status); + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", + m->extcpu, + (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), + m->mcgstatus, m->bank, m->status); if (m->ip) { pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); + m->cs, m->ip); if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); @@ -308,6 +318,13 @@ static void print_mce(struct mce *m) pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, cpu_data(m->extcpu).microcode); +} + +static void print_mce(struct mce *m) +{ + int ret = 0; + + __print_mce(m); /* * Print out human-readable details about the MCE error, @@ -569,6 +586,32 @@ static struct notifier_block mce_srao_nb = { .priority = INT_MAX, }; +static int mce_default_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + /* + * Run the default notifier if we have only the SRAO + * notifier and us registered. 
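The num_notifiers test just below exists because external decoders (EDAC drivers, for instance) hook into the same chain; once one of them is registered, the default raw dump becomes redundant. A minimal sketch of such an external decoder with illustrative names (the registration call is the one exported by this file):

    #include <linux/notifier.h>
    #include <asm/mce.h>

    static int demo_mce_decode(struct notifier_block *nb, unsigned long val,
                               void *data)
    {
            struct mce *m = data;

            if (!m)
                    return NOTIFY_DONE;

            pr_info("decoded MCE: bank %d, status 0x%llx, addr 0x%llx\n",
                    m->bank, m->status, m->addr);

            return NOTIFY_DONE;     /* let the rest of the chain run */
    }

    static struct notifier_block demo_mce_dec_nb = {
            .notifier_call = demo_mce_decode,
    };

    /* mce_register_decode_chain(&demo_mce_dec_nb) bumps num_notifiers,
     * so the default printer steps aside. */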
+ */ + if (atomic_read(&num_notifiers) > 2) + return NOTIFY_DONE; + + __print_mce(m); + + return NOTIFY_DONE; +} + +static struct notifier_block mce_default_nb = { + .notifier_call = mce_default_notifier, + /* lowest prio, we want it to run last. */ + .priority = 0, +}; + /* * Read ADDR and MISC registers. */ @@ -667,6 +710,15 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_gather_info(&m, NULL); + /* + * m.tsc was set in mce_setup(). Clear it if not requested. + * + * FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid + * that dance. + */ + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; @@ -674,14 +726,12 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.misc = 0; m.addr = 0; m.bank = i; - m.tsc = 0; barrier(); m.status = mce_rdmsrl(msr_ops.status(i)); if (!(m.status & MCI_STATUS_VAL)) continue; - /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -696,9 +746,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_read_aux(&m, i); - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; - severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) @@ -1355,7 +1402,7 @@ static void mce_timer_fn(unsigned long data) iv = __this_cpu_read(mce_next_interval); if (mce_available(this_cpu_ptr(&cpu_info))) { - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks)); + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); if (mce_intel_cmci_poll()) { iv = mce_adjust_timer(iv); @@ -1745,6 +1792,14 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t) add_timer_on(t, cpu); } +static void __mcheck_cpu_setup_timer(void) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + unsigned int cpu = smp_processor_id(); + + setup_pinned_timer(t, mce_timer_fn, cpu); +} + static void __mcheck_cpu_init_timer(void) { struct timer_list *t = this_cpu_ptr(&mce_timer); @@ -1796,7 +1851,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_init_timer(); + __mcheck_cpu_setup_timer(); } /* @@ -2138,6 +2193,7 @@ int __init mcheck_init(void) { mcheck_intel_therm_init(); mce_register_decode_chain(&mce_srao_nb); + mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); INIT_WORK(&mce_work, mce_process_work); @@ -2255,8 +2311,6 @@ static struct bus_type mce_subsys = { DEFINE_PER_CPU(struct device *, mce_device); -void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); - static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) { return container_of(attr, struct mce_bank, attr); @@ -2409,6 +2463,10 @@ static int mce_device_create(unsigned int cpu) if (!mce_available(&boot_cpu_data)) return -EIO; + dev = per_cpu(mce_device, cpu); + if (dev) + return 0; + dev = kzalloc(sizeof *dev, GFP_KERNEL); if (!dev) return -ENOMEM; @@ -2468,28 +2526,25 @@ static void mce_device_remove(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. 
*/ -static void mce_disable_cpu(void *h) +static void mce_disable_cpu(void) { - unsigned long action = *(unsigned long *)h; - if (!mce_available(raw_cpu_ptr(&cpu_info))) return; - if (!(action & CPU_TASKS_FROZEN)) + if (!cpuhp_tasks_frozen) cmci_clear(); vendor_disable_error_reporting(); } -static void mce_reenable_cpu(void *h) +static void mce_reenable_cpu(void) { - unsigned long action = *(unsigned long *)h; int i; if (!mce_available(raw_cpu_ptr(&cpu_info))) return; - if (!(action & CPU_TASKS_FROZEN)) + if (!cpuhp_tasks_frozen) cmci_reenable(); for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; @@ -2499,45 +2554,43 @@ static void mce_reenable_cpu(void *h) } } -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int -mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +static int mce_cpu_dead(unsigned int cpu) +{ + mce_intel_hcpu_update(cpu); + + /* intentionally ignoring frozen here */ + if (!cpuhp_tasks_frozen) + cmci_rediscover(); + return 0; +} + +static int mce_cpu_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; struct timer_list *t = &per_cpu(mce_timer, cpu); + int ret; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - mce_device_create(cpu); - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - break; - case CPU_DEAD: - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - mce_device_remove(cpu); - mce_intel_hcpu_update(cpu); + mce_device_create(cpu); - /* intentionally ignoring frozen here */ - if (!(action & CPU_TASKS_FROZEN)) - cmci_rediscover(); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, mce_disable_cpu, &action, 1); - del_timer_sync(t); - break; - case CPU_DOWN_FAILED: - smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); - mce_start_timer(cpu, t); - break; + ret = mce_threshold_create_device(cpu); + if (ret) { + mce_device_remove(cpu); + return ret; } - - return NOTIFY_OK; + mce_reenable_cpu(); + mce_start_timer(cpu, t); + return 0; } -static struct notifier_block mce_cpu_notifier = { - .notifier_call = mce_cpu_callback, -}; +static int mce_cpu_pre_down(unsigned int cpu) +{ + struct timer_list *t = &per_cpu(mce_timer, cpu); + + mce_disable_cpu(); + del_timer_sync(t); + mce_threshold_remove_device(cpu); + mce_device_remove(cpu); + return 0; +} static __init void mce_init_banks(void) { @@ -2559,8 +2612,8 @@ static __init void mce_init_banks(void) static __init int mcheck_init_device(void) { + enum cpuhp_state hp_online; int err; - int i = 0; if (!mce_available(&boot_cpu_data)) { err = -EIO; @@ -2578,23 +2631,16 @@ static __init int mcheck_init_device(void) if (err) goto err_out_mem; - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - err = mce_device_create(i); - if (err) { - /* - * Register notifier anyway (and do not unreg it) so - * that we don't leave undeleted timers, see notifier - * callback above. 
- */ - __register_hotcpu_notifier(&mce_cpu_notifier); - cpu_notifier_register_done(); - goto err_device_create; - } - } + err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL, + mce_cpu_dead); + if (err) + goto err_out_mem; - __register_hotcpu_notifier(&mce_cpu_notifier); - cpu_notifier_register_done(); + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online", + mce_cpu_online, mce_cpu_pre_down); + if (err < 0) + goto err_out_online; + hp_online = err; register_syscore_ops(&mce_syscore_ops); @@ -2607,16 +2653,10 @@ static __init int mcheck_init_device(void) err_register: unregister_syscore_ops(&mce_syscore_ops); + cpuhp_remove_state(hp_online); -err_device_create: - /* - * We didn't keep track of which devices were created above, but - * even if we had, the set of online cpus might have changed. - * Play safe and remove for every possible cpu, since - * mce_device_remove() will do the right thing. - */ - for_each_possible_cpu(i) - mce_device_remove(i); +err_out_online: + cpuhp_remove_state(CPUHP_X86_MCE_DEAD); err_out_mem: free_cpumask_var(mce_device_initialized); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9b5403462936..ffacfdcacb85 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -24,7 +24,6 @@ #include <asm/amd_nb.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/trace/irq_vectors.h> @@ -55,6 +54,8 @@ /* Threshold LVT offset is at MSR0xC0000410[15:12] */ #define SMCA_THR_LVT_OFF 0xF000 +static bool thresholding_en; + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -69,7 +70,12 @@ static const char * const smca_umc_block_names[] = { "misc_umc" }; -struct smca_bank_name smca_bank_names[] = { +struct smca_bank_name { + const char *name; /* Short name for sysfs */ + const char *long_name; /* Long name for pretty-printing */ +}; + +static struct smca_bank_name smca_names[] = { [SMCA_LS] = { "load_store", "Load Store Unit" }, [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, @@ -84,9 +90,25 @@ struct smca_bank_name smca_bank_names[] = { [SMCA_PSP] = { "psp", "Platform Security Processor" }, [SMCA_SMU] = { "smu", "System Management Unit" }, }; -EXPORT_SYMBOL_GPL(smca_bank_names); -static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { +const char *smca_get_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].name; +} + +const char *smca_get_long_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].long_name; +} +EXPORT_SYMBOL_GPL(smca_get_long_name); + +static struct smca_hwid smca_hwid_mcatypes[] = { /* { bank_type, hwid_mcatype, xec_bitmap } */ /* ZN Core (HWID=0xB0) MCA types */ @@ -116,7 +138,7 @@ static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, }; -struct smca_bank_info smca_banks[MAX_NR_BANKS]; +struct smca_bank smca_banks[MAX_NR_BANKS]; EXPORT_SYMBOL_GPL(smca_banks); /* @@ -142,35 +164,34 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -/* - * CPU Initialization - */ - static void get_smca_bank_info(unsigned int bank) { unsigned int i, hwid_mcatype, cpu = smp_processor_id(); - struct smca_hwid_mcatype *type; - u32 high, instanceId; - u16 hwid, mcatype; + struct smca_hwid *s_hwid; + u32 high, instance_id; /* 
Collect bank_info using CPU 0 for now. */ if (cpu) return; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) { + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; } - hwid = high & MCI_IPID_HWID; - mcatype = (high & MCI_IPID_MCATYPE) >> 16; - hwid_mcatype = HWID_MCATYPE(hwid, mcatype); + hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID, + (high & MCI_IPID_MCATYPE) >> 16); for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { - type = &smca_hwid_mcatypes[i]; - if (hwid_mcatype == type->hwid_mcatype) { - smca_banks[bank].type = type; - smca_banks[bank].type_instance = instanceId; + s_hwid = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == s_hwid->hwid_mcatype) { + + WARN(smca_banks[bank].hwid, + "Bank %s already initialized!\n", + smca_get_name(s_hwid->bank_type)); + + smca_banks[bank].hwid = s_hwid; + smca_banks[bank].id = instance_id; break; } } @@ -533,6 +554,206 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } +int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) +{ + u64 dram_base_addr, dram_limit_addr, dram_hole_base; + /* We start from the normalized address */ + u64 ret_addr = norm_addr; + + u32 tmp; + + u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; + u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; + u8 intlv_addr_sel, intlv_addr_bit; + u8 num_intlv_bits, hashed_bit; + u8 lgcy_mmio_hole_en, base = 0; + u8 cs_mask, cs_id = 0; + bool hash_enabled = false; + + /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ + if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) + goto out_err; + + /* Remove HiAddrOffset from normalized address, if enabled: */ + if (tmp & BIT(0)) { + u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; + + if (norm_addr >= hi_addr_offset) { + ret_addr -= hi_addr_offset; + base = 1; + } + } + + /* Read D18F0x110 (DramBaseAddress). */ + if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) + goto out_err; + + /* Check if address range is valid. */ + if (!(tmp & BIT(0))) { + pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", + __func__, tmp); + goto out_err; + } + + lgcy_mmio_hole_en = tmp & BIT(1); + intlv_num_chan = (tmp >> 4) & 0xF; + intlv_addr_sel = (tmp >> 8) & 0x7; + dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; + + /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ + if (intlv_addr_sel > 3) { + pr_err("%s: Invalid interleave address select %d.\n", + __func__, intlv_addr_sel); + goto out_err; + } + + /* Read D18F0x114 (DramLimitAddress). 
*/ + if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) + goto out_err; + + intlv_num_sockets = (tmp >> 8) & 0x1; + intlv_num_dies = (tmp >> 10) & 0x3; + dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); + + intlv_addr_bit = intlv_addr_sel + 8; + + /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ + switch (intlv_num_chan) { + case 0: intlv_num_chan = 0; break; + case 1: intlv_num_chan = 1; break; + case 3: intlv_num_chan = 2; break; + case 5: intlv_num_chan = 3; break; + case 7: intlv_num_chan = 4; break; + + case 8: intlv_num_chan = 1; + hash_enabled = true; + break; + default: + pr_err("%s: Invalid number of interleaved channels %d.\n", + __func__, intlv_num_chan); + goto out_err; + } + + num_intlv_bits = intlv_num_chan; + + if (intlv_num_dies > 2) { + pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", + __func__, intlv_num_dies); + goto out_err; + } + + num_intlv_bits += intlv_num_dies; + + /* Add a bit if sockets are interleaved. */ + num_intlv_bits += intlv_num_sockets; + + /* Assert num_intlv_bits <= 4 */ + if (num_intlv_bits > 4) { + pr_err("%s: Invalid interleave bits %d.\n", + __func__, num_intlv_bits); + goto out_err; + } + + if (num_intlv_bits > 0) { + u64 temp_addr_x, temp_addr_i, temp_addr_y; + u8 die_id_bit, sock_id_bit, cs_fabric_id; + + /* + * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. + * This is the fabric id for this coherent slave. Use + * umc/channel# as instance id of the coherent slave + * for FICAA. + */ + if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) + goto out_err; + + cs_fabric_id = (tmp >> 8) & 0xFF; + die_id_bit = 0; + + /* If interleaved over more than 1 channel: */ + if (intlv_num_chan) { + die_id_bit = intlv_num_chan; + cs_mask = (1 << die_id_bit) - 1; + cs_id = cs_fabric_id & cs_mask; + } + + sock_id_bit = die_id_bit; + + /* Read D18F1x208 (SystemFabricIdMask). */ + if (intlv_num_dies || intlv_num_sockets) + if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) + goto out_err; + + /* If interleaved over more than 1 die. */ + if (intlv_num_dies) { + sock_id_bit = die_id_bit + intlv_num_dies; + die_id_shift = (tmp >> 24) & 0xF; + die_id_mask = (tmp >> 8) & 0xFF; + + cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; + } + + /* If interleaved over more than 1 socket. */ + if (intlv_num_sockets) { + socket_id_shift = (tmp >> 28) & 0xF; + socket_id_mask = (tmp >> 16) & 0xFF; + + cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; + } + + /* + * The pre-interleaved address consists of XXXXXXIIIYYYYY + * where III is the ID for this CS, and XXXXXXYYYYY are the + * address bits from the post-interleaved address. + * "num_intlv_bits" has been calculated to tell us how many "I" + * bits there are. "intlv_addr_bit" tells us how many "Y" bits + * there are (where "I" starts). + */ + temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); + temp_addr_i = (cs_id << intlv_addr_bit); + temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; + ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; + } + + /* Add dram base address */ + ret_addr += dram_base_addr; + + /* If legacy MMIO hole enabled */ + if (lgcy_mmio_hole_en) { + if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) + goto out_err; + + dram_hole_base = tmp & GENMASK(31, 24); + if (ret_addr >= dram_hole_base) + ret_addr += (BIT_ULL(32) - dram_hole_base); + } + + if (hash_enabled) { + /* Save some parentheses and grab ls-bit at the end. 
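The splice described in the XXXXXXIIIYYYYY comment above opens a num_intlv_bits wide hole at intlv_addr_bit and drops cs_id into it. A worked example with made-up values, illustration only:

    /*
     * intlv_addr_bit = 8, num_intlv_bits = 2, cs_id = 2,
     * normalized ret_addr = 0x12345:
     *
     *   temp_addr_y = 0x12345 & GENMASK_ULL(7, 0)         = 0x45
     *   temp_addr_i = 2 << 8                               = 0x200
     *   temp_addr_x = (0x12345 & GENMASK_ULL(63, 8)) << 2  = 0x48c00
     *
     *   pre-interleaved address = 0x48c00 | 0x200 | 0x45   = 0x48e45
     */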
*/ + hashed_bit = (ret_addr >> 12) ^ + (ret_addr >> 18) ^ + (ret_addr >> 21) ^ + (ret_addr >> 30) ^ + cs_id; + + hashed_bit &= BIT(0); + + if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) + ret_addr ^= BIT(intlv_addr_bit); + } + + /* Is calculated system address is above DRAM limit address? */ + if (ret_addr > dram_limit_addr) + goto out_err; + + *sys_addr = ret_addr; + return 0; + +out_err: + return -EINVAL; +} +EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); + static void __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) { @@ -645,6 +866,7 @@ static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; unsigned int bank, block, cpu = smp_processor_id(); + struct thresh_restart tr; /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { @@ -681,6 +903,11 @@ static void amd_threshold_interrupt(void) log: __log_error(bank, false, true, ((u64)high << 32) | low); + + /* Reset threshold block after logging error. */ + memset(&tr, 0, sizeof(tr)); + tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; + threshold_restart_bank(&tr); } /* @@ -826,10 +1053,10 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return th_names[bank]; } - if (!smca_banks[bank].type) + if (!smca_banks[bank].hwid) return NULL; - bank_type = smca_banks[bank].type->bank_type; + bank_type = smca_banks[bank].hwid->bank_type; if (b && bank_type == SMCA_UMC) { if (b->block < ARRAY_SIZE(smca_umc_block_names)) @@ -838,8 +1065,8 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) } snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, - "%s_%x", smca_bank_names[bank_type].name, - smca_banks[bank].type_instance); + "%s_%x", smca_get_name(bank_type), + smca_banks[bank].id); return buf_mcatype; } @@ -1010,31 +1237,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) return err; } -/* create dir/files for all valid threshold banks */ -static int threshold_create_device(unsigned int cpu) -{ - unsigned int bank; - struct threshold_bank **bp; - int err = 0; - - bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, - GFP_KERNEL); - if (!bp) - return -ENOMEM; - - per_cpu(threshold_banks, cpu) = bp; - - for (bank = 0; bank < mca_cfg.banks; ++bank) { - if (!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - err = threshold_create_bank(cpu, bank); - if (err) - return err; - } - - return err; -} - static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { @@ -1102,48 +1304,71 @@ free_out: per_cpu(threshold_banks, cpu)[bank] = NULL; } -static void threshold_remove_device(unsigned int cpu) +int mce_threshold_remove_device(unsigned int cpu) { unsigned int bank; + if (!thresholding_en) + return 0; + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); } kfree(per_cpu(threshold_banks, cpu)); + per_cpu(threshold_banks, cpu) = NULL; + return 0; } -/* get notified when a cpu comes on/off */ -static void -amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) +/* create dir/files for all valid threshold banks */ +int mce_threshold_create_device(unsigned int cpu) { - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - threshold_create_device(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - threshold_remove_device(cpu); - break; - default: - break; + unsigned int bank; + struct threshold_bank **bp; + int err = 0; + + if (!thresholding_en) + return 0; + + bp 
= per_cpu(threshold_banks, cpu); + if (bp) + return 0; + + bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, + GFP_KERNEL); + if (!bp) + return -ENOMEM; + + per_cpu(threshold_banks, cpu) = bp; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + err = threshold_create_bank(cpu, bank); + if (err) + goto err; } + return err; +err: + mce_threshold_remove_device(cpu); + return err; } static __init int threshold_init_device(void) { unsigned lcpu = 0; + if (mce_threshold_vector == amd_threshold_interrupt) + thresholding_en = true; + /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { - int err = threshold_create_device(lcpu); + int err = mce_threshold_create_device(lcpu); if (err) return err; } - threshold_cpu_callback = amd_64_threshold_cpu_callback; return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 1defb8ea882c..190b3e6cef4d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -11,6 +11,8 @@ #include <linux/sched.h> #include <linux/cpumask.h> #include <asm/apic.h> +#include <asm/cpufeature.h> +#include <asm/intel-family.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -130,7 +132,7 @@ bool mce_intel_cmci_poll(void) * Reset the counter if we've logged an error in the last poll * during the storm. */ - if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned))) + if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); else this_cpu_dec(cmci_backoff_cnt); @@ -342,7 +344,7 @@ void cmci_recheck(void) return; local_irq_save(flags); - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); + machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)); local_irq_restore(flags); } @@ -464,11 +466,46 @@ static void intel_clear_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val); } +static void intel_ppin_init(struct cpuinfo_x86 *c) +{ + unsigned long long val; + + /* + * Even if testing the presence of the MSR would be enough, we don't + * want to risk the situation where other models reuse this MSR for + * other purposes. + */ + switch (c->x86_model) { + case INTEL_FAM6_IVYBRIDGE_X: + case INTEL_FAM6_HASWELL_X: + case INTEL_FAM6_BROADWELL_XEON_D: + case INTEL_FAM6_BROADWELL_X: + case INTEL_FAM6_SKYLAKE_X: + if (rdmsrl_safe(MSR_PPIN_CTL, &val)) + return; + + if ((val & 3UL) == 1UL) { + /* PPIN available but disabled: */ + return; + } + + /* If PPIN is disabled, but not locked, try to enable: */ + if (!(val & 3UL)) { + wrmsrl_safe(MSR_PPIN_CTL, val | 2UL); + rdmsrl_safe(MSR_PPIN_CTL, &val); + } + + if ((val & 3UL) == 2UL) + set_cpu_cap(c, X86_FEATURE_INTEL_PPIN); + } +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); + intel_ppin_init(c); } void mce_intel_feature_clear(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6b9dc4d18ccc..465aca8be009 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -26,7 +26,6 @@ #include <asm/processor.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/trace/irq_vectors.h> @@ -271,58 +270,32 @@ static void thermal_throttle_remove_dev(struct device *dev) } /* Get notified when a cpu comes on/off. Be hotplug friendly. 
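The thermal-throttle and MCE hunks in this series all follow the same notifier-to-cpuhp conversion: the CPU_ONLINE/CPU_DEAD switch statement is replaced by a pair of per-CPU callbacks handed to cpuhp_setup_state(). The general shape, with placeholder names (demo_* is not from the patch):

    #include <linux/cpuhotplug.h>

    static int demo_online(unsigned int cpu)
    {
            /* set up this CPU's state; also invoked for CPUs already online */
            return 0;
    }

    static int demo_offline(unsigned int cpu)
    {
            /* tear down this CPU's state before it goes away */
            return 0;
    }

    static int __init demo_init(void)
    {
            int ret;

            ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/demo:online",
                                    demo_online, demo_offline);
            /* for dynamic AP states a positive return is the allocated state id */
            return ret < 0 ? ret : 0;
    }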
*/ -static int -thermal_throttle_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int thermal_throttle_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; - struct device *dev; - int err = 0; - - dev = get_cpu_device(cpu); - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - err = thermal_throttle_add_dev(dev, cpu); - WARN_ON(err); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - thermal_throttle_remove_dev(dev); - break; - } - return notifier_from_errno(err); + struct device *dev = get_cpu_device(cpu); + + return thermal_throttle_add_dev(dev, cpu); } -static struct notifier_block thermal_throttle_cpu_notifier = +static int thermal_throttle_offline(unsigned int cpu) { - .notifier_call = thermal_throttle_cpu_callback, -}; + struct device *dev = get_cpu_device(cpu); + + thermal_throttle_remove_dev(dev); + return 0; +} static __init int thermal_throttle_init_device(void) { - unsigned int cpu = 0; - int err; + int ret; if (!atomic_read(&therm_throt_en)) return 0; - cpu_notifier_register_begin(); - - /* connect live CPUs to sysfs */ - for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); - WARN_ON(err); - } - - __register_hotcpu_notifier(&thermal_throttle_cpu_notifier); - cpu_notifier_register_done(); - - return 0; + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", + thermal_throttle_online, + thermal_throttle_offline); + return ret < 0 ? ret : 0; } device_initcall(thermal_throttle_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index fcf9ae9384f4..9beb092d68a5 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -6,7 +6,6 @@ #include <asm/irq_vectors.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/trace/irq_vectors.h> diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile index 220b1a508513..ba12e8aa4a45 100644 --- a/arch/x86/kernel/cpu/microcode/Makefile +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -1,4 +1,4 @@ microcode-y := core.o obj-$(CONFIG_MICROCODE) += microcode.o -microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o +microcode-$(CONFIG_MICROCODE_INTEL) += intel.o microcode-$(CONFIG_MICROCODE_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 620ab06bcf45..6f353bdb3a25 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -5,6 +5,7 @@ * CPUs and later. * * Copyright (C) 2008-2011 Advanced Micro Devices Inc. + * 2013-2016 Borislav Petkov <bp@alien8.de> * * Author: Peter Oruba <peter.oruba@amd.com> * @@ -39,64 +40,25 @@ static struct equiv_cpu_entry *equiv_cpu_table; -struct ucode_patch { - struct list_head plist; - void *data; - u32 patch_id; - u16 equiv_cpu; -}; - -static LIST_HEAD(pcache); - /* * This points to the current valid container of microcode patches which we will - * save from the initrd before jettisoning its contents. + * save from the initrd/builtin before jettisoning its contents. 
*/ -static u8 *container; -static size_t container_size; -static bool ucode_builtin; +struct container { + u8 *data; + size_t size; +} cont; static u32 ucode_new_rev; static u8 amd_ucode_patch[PATCH_MAX_SIZE]; static u16 this_equiv_id; -static struct cpio_data ucode_cpio; - -static struct cpio_data __init find_ucode_in_initrd(void) -{ -#ifdef CONFIG_BLK_DEV_INITRD - char *path; - void *start; - size_t size; - - /* - * Microcode patch container file is prepended to the initrd in cpio - * format. See Documentation/x86/early-microcode.txt - */ - static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; - -#ifdef CONFIG_X86_32 - struct boot_params *p; - - /* - * On 32-bit, early load occurs before paging is turned on so we need - * to use physical addresses. - */ - p = (struct boot_params *)__pa_nodebug(&boot_params); - path = (char *)__pa_nodebug(ucode_path); - start = (void *)p->hdr.ramdisk_image; - size = p->hdr.ramdisk_size; -#else - path = ucode_path; - start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); - size = boot_params.hdr.ramdisk_size; -#endif /* !CONFIG_X86_32 */ - - return find_cpio_data(path, start, size, NULL); -#else - return (struct cpio_data){ NULL, 0, "" }; -#endif -} +/* + * Microcode patch container file is prepended to the initrd in cpio + * format. See Documentation/x86/early-microcode.txt + */ +static const char +ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; static size_t compute_container_size(u8 *data, u32 total_size) { @@ -135,48 +97,48 @@ static size_t compute_container_size(u8 *data, u32 total_size) return size; } +static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, + unsigned int sig) +{ + int i = 0; + + if (!equiv_cpu_table) + return 0; + + while (equiv_cpu_table[i].installed_cpu != 0) { + if (sig == equiv_cpu_table[i].installed_cpu) + return equiv_cpu_table[i].equiv_cpu; + + i++; + } + return 0; +} + /* - * Early load occurs before we can vmalloc(). So we look for the microcode - * patch container file in initrd, traverse equivalent cpu table, look for a - * matching microcode patch, and update, all in initrd memory in place. - * When vmalloc() is available for use later -- on 64-bit during first AP load, - * and on 32-bit during save_microcode_in_initrd_amd() -- we can call - * load_microcode_amd() to save equivalent cpu table and microcode patches in - * kernel heap memory. + * This scans the ucode blob for the proper container as we can have multiple + * containers glued together. 
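For orientation, the blob being scanned is one or more microcode containers laid out back to back. A descriptive sketch of the layout that find_proper_container() and compute_container_size() walk (a summary, not part of the patch):

    /*
     * [ u32 UCODE_MAGIC ]
     * [ u32 UCODE_EQUIV_CPU_TABLE_TYPE ]
     * [ u32 size of the equivalence table ]      <- CONTAINER_HDR_SZ ends here
     * [ equivalence table: installed_cpu/equiv_cpu pairs,
     *   terminated by installed_cpu == 0 ]       <- what find_equiv_id() scans
     * [ patch sections, each prefixed by a small header carrying its size ]
     * ... possibly followed by the next container's UCODE_MAGIC ...
     */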
*/ -static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) +static struct container +find_proper_container(u8 *ucode, size_t size, u16 *ret_id) { + struct container ret = { NULL, 0 }; + u32 eax, ebx, ecx, edx; struct equiv_cpu_entry *eq; - size_t *cont_sz; - u32 *header; - u8 *data, **cont; - u8 (*patch)[PATCH_MAX_SIZE]; - u16 eq_id = 0; int offset, left; - u32 rev, eax, ebx, ecx, edx; - u32 *new_rev; - -#ifdef CONFIG_X86_32 - new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - cont_sz = (size_t *)__pa_nodebug(&container_size); - cont = (u8 **)__pa_nodebug(&container); - patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); -#else - new_rev = &ucode_new_rev; - cont_sz = &container_size; - cont = &container; - patch = &amd_ucode_patch; -#endif + u16 eq_id = 0; + u32 *header; + u8 *data; data = ucode; left = size; header = (u32 *)data; + /* find equiv cpu table */ if (header[0] != UCODE_MAGIC || header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ header[2] == 0) /* size */ - return; + return ret; eax = 0x00000001; ecx = 0; @@ -185,7 +147,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) while (left > 0) { eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); - *cont = data; + ret.data = data; /* Advance past the container header */ offset = header[2] + CONTAINER_HDR_SZ; @@ -194,15 +156,15 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) eq_id = find_equiv_id(eq, eax); if (eq_id) { - this_equiv_id = eq_id; - *cont_sz = compute_container_size(*cont, left + offset); + ret.size = compute_container_size(ret.data, left + offset); /* * truncate how much we need to iterate over in the * ucode update loop below */ - left = *cont_sz - offset; - break; + left = ret.size - offset; + *ret_id = eq_id; + return ret; } /* @@ -212,6 +174,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) */ while (left > 0) { header = (u32 *)data; + if (header[0] == UCODE_MAGIC && header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) break; @@ -226,14 +189,64 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) ucode = data; } - if (!eq_id) { - *cont = NULL; - *cont_sz = 0; - return; - } + return ret; +} + +static int __apply_microcode_amd(struct microcode_amd *mc_amd) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + + /* verify patch application was successful */ + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev != mc_amd->hdr.patch_id) + return -1; + + return 0; +} + +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. 
+ */ +static struct container +apply_microcode_early_amd(void *ucode, size_t size, bool save_patch) +{ + struct container ret = { NULL, 0 }; + u8 (*patch)[PATCH_MAX_SIZE]; + int offset, left; + u32 rev, *header; + u8 *data; + u16 eq_id = 0; + u32 *new_rev; + +#ifdef CONFIG_X86_32 + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); +#else + new_rev = &ucode_new_rev; + patch = &amd_ucode_patch; +#endif if (check_current_patch_level(&rev, true)) - return; + return (struct container){ NULL, 0 }; + + ret = find_proper_container(ucode, size, &eq_id); + if (!eq_id) + return (struct container){ NULL, 0 }; + + this_equiv_id = eq_id; + header = (u32 *)ret.data; + + /* We're pointing to an equiv table, skip over it. */ + data = ret.data + header[2] + CONTAINER_HDR_SZ; + left = ret.size - (header[2] + CONTAINER_HDR_SZ); while (left > 0) { struct microcode_amd *mc; @@ -252,8 +265,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) *new_rev = rev; if (save_patch) - memcpy(patch, mc, - min_t(u32, header[1], PATCH_MAX_SIZE)); + memcpy(patch, mc, min_t(u32, header[1], PATCH_MAX_SIZE)); } } @@ -261,10 +273,10 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) data += offset; left -= offset; } + return ret; } -static bool __init load_builtin_amd_microcode(struct cpio_data *cp, - unsigned int family) +static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) { #ifdef CONFIG_X86_64 char fw_name[36] = "amd-ucode/microcode_amd.bin"; @@ -281,47 +293,45 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp, void __init load_ucode_amd_bsp(unsigned int family) { + struct ucode_cpu_info *uci; struct cpio_data cp; - bool *builtin; - void **data; - size_t *size; + const char *path; + bool use_pa; -#ifdef CONFIG_X86_32 - data = (void **)__pa_nodebug(&ucode_cpio.data); - size = (size_t *)__pa_nodebug(&ucode_cpio.size); - builtin = (bool *)__pa_nodebug(&ucode_builtin); -#else - data = &ucode_cpio.data; - size = &ucode_cpio.size; - builtin = &ucode_builtin; -#endif + if (IS_ENABLED(CONFIG_X86_32)) { + uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info); + path = (const char *)__pa_nodebug(ucode_path); + use_pa = true; + } else { + uci = ucode_cpu_info; + path = ucode_path; + use_pa = false; + } - *builtin = load_builtin_amd_microcode(&cp, family); - if (!*builtin) - cp = find_ucode_in_initrd(); + if (!get_builtin_microcode(&cp, family)) + cp = find_microcode_in_initrd(path, use_pa); if (!(cp.data && cp.size)) return; - *data = cp.data; - *size = cp.size; + /* Get BSP's CPUID.EAX(1), needed in load_microcode_amd() */ + uci->cpu_sig.sig = cpuid_eax(1); - apply_ucode_in_initrd(cp.data, cp.size, true); + apply_microcode_early_amd(cp.data, cp.size, true); } #ifdef CONFIG_X86_32 /* * On 32-bit, since AP's early load occurs before paging is turned on, we - * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during - * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During - * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, + * cannot traverse cpu_equiv_table and microcode_cache in kernel heap memory. + * So during cold boot, AP will apply_ucode_in_initrd() just like the BSP. + * In save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, * which is used upon resume from suspend. 
*/ -void load_ucode_amd_ap(void) +void load_ucode_amd_ap(unsigned int family) { struct microcode_amd *mc; - size_t *usize; - void **ucode; + struct cpio_data cp; mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { @@ -329,60 +339,63 @@ void load_ucode_amd_ap(void) return; } - ucode = (void *)__pa_nodebug(&container); - usize = (size_t *)__pa_nodebug(&container_size); + if (!get_builtin_microcode(&cp, family)) + cp = find_microcode_in_initrd((const char *)__pa_nodebug(ucode_path), true); - if (!*ucode || !*usize) + if (!(cp.data && cp.size)) return; - apply_ucode_in_initrd(*ucode, *usize, false); -} - -static void __init collect_cpu_sig_on_bsp(void *arg) -{ - unsigned int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - uci->cpu_sig.sig = cpuid_eax(0x00000001); -} - -static void __init get_bsp_sig(void) -{ - unsigned int bsp = boot_cpu_data.cpu_index; - struct ucode_cpu_info *uci = ucode_cpu_info + bsp; - - if (!uci->cpu_sig.sig) - smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); + /* + * This would set amd_ucode_patch above so that the following APs can + * use it directly instead of going down this path again. + */ + apply_microcode_early_amd(cp.data, cp.size, true); } #else -void load_ucode_amd_ap(void) +void load_ucode_amd_ap(unsigned int family) { - unsigned int cpu = smp_processor_id(); struct equiv_cpu_entry *eq; struct microcode_amd *mc; - u8 *cont = container; u32 rev, eax; u16 eq_id; - /* Exit if called on the BSP. */ - if (!cpu) + /* 64-bit runs with paging enabled, thus early==false. */ + if (check_current_patch_level(&rev, false)) return; - if (!container) - return; + /* First AP hasn't cached it yet, go through the blob. */ + if (!cont.data) { + struct cpio_data cp = { NULL, 0, "" }; - /* - * 64-bit runs with paging enabled, thus early==false. - */ - if (check_current_patch_level(&rev, false)) - return; + if (cont.size == -1) + return; - /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - if (!ucode_builtin) - cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; +reget: + if (!get_builtin_microcode(&cp, family)) { +#ifdef CONFIG_BLK_DEV_INITRD + cp = find_cpio_data(ucode_path, (void *)initrd_start, + initrd_end - initrd_start, NULL); +#endif + if (!(cp.data && cp.size)) { + /* + * Mark it so that other APs do not scan again + * for no real reason and slow down boot + * needlessly. + */ + cont.size = -1; + return; + } + } + + cont = apply_microcode_early_amd(cp.data, cp.size, false); + if (!(cont.data && cont.size)) { + cont.size = -1; + return; + } + } eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); + eq = (struct equiv_cpu_entry *)(cont.data + CONTAINER_HDR_SZ); eq_id = find_equiv_id(eq, eax); if (!eq_id) @@ -397,61 +410,50 @@ void load_ucode_amd_ap(void) } } else { - if (!ucode_cpio.data) - return; /* * AP has a different equivalence ID than BSP, looks like * mixed-steppings silicon so go through the ucode blob anew. 
*/ - apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); + goto reget; } } -#endif +#endif /* CONFIG_X86_32 */ + +static enum ucode_state +load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); -int __init save_microcode_in_initrd_amd(void) +int __init save_microcode_in_initrd_amd(unsigned int fam) { - unsigned long cont; - int retval = 0; enum ucode_state ret; - u8 *cont_va; - u32 eax; + int retval = 0; + u16 eq_id; - if (!container) - return -EINVAL; + if (!cont.data) { + if (IS_ENABLED(CONFIG_X86_32) && (cont.size != -1)) { + struct cpio_data cp = { NULL, 0, "" }; -#ifdef CONFIG_X86_32 - get_bsp_sig(); - cont = (unsigned long)container; - cont_va = __va(container); -#else - /* - * We need the physical address of the container for both bitness since - * boot_params.hdr.ramdisk_image is a physical address. - */ - cont = __pa(container); - cont_va = container; +#ifdef CONFIG_BLK_DEV_INITRD + cp = find_cpio_data(ucode_path, (void *)initrd_start, + initrd_end - initrd_start, NULL); #endif - /* - * Take into account the fact that the ramdisk might get relocated and - * therefore we need to recompute the container's position in virtual - * memory space. - */ - if (relocated_ramdisk) - container = (u8 *)(__va(relocated_ramdisk) + - (cont - boot_params.hdr.ramdisk_image)); - else - container = cont_va; + if (!(cp.data && cp.size)) { + cont.size = -1; + return -EINVAL; + } - /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - if (!ucode_builtin) - container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + cont = find_proper_container(cp.data, cp.size, &eq_id); + if (!eq_id) { + cont.size = -1; + return -EINVAL; + } - eax = cpuid_eax(0x00000001); - eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + } else + return -EINVAL; + } - ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); + ret = load_microcode_amd(smp_processor_id(), fam, cont.data, cont.size); if (ret != UCODE_OK) retval = -EINVAL; @@ -459,8 +461,8 @@ int __init save_microcode_in_initrd_amd(void) * This will be freed any msec now, stash patches for the current * family and switch to patch cache for cpu hotplug, etc later. 
*/ - container = NULL; - container_size = 0; + cont.data = NULL; + cont.size = 0; return retval; } @@ -478,8 +480,10 @@ void reload_ucode_amd(void) return; mc = (struct microcode_amd *)amd_ucode_patch; + if (!mc) + return; - if (mc && rev < mc->hdr.patch_id) { + if (rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { ucode_new_rev = mc->hdr.patch_id; pr_info("reload patch_level=0x%08x\n", ucode_new_rev); @@ -513,7 +517,7 @@ static struct ucode_patch *cache_find_patch(u16 equiv_cpu) { struct ucode_patch *p; - list_for_each_entry(p, &pcache, plist) + list_for_each_entry(p, &microcode_cache, plist) if (p->equiv_cpu == equiv_cpu) return p; return NULL; @@ -523,7 +527,7 @@ static void update_cache(struct ucode_patch *new_patch) { struct ucode_patch *p; - list_for_each_entry(p, &pcache, plist) { + list_for_each_entry(p, &microcode_cache, plist) { if (p->equiv_cpu == new_patch->equiv_cpu) { if (p->patch_id >= new_patch->patch_id) /* we already have the latest patch */ @@ -536,14 +540,14 @@ static void update_cache(struct ucode_patch *new_patch) } } /* no patch found, add it */ - list_add_tail(&new_patch->plist, &pcache); + list_add_tail(&new_patch->plist, &microcode_cache); } static void free_cache(void) { struct ucode_patch *p, *tmp; - list_for_each_entry_safe(p, tmp, &pcache, plist) { + list_for_each_entry_safe(p, tmp, &microcode_cache, plist) { __list_del(p->plist.prev, p->plist.next); kfree(p->data); kfree(p); @@ -663,21 +667,7 @@ bool check_current_patch_level(u32 *rev, bool early) return ret; } -int __apply_microcode_amd(struct microcode_amd *mc_amd) -{ - u32 rev, dummy; - - native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); - - /* verify patch application was successful */ - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev != mc_amd->hdr.patch_id) - return -1; - - return 0; -} - -int apply_microcode_amd(int cpu) +static int apply_microcode_amd(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_amd *mc_amd; @@ -860,7 +850,8 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) +static enum ucode_state +load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) { enum ucode_state ret; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 5ce5155f0695..6996413c78c3 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -3,7 +3,7 @@ * * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * 2006 Shaohua Li <shaohua.li@intel.com> - * 2013-2015 Borislav Petkov <bp@alien8.de> + * 2013-2016 Borislav Petkov <bp@alien8.de> * * X86 CPU microcode early update for Linux: * @@ -39,12 +39,15 @@ #include <asm/microcode.h> #include <asm/processor.h> #include <asm/cmdline.h> +#include <asm/setup.h> -#define MICROCODE_VERSION "2.01" +#define DRIVER_VERSION "2.2" static struct microcode_ops *microcode_ops; static bool dis_ucode_ldr; +LIST_HEAD(microcode_cache); + /* * Synchronization.
* @@ -167,7 +170,7 @@ void load_ucode_ap(void) break; case X86_VENDOR_AMD: if (family >= 0x10) - load_ucode_amd_ap(); + load_ucode_amd_ap(family); break; default: break; @@ -185,7 +188,7 @@ static int __init save_microcode_in_initrd(void) break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - return save_microcode_in_initrd_amd(); + return save_microcode_in_initrd_amd(c->x86); break; default: break; @@ -194,6 +197,58 @@ static int __init save_microcode_in_initrd(void) return -EINVAL; } +struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) +{ +#ifdef CONFIG_BLK_DEV_INITRD + unsigned long start = 0; + size_t size; + +#ifdef CONFIG_X86_32 + struct boot_params *params; + + if (use_pa) + params = (struct boot_params *)__pa_nodebug(&boot_params); + else + params = &boot_params; + + size = params->hdr.ramdisk_size; + + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + if (size) + start = params->hdr.ramdisk_image; + +# else /* CONFIG_X86_64 */ + size = (unsigned long)boot_params.ext_ramdisk_size << 32; + size |= boot_params.hdr.ramdisk_size; + + if (size) { + start = (unsigned long)boot_params.ext_ramdisk_image << 32; + start |= boot_params.hdr.ramdisk_image; + + start += PAGE_OFFSET; + } +# endif + + /* + * Did we relocate the ramdisk? + * + * So we possibly relocate the ramdisk *after* applying microcode on the + * BSP so we rely on use_pa (use physical addresses) - even if it is not + * absolutely correct - to determine whether we've done the ramdisk + * relocation already. + */ + if (!use_pa && relocated_ramdisk) + start = initrd_start; + + return find_cpio_data(path, (void *)start, size, NULL); +#else /* !CONFIG_BLK_DEV_INITRD */ + return (struct cpio_data){ NULL, 0, "" }; +#endif +} + void reload_early_microcode(void) { int vendor, family; @@ -453,16 +508,17 @@ static struct attribute_group mc_attr_group = { static void microcode_fini_cpu(int cpu) { - microcode_ops->microcode_fini_cpu(cpu); + if (microcode_ops->microcode_fini_cpu) + microcode_ops->microcode_fini_cpu(cpu); } static enum ucode_state microcode_resume_cpu(int cpu) { - pr_debug("CPU%d updated upon resume\n", cpu); - if (apply_microcode_on_target(cpu)) return UCODE_ERROR; + pr_debug("CPU%d updated upon resume\n", cpu); + return UCODE_OK; } @@ -496,6 +552,9 @@ static enum ucode_state microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + /* Refresh CPU microcode revision after resume. */ + collect_cpu_info(cpu); + if (uci->valid) return microcode_resume_cpu(cpu); @@ -579,12 +638,7 @@ static int mc_cpu_down_prep(unsigned int cpu) /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&dev->kobj, &mc_attr_group); pr_debug("CPU%d removed\n", cpu); - /* - * When a CPU goes offline, don't free up or invalidate the copy of - * the microcode in kernel memory, so that we can reuse it when the - * CPU comes back online without unnecessarily requesting the userspace - * for it again. 
- */ + return 0; } @@ -649,8 +703,7 @@ int __init microcode_init(void) cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); - pr_info("Microcode Update Driver: v" MICROCODE_VERSION - " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); + pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION); return 0; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index cdc0deab00c9..54d50c3694d8 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,125 +39,83 @@ #include <asm/setup.h> #include <asm/msr.h> -/* - * Temporary microcode blobs pointers storage. We note here during early load - * the pointers to microcode blobs we've got from whatever storage (detached - * initrd, builtin). Later on, we put those into final storage - * mc_saved_data.mc_saved. - * - * Important: those are offsets from the beginning of initrd or absolute - * addresses within the kernel image when built-in. - */ -static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; - -static struct mc_saved_data { - unsigned int num_saved; - struct microcode_intel **mc_saved; -} mc_saved_data; +static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; -/* Microcode blobs within the initrd. 0 if builtin. */ -static struct ucode_blobs { - unsigned long start; - bool valid; -} blobs; +/* Current microcode patch used in early patching */ +struct microcode_intel *intel_ucode_patch; -/* Go through saved patches and find the one suitable for the current CPU. */ -static enum ucode_state -find_microcode_patch(struct microcode_intel **saved, - unsigned int num_saved, struct ucode_cpu_info *uci) +static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, + unsigned int s2, unsigned int p2) { - struct microcode_intel *ucode_ptr, *new_mc = NULL; - struct microcode_header_intel *mc_hdr; - int new_rev, ret, i; - - new_rev = uci->cpu_sig.rev; - - for (i = 0; i < num_saved; i++) { - ucode_ptr = saved[i]; - mc_hdr = (struct microcode_header_intel *)ucode_ptr; - - ret = has_newer_microcode(ucode_ptr, - uci->cpu_sig.sig, - uci->cpu_sig.pf, - new_rev); - if (!ret) - continue; - - new_rev = mc_hdr->rev; - new_mc = ucode_ptr; - } + if (s1 != s2) + return false; - if (!new_mc) - return UCODE_NFOUND; + /* Processor flags are either both 0 ... */ + if (!p1 && !p2) + return true; - uci->mc = (struct microcode_intel *)new_mc; - return UCODE_OK; + /* ... or they intersect. */ + return p1 & p2; } -static inline void -copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs, - unsigned long off, int num_saved) +/* + * Returns 1 if update has been found, 0 otherwise. + */ +static int find_matching_signature(void *mc, unsigned int csig, int cpf) { + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; + struct extended_signature *ext_sig; int i; - for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off); -} - -#ifdef CONFIG_X86_32 -static void -microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) -{ - int i; - struct microcode_intel ***mc_saved; + if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + return 1; - mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); + /* Look for ext. 
headers: */ + if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) + return 0; - for (i = 0; i < mcs->num_saved; i++) { - struct microcode_intel *p; + ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); - mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); + for (i = 0; i < ext_hdr->count; i++) { + if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; } + return 0; } -#endif -static enum ucode_state -load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - unsigned long offset, struct ucode_cpu_info *uci) +/* + * Returns 1 if update has been found, 0 otherwise. + */ +static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) { - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mcs->num_saved; + struct microcode_header_intel *mc_hdr = mc; - if (!mcs->mc_saved) { - copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); + if (mc_hdr->rev <= new_rev) + return 0; - return find_microcode_patch(mc_saved_tmp, count, uci); - } else { -#ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mcs); - return find_microcode_patch(mc_saved_tmp, count, uci); -#else - return find_microcode_patch(mcs->mc_saved, count, uci); -#endif - } + return find_matching_signature(mc, csig, cpf); } /* * Given CPU signature and a microcode patch, this function finds if the * microcode patch has matching family and model with the CPU. + * + * %true - if there's a match + * %false - otherwise */ -static enum ucode_state -matching_model_microcode(struct microcode_header_intel *mc_header, - unsigned long sig) +static bool microcode_matches(struct microcode_header_intel *mc_header, + unsigned long sig) { - unsigned int fam, model; - unsigned int fam_ucode, model_ucode; - struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); unsigned long data_size = get_datasize(mc_header); - int ext_sigcount, i; + struct extended_sigtable *ext_header; + unsigned int fam_ucode, model_ucode; struct extended_signature *ext_sig; + unsigned int fam, model; + int ext_sigcount, i; fam = x86_family(sig); model = x86_model(sig); @@ -166,11 +124,11 @@ matching_model_microcode(struct microcode_header_intel *mc_header, model_ucode = x86_model(mc_header->sig); if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; + return true; /* Look for ext. headers: */ if (total_size <= data_size + MC_HEADER_SIZE) - return UCODE_NFOUND; + return false; ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; @@ -181,192 +139,242 @@ matching_model_microcode(struct microcode_header_intel *mc_header, model_ucode = x86_model(ext_sig->sig); if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; + return true; ext_sig++; } - return UCODE_NFOUND; + return false; } -static int -save_microcode(struct mc_saved_data *mcs, - struct microcode_intel **mc_saved_src, - unsigned int num_saved) +static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) { - int i, j; - struct microcode_intel **saved_ptr; - int ret; + struct ucode_patch *p; - if (!num_saved) - return -EINVAL; + p = kzalloc(size, GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); - /* - * Copy new microcode data. 
- */ - saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL); - if (!saved_ptr) - return -ENOMEM; - - for (i = 0; i < num_saved; i++) { - struct microcode_header_intel *mc_hdr; - struct microcode_intel *mc; - unsigned long size; - - if (!mc_saved_src[i]) { - ret = -EINVAL; - goto err; - } + p->data = kmemdup(data, size, GFP_KERNEL); + if (!p->data) { + kfree(p); + return ERR_PTR(-ENOMEM); + } - mc = mc_saved_src[i]; - mc_hdr = &mc->hdr; - size = get_totalsize(mc_hdr); + return p; +} - saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL); - if (!saved_ptr[i]) { - ret = -ENOMEM; - goto err; +static void save_microcode_patch(void *data, unsigned int size) +{ + struct microcode_header_intel *mc_hdr, *mc_saved_hdr; + struct ucode_patch *iter, *tmp, *p; + bool prev_found = false; + unsigned int sig, pf; + + mc_hdr = (struct microcode_header_intel *)data; + + list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) { + mc_saved_hdr = (struct microcode_header_intel *)iter->data; + sig = mc_saved_hdr->sig; + pf = mc_saved_hdr->pf; + + if (find_matching_signature(data, sig, pf)) { + prev_found = true; + + if (mc_hdr->rev <= mc_saved_hdr->rev) + continue; + + p = __alloc_microcode_buf(data, size); + if (IS_ERR(p)) + pr_err("Error allocating buffer %p\n", data); + else + list_replace(&iter->plist, &p->plist); } } /* - * Point to newly saved microcode. + * There weren't any previous patches found in the list cache; save the + * newly found. */ - mcs->mc_saved = saved_ptr; - mcs->num_saved = num_saved; - - return 0; - -err: - for (j = 0; j <= i; j++) - kfree(saved_ptr[j]); - kfree(saved_ptr); - - return ret; + if (!prev_found) { + p = __alloc_microcode_buf(data, size); + if (IS_ERR(p)) + pr_err("Error allocating buffer for %p\n", data); + else + list_add_tail(&p->plist, &microcode_cache); + } } -/* - * A microcode patch in ucode_ptr is saved into mc_saved - * - if it has matching signature and newer revision compared to an existing - * patch mc_saved. - * - or if it is a newly discovered microcode patch. - * - * The microcode patch should have matching model with CPU. - * - * Returns: The updated number @num_saved of saved microcode patches.
- */ -static unsigned int _save_mc(struct microcode_intel **mc_saved, - u8 *ucode_ptr, unsigned int num_saved) +static int microcode_sanity_check(void *mc, int print_err) { - struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - unsigned int sig, pf; - int found = 0, i; + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + u32 sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; - mc_hdr = (struct microcode_header_intel *)ucode_ptr; + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); - for (i = 0; i < num_saved; i++) { - mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; - sig = mc_saved_hdr->sig; - pf = mc_saved_hdr->pf; + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("Error: bad microcode data file size.\n"); + return -EINVAL; + } - if (!find_matching_signature(ucode_ptr, sig, pf)) - continue; + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + if (print_err) + pr_err("Error: invalid/unknown microcode update format.\n"); + return -EINVAL; + } - found = 1; + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; - if (mc_hdr->rev <= mc_saved_hdr->rev) - continue; + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("Error: truncated extended signature table.\n"); + return -EINVAL; + } + + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("Error: extended signature table size mismatch.\n"); + return -EFAULT; + } + + ext_sigcount = ext_header->count; /* - * Found an older ucode saved earlier. Replace it with - * this newer one. + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. */ - mc_saved[i] = (struct microcode_intel *)ucode_ptr; - break; + ext_tablep = (u32 *)ext_header; + + i = ext_table_size / sizeof(u32); + while (i--) + ext_table_sum += ext_tablep[i]; + + if (ext_table_sum) { + if (print_err) + pr_warn("Bad extended signature table checksum, aborting.\n"); + return -EINVAL; + } + } + + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); + while (i--) + orig_sum += ((u32 *)mc)[i]; + + if (orig_sum) { + if (print_err) + pr_err("Bad microcode data checksum, aborting.\n"); + return -EINVAL; } - /* Newly detected microcode, save it to memory. */ - if (i >= num_saved && !found) - mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; + if (!ext_table_size) + return 0; - return num_saved; + /* + * Check extended signature checksum: 0 => valid. + */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("Bad extended signature checksum, aborting.\n"); + return -EINVAL; + } + } + return 0; } /* * Get microcode matching with BSP's model. Only CPUs with the same model as * BSP can stay in the platform. 
*/ -static enum ucode_state __init -get_matching_model_microcode(unsigned long start, void *data, size_t size, - struct mc_saved_data *mcs, unsigned long *mc_ptrs, - struct ucode_cpu_info *uci) +static struct microcode_intel * +scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save) { - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; struct microcode_header_intel *mc_header; - unsigned int num_saved = mcs->num_saved; - enum ucode_state state = UCODE_OK; - unsigned int leftover = size; - u8 *ucode_ptr = data; + struct microcode_intel *patch = NULL; unsigned int mc_size; - int i; - - while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { - if (leftover < sizeof(mc_header)) + while (size) { + if (size < sizeof(struct microcode_header_intel)) break; - mc_header = (struct microcode_header_intel *)ucode_ptr; + mc_header = (struct microcode_header_intel *)data; mc_size = get_totalsize(mc_header); - if (!mc_size || mc_size > leftover || - microcode_sanity_check(ucode_ptr, 0) < 0) + if (!mc_size || + mc_size > size || + microcode_sanity_check(data, 0) < 0) break; - leftover -= mc_size; + size -= mc_size; - /* - * Since APs with same family and model as the BSP may boot in - * the platform, we need to find and save microcode patches - * with the same family and model as the BSP. - */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) { - ucode_ptr += mc_size; + if (!microcode_matches(mc_header, uci->cpu_sig.sig)) { + data += mc_size; continue; } - num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved); + if (save) { + save_microcode_patch(data, mc_size); + goto next; + } - ucode_ptr += mc_size; - } - if (leftover) { - state = UCODE_ERROR; - return state; - } + if (!patch) { + if (!has_newer_microcode(data, + uci->cpu_sig.sig, + uci->cpu_sig.pf, + uci->cpu_sig.rev)) + goto next; - if (!num_saved) { - state = UCODE_NFOUND; - return state; - } + } else { + struct microcode_header_intel *phdr = &patch->hdr; - for (i = 0; i < num_saved; i++) - mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; + if (!has_newer_microcode(data, + phdr->sig, + phdr->pf, + phdr->rev)) + goto next; + } + + /* We have a newer patch, save it. 
*/ + patch = data; - mcs->num_saved = num_saved; +next: + data += mc_size; + } - return state; + if (size) + return NULL; + + return patch; } static int collect_cpu_info_early(struct ucode_cpu_info *uci) { unsigned int val[2]; unsigned int family, model; - struct cpu_signature csig; + struct cpu_signature csig = { 0 }; unsigned int eax, ebx, ecx, edx; - csig.sig = 0; - csig.pf = 0; - csig.rev = 0; - memset(uci, 0, sizeof(*uci)); eax = 0x00000001; @@ -374,8 +382,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_cpuid(&eax, &ebx, &ecx, &edx); csig.sig = eax; - family = x86_family(csig.sig); - model = x86_model(csig.sig); + family = x86_family(eax); + model = x86_model(eax); if ((model >= 5) || (family > 6)) { /* get processor flags from MSR 0x17 */ @@ -401,40 +409,41 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) static void show_saved_mc(void) { #ifdef DEBUG - int i, j; + int i = 0, j; unsigned int sig, pf, rev, total_size, data_size, date; struct ucode_cpu_info uci; + struct ucode_patch *p; - if (!mc_saved_data.num_saved) { + if (list_empty(&microcode_cache)) { pr_debug("no microcode data saved.\n"); return; } - pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved); collect_cpu_info_early(&uci); - sig = uci.cpu_sig.sig; - pf = uci.cpu_sig.pf; - rev = uci.cpu_sig.rev; + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - for (i = 0; i < mc_saved_data.num_saved; i++) { + list_for_each_entry(p, &microcode_cache, plist) { struct microcode_header_intel *mc_saved_header; struct extended_sigtable *ext_header; - int ext_sigcount; struct extended_signature *ext_sig; + int ext_sigcount; + + mc_saved_header = (struct microcode_header_intel *)p->data; - mc_saved_header = (struct microcode_header_intel *) - mc_saved_data.mc_saved[i]; - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - rev = mc_saved_header->rev; - total_size = get_totalsize(mc_saved_header); - data_size = get_datasize(mc_saved_header); - date = mc_saved_header->date; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + date = mc_saved_header->date; + + total_size = get_totalsize(mc_saved_header); + data_size = get_datasize(mc_saved_header); pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n", - i, sig, pf, rev, total_size, + i++, sig, pf, rev, total_size, date & 0xffff, date >> 24, (date >> 16) & 0xff); @@ -443,7 +452,7 @@ static void show_saved_mc(void) if (total_size <= data_size + MC_HEADER_SIZE) continue; - ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; + ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE; ext_sigcount = ext_header->count; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; @@ -456,85 +465,43 @@ static void show_saved_mc(void) ext_sig++; } - } #endif } /* - * Save this mc into mc_saved_data. So it will be loaded early when a CPU is - * hot added or resumes. - * - * Please make sure this mc should be a valid microcode patch before calling - * this function. + * Save this microcode patch. It will be loaded early when a CPU is + * hot-added or resumes. */ -static void save_mc_for_early(u8 *mc) +static void save_mc_for_early(u8 *mc, unsigned int size) { #ifdef CONFIG_HOTPLUG_CPU /* Synchronization during CPU hotplug.
*/ static DEFINE_MUTEX(x86_cpu_microcode_mutex); - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count_init; - unsigned int num_saved; - struct microcode_intel **mc_saved; - int ret, i; - mutex_lock(&x86_cpu_microcode_mutex); - mc_saved_count_init = mc_saved_data.num_saved; - num_saved = mc_saved_data.num_saved; - mc_saved = mc_saved_data.mc_saved; - - if (mc_saved && num_saved) - memcpy(mc_saved_tmp, mc_saved, - num_saved * sizeof(struct microcode_intel *)); - /* - * Save the microcode patch mc in mc_save_tmp structure if it's a newer - * version. - */ - num_saved = _save_mc(mc_saved_tmp, mc, num_saved); - - /* - * Save the mc_save_tmp in global mc_saved_data. - */ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved); - if (ret) { - pr_err("Cannot save microcode patch.\n"); - goto out; - } - + save_microcode_patch(mc, size); show_saved_mc(); - /* - * Free old saved microcode data. - */ - if (mc_saved) { - for (i = 0; i < mc_saved_count_init; i++) - kfree(mc_saved[i]); - kfree(mc_saved); - } - -out: mutex_unlock(&x86_cpu_microcode_mutex); #endif } -static bool __init load_builtin_intel_microcode(struct cpio_data *cp) +static bool load_builtin_intel_microcode(struct cpio_data *cp) { -#ifdef CONFIG_X86_64 - unsigned int eax = 0x00000001, ebx, ecx = 0, edx; + unsigned int eax = 1, ebx, ecx = 0, edx; char name[30]; + if (IS_ENABLED(CONFIG_X86_32)) + return false; + native_cpuid(&eax, &ebx, &ecx, &edx); sprintf(name, "intel-ucode/%02x-%02x-%02x", x86_family(eax), x86_model(eax), x86_stepping(eax)); return get_builtin_firmware(cp, name); -#else - return false; -#endif } /* @@ -570,8 +537,7 @@ void show_ucode_info_early(void) } /* - * At this point, we can not call printk() yet. Keep microcode patch number in - * mc_saved_data.mc_saved and delay printing microcode info in + * At this point, we can not call printk() yet. Delay printing microcode info in * show_ucode_info_early() until printk() works. */ static void print_ucode(struct ucode_cpu_info *uci) @@ -648,206 +614,140 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) return 0; } -/* - * This function converts microcode patch offsets previously stored in - * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data. - */ int __init save_microcode_in_initrd_intel(void) { - struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data.num_saved; - unsigned long offset = 0; - int ret; - - if (!count) - return 0; + struct ucode_cpu_info uci; + struct cpio_data cp; /* - * We have found a valid initrd but it might've been relocated in the - * meantime so get its updated address. + * AP loading didn't find any microcode patch, no need to save anything. */ - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid) - offset = initrd_start; - - copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count); + if (!intel_ucode_patch || IS_ERR(intel_ucode_patch)) + return 0; - ret = save_microcode(&mc_saved_data, mc_saved, count); - if (ret) - pr_err("Cannot save microcode patches from initrd.\n"); - else - show_saved_mc(); + if (!load_builtin_intel_microcode(&cp)) + cp = find_microcode_in_initrd(ucode_path, false); - return ret; -} + if (!(cp.data && cp.size)) + return 0; -static __init enum ucode_state -__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp) -{ -#ifdef CONFIG_BLK_DEV_INITRD - static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; - char *p = IS_ENABLED(CONFIG_X86_32) ? 
(char *)__pa_nodebug(ucode_name) - : ucode_name; -# ifdef CONFIG_X86_32 - unsigned long start = 0, size; - struct boot_params *params; + collect_cpu_info_early(&uci); - params = (struct boot_params *)__pa_nodebug(&boot_params); - size = params->hdr.ramdisk_size; + scan_microcode(cp.data, cp.size, &uci, true); - /* - * Set start only if we have an initrd image. We cannot use initrd_start - * because it is not set that early yet. - */ - start = (size ? params->hdr.ramdisk_image : 0); + show_saved_mc(); -# else /* CONFIG_X86_64 */ - unsigned long start = 0, size; + return 0; +} - size = (u64)boot_params.ext_ramdisk_size << 32; - size |= boot_params.hdr.ramdisk_size; - if (size) { - start = (u64)boot_params.ext_ramdisk_image << 32; - start |= boot_params.hdr.ramdisk_image; +/* + * @res_patch, output: a pointer to the patch we found. + */ +static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) +{ + static const char *path; + struct cpio_data cp; + bool use_pa; - start += PAGE_OFFSET; + if (IS_ENABLED(CONFIG_X86_32)) { + path = (const char *)__pa_nodebug(ucode_path); + use_pa = true; + } else { + path = ucode_path; + use_pa = false; } -# endif - - *cd = find_cpio_data(p, (void *)start, size, NULL); - if (cd->data) { - blbp->start = start; - blbp->valid = true; - return UCODE_OK; - } else -#endif /* CONFIG_BLK_DEV_INITRD */ - return UCODE_ERROR; -} + /* try built-in microcode first */ + if (!load_builtin_intel_microcode(&cp)) + cp = find_microcode_in_initrd(path, use_pa); -static __init enum ucode_state -scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - struct ucode_cpu_info *uci, struct ucode_blobs *blbp) -{ - struct cpio_data cd = { NULL, 0, "" }; - enum ucode_state ret; + if (!(cp.data && cp.size)) + return NULL; - /* try built-in microcode first */ - if (load_builtin_intel_microcode(&cd)) - /* - * Invalidate blobs as we might've gotten an initrd too, - * supplied by the boot loader, by mistake or simply forgotten - * there. That's fine, we ignore it since we've found builtin - * microcode already. 
- */ - blbp->valid = false; - else { - ret = __scan_microcode_initrd(&cd, blbp); - if (ret != UCODE_OK) - return ret; - } + if (!(cp.data && cp.size)) + return NULL; - return get_matching_model_microcode(blbp->start, cd.data, cd.size, - mcs, mc_ptrs, uci); + collect_cpu_info_early(uci); + return scan_microcode(cp.data, cp.size, uci, false); } -static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - struct ucode_blobs *blbp) +void __init load_ucode_intel_bsp(void) { + struct microcode_intel *patch; struct ucode_cpu_info uci; - enum ucode_state ret; - - collect_cpu_info_early(&uci); - ret = scan_microcode(mcs, mc_ptrs, &uci, blbp); - if (ret != UCODE_OK) + patch = __load_ucode_intel(&uci); + if (!patch) return; - ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci); - if (ret != UCODE_OK) - return; + uci.mc = patch; apply_microcode_early(&uci, true); } -void __init load_ucode_intel_bsp(void) +void load_ucode_intel_ap(void) { - struct ucode_blobs *blobs_p; - struct mc_saved_data *mcs; - unsigned long *ptrs; + struct microcode_intel *patch, **iup; + struct ucode_cpu_info uci; -#ifdef CONFIG_X86_32 - mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - ptrs = (unsigned long *)__pa_nodebug(&mc_tmp_ptrs); - blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); -#else - mcs = &mc_saved_data; - ptrs = mc_tmp_ptrs; - blobs_p = &blobs; -#endif + if (IS_ENABLED(CONFIG_X86_32)) + iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch); + else + iup = &intel_ucode_patch; + +reget: + if (!*iup) { + patch = __load_ucode_intel(&uci); + if (!patch) + return; + + *iup = patch; + } - _load_ucode_intel_bsp(mcs, ptrs, blobs_p); + uci.mc = *iup; + + if (apply_microcode_early(&uci, true)) { + /* Mixed-silicon system? Try to refetch the proper patch: */ + *iup = NULL; + + goto reget; + } } -void load_ucode_intel_ap(void) +static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) { - struct ucode_blobs *blobs_p; - unsigned long *ptrs, start = 0; - struct mc_saved_data *mcs; - struct ucode_cpu_info uci; - enum ucode_state ret; + struct microcode_header_intel *phdr; + struct ucode_patch *iter, *tmp; -#ifdef CONFIG_X86_32 - mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - ptrs = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); - blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); -#else - mcs = &mc_saved_data; - ptrs = mc_tmp_ptrs; - blobs_p = &blobs; -#endif - - /* - * If there is no valid ucode previously saved in memory, no need to - * update ucode on this AP. - */ - if (!mcs->num_saved) - return; + list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) { - if (blobs_p->valid) { - start = blobs_p->start; + phdr = (struct microcode_header_intel *)iter->data; - /* - * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles - * physmem mapping too and there we have the initrd.
- */ - start += PAGE_OFFSET - __PAGE_OFFSET_BASE; - } + if (phdr->rev <= uci->cpu_sig.rev) + continue; - collect_cpu_info_early(&uci); - ret = load_microcode(mcs, ptrs, start, &uci); - if (ret != UCODE_OK) - return; + if (!find_matching_signature(phdr, + uci->cpu_sig.sig, + uci->cpu_sig.pf)) + continue; - apply_microcode_early(&uci, true); + return iter->data; + } + return NULL; } void reload_ucode_intel(void) { + struct microcode_intel *p; struct ucode_cpu_info uci; - enum ucode_state ret; - - if (!mc_saved_data.num_saved) - return; collect_cpu_info_early(&uci); - ret = find_microcode_patch(mc_saved_data.mc_saved, - mc_saved_data.num_saved, &uci); - if (ret != UCODE_OK) + p = find_patch(&uci); + if (!p) return; + uci.mc = p; + apply_microcode_early(&uci, false); } @@ -879,24 +779,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) return 0; } -/* - * return 0 - no update found - * return 1 - found update - */ -static int get_matching_mc(struct microcode_intel *mc, int cpu) -{ - struct cpu_signature cpu_sig; - unsigned int csig, cpf, crev; - - collect_cpu_info(cpu, &cpu_sig); - - csig = cpu_sig.sig; - cpf = cpu_sig.pf; - crev = cpu_sig.rev; - - return has_newer_microcode(mc, csig, cpf, crev); -} - static int apply_microcode_intel(int cpu) { struct microcode_intel *mc; @@ -911,16 +793,12 @@ static int apply_microcode_intel(int cpu) uci = ucode_cpu_info + cpu; mc = uci->mc; - if (!mc) - return 0; - - /* - * Microcode on this CPU could be updated earlier. Only apply the - * microcode patch in mc when it is newer than the one on this - * CPU. - */ - if (!get_matching_mc(mc, cpu)) - return 0; + if (!mc) { + /* Look for a newer patch in our cache: */ + mc = find_patch(uci); + if (!mc) + return 0; + } /* write microcode via MSR 0x79 */ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); @@ -962,7 +840,6 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; - enum ucode_state state = UCODE_OK; unsigned int curr_mc_size = 0; unsigned int csig, cpf; @@ -1015,14 +892,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, if (leftover) { vfree(new_mc); - state = UCODE_ERROR; - goto out; + return UCODE_ERROR; } - if (!new_mc) { - state = UCODE_NFOUND; - goto out; - } + if (!new_mc) + return UCODE_NFOUND; vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; @@ -1032,12 +906,12 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, * permanent memory. So it will be loaded early when a CPU is hot added * or resumes. 
*/ - save_mc_for_early(new_mc); + save_mc_for_early(new_mc, curr_mc_size); pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); -out: - return state; + + return UCODE_OK; } static int get_ucode_fw(void *to, const void *from, size_t n) @@ -1081,20 +955,11 @@ request_microcode_user(int cpu, const void __user *buf, size_t size) return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } -static void microcode_fini_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - vfree(uci->mc); - uci->mc = NULL; -} - static struct microcode_ops microcode_intel_ops = { .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode_intel, - .microcode_fini_cpu = microcode_fini_cpu, }; struct microcode_ops * __init init_intel_microcode(void) @@ -1109,4 +974,3 @@ struct microcode_ops * __init init_intel_microcode(void) return &microcode_intel_ops; } - diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c deleted file mode 100644 index 406cb6c0d9dd..000000000000 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> - * H Peter Anvin" <hpa@zytor.com> - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ -#include <linux/firmware.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> - -#include <asm/microcode_intel.h> -#include <asm/processor.h> -#include <asm/msr.h> - -static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; - - /* ... or they intersect.
*/ - return p1 & p2; -} - -int microcode_sanity_check(void *mc, int print_err) -{ - unsigned long total_size, data_size, ext_table_size; - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - u32 sum, orig_sum, ext_sigcount = 0, i; - struct extended_signature *ext_sig; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - - if (data_size + MC_HEADER_SIZE > total_size) { - if (print_err) - pr_err("Error: bad microcode data file size.\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - if (print_err) - pr_err("Error: invalid/unknown microcode update format.\n"); - return -EINVAL; - } - - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - u32 ext_table_sum = 0; - u32 *ext_tablep; - - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - if (print_err) - pr_err("Error: truncated extended signature table.\n"); - return -EINVAL; - } - - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - if (print_err) - pr_err("Error: extended signature table size mismatch.\n"); - return -EFAULT; - } - - ext_sigcount = ext_header->count; - - /* - * Check extended table checksum: the sum of all dwords that - * comprise a valid table must be 0. - */ - ext_tablep = (u32 *)ext_header; - - i = ext_table_size / sizeof(u32); - while (i--) - ext_table_sum += ext_tablep[i]; - - if (ext_table_sum) { - if (print_err) - pr_warn("Bad extended signature table checksum, aborting.\n"); - return -EINVAL; - } - } - - /* - * Calculate the checksum of update data and header. The checksum of - * valid update data and header including the extended signature table - * must be 0. - */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / sizeof(u32); - while (i--) - orig_sum += ((u32 *)mc)[i]; - - if (orig_sum) { - if (print_err) - pr_err("Bad microcode data checksum, aborting.\n"); - return -EINVAL; - } - - if (!ext_table_size) - return 0; - - /* - * Check extended signature checksum: 0 => valid. - */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - - sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - - (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - if (print_err) - pr_err("Bad extended signature checksum, aborting.\n"); - return -EINVAL; - } - } - return 0; -} - -/* - * Returns 1 if update has been found, 0 otherwise. - */ -int find_matching_signature(void *mc, unsigned int csig, int cpf) -{ - struct microcode_header_intel *mc_hdr = mc; - struct extended_sigtable *ext_hdr; - struct extended_signature *ext_sig; - int i; - - if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) - return 1; - - /* Look for ext. headers: */ - if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) - return 0; - - ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; - ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - - for (i = 0; i < ext_hdr->count; i++) { - if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) - return 1; - ext_sig++; - } - return 0; -} - -/* - * Returns 1 if update has been found, 0 otherwise. 
- */ -int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) -{ - struct microcode_header_intel *mc_hdr = mc; - - if (mc_hdr->rev <= new_rev) - return 0; - - return find_matching_signature(mc, csig, cpf); -} diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 8f44c5a50ab8..6c044543545e 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -25,7 +25,6 @@ #include <asm/hyperv.h> #include <asm/mshyperv.h> #include <asm/desc.h> -#include <asm/idle.h> #include <asm/irq_regs.h> #include <asm/i8259.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 1db8dc490b66..d1316f9c8329 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -17,11 +17,17 @@ struct cpuid_bit { u32 sub_leaf; }; -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX +/* Please keep the leaf sorted by cpuid_bit.level for faster search. */ +static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { 0, 0, 0, 0, 0 } }; void init_scattered_cpuid_features(struct cpuinfo_x86 *c) @@ -30,18 +36,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) u32 regs[4]; const struct cpuid_bit *cb; - static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4VNNIW, CR_EDX, 2, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 }, - { 0, 0, 0, 0, 0 } - }; - for (cb = cpuid_bits; cb->feature; cb++) { /* Verify that the level is valid */ @@ -50,10 +44,35 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) max_level > (cb->level | 0xffff)) continue; - cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX], - &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]); + cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX], + &regs[CPUID_EBX], &regs[CPUID_ECX], + &regs[CPUID_EDX]); if (regs[cb->reg] & (1 << cb->bit)) set_cpu_cap(c, cb->feature); } } + +u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf, + enum cpuid_regs_idx reg) +{ + const struct cpuid_bit *cb; + u32 cpuid_val = 0; + + for (cb = cpuid_bits; cb->feature; cb++) { + + if (level > cb->level) + continue; + + if (level < cb->level) + break; + + if (reg == cb->reg && sub_leaf == cb->sub_leaf) { + if (cpu_has(&boot_cpu_data, cb->feature)) + cpuid_val |= BIT(cb->bit); + } + } + + return cpuid_val; +} +EXPORT_SYMBOL_GPL(get_scattered_cpuid_leaf); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 5130985b758b..891f4dad7b2c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,11 +24,16 @@ #include <linux/dmi.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/clocksource.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h>
#include <asm/timer.h> #include <asm/apic.h> +#include <asm/timer.h> + +#undef pr_fmt +#define pr_fmt(fmt) "vmware: " fmt #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -48,6 +53,8 @@ "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ "memory"); +static unsigned long vmware_tsc_khz __ro_after_init; + static inline int __vmware_platform(void) { uint32_t eax, ebx, ecx, edx; @@ -57,35 +64,80 @@ static inline int __vmware_platform(void) static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz, lpj; - uint32_t eax, ebx, ecx, edx; + return vmware_tsc_khz; +} - VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); +#ifdef CONFIG_PARAVIRT +static struct cyc2ns_data vmware_cyc2ns __ro_after_init; +static int vmw_sched_clock __initdata = 1; - tsc_hz = eax | (((uint64_t)ebx) << 32); - do_div(tsc_hz, 1000); - BUG_ON(tsc_hz >> 32); - pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", - (unsigned long) tsc_hz / 1000, - (unsigned long) tsc_hz % 1000); - - if (!preset_lpj) { - lpj = ((u64)tsc_hz * 1000); - do_div(lpj, HZ); - preset_lpj = lpj; - } +static __init int setup_vmw_sched_clock(char *s) +{ + vmw_sched_clock = 0; + return 0; +} +early_param("no-vmw-sched-clock", setup_vmw_sched_clock); + +static unsigned long long vmware_sched_clock(void) +{ + unsigned long long ns; - return tsc_hz; + ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); + ns -= vmware_cyc2ns.cyc2ns_offset; + return ns; } +static void __init vmware_sched_clock_setup(void) +{ + struct cyc2ns_data *d = &vmware_cyc2ns; + unsigned long long tsc_now = rdtsc(); + + clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift, + vmware_tsc_khz, NSEC_PER_MSEC, 0); + d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul, + d->cyc2ns_shift); + + pv_time_ops.sched_clock = vmware_sched_clock; + pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset); +} + +static void __init vmware_paravirt_ops_setup(void) +{ + pv_info.name = "VMware hypervisor"; + pv_cpu_ops.io_delay = paravirt_nop; + + if (vmware_tsc_khz && vmw_sched_clock) + vmware_sched_clock_setup(); +} +#else +#define vmware_paravirt_ops_setup() do {} while (0) +#endif + static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; + uint64_t lpj, tsc_khz; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); if (ebx != UINT_MAX) { + lpj = tsc_khz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_khz, 1000); + WARN_ON(tsc_khz >> 32); + pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", + (unsigned long) tsc_khz / 1000, + (unsigned long) tsc_khz % 1000); + + if (!preset_lpj) { + do_div(lpj, HZ); + preset_lpj = lpj; + } + + vmware_tsc_khz = tsc_khz; x86_platform.calibrate_tsc = vmware_get_tsc_khz; + x86_platform.calibrate_cpu = vmware_get_tsc_khz; + #ifdef CONFIG_X86_LOCAL_APIC /* Skip lapic calibration since we know the bus frequency. 
*/ lapic_timer_frequency = ecx / HZ; @@ -96,6 +148,8 @@ static void __init vmware_platform_setup(void) pr_warn("Failed to get TSC freq from the hypervisor\n"); } + vmware_paravirt_ops_setup(); + #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2836de390f95..0931a105ffe1 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -45,10 +45,7 @@ #include <asm/msr.h> static struct class *cpuid_class; - -struct cpuid_regs { - u32 eax, ebx, ecx, edx; -}; +static enum cpuhp_state cpuhp_cpuid_state; static void cpuid_smp_cpuid(void *cmd_block) { @@ -115,7 +112,7 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; -static int cpuid_device_create(int cpu) +static int cpuid_device_create(unsigned int cpu) { struct device *dev; @@ -124,35 +121,12 @@ static int cpuid_device_create(int cpu) return PTR_ERR_OR_ZERO(dev); } -static void cpuid_device_destroy(int cpu) +static int cpuid_device_destroy(unsigned int cpu) { device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + return 0; } -static int cpuid_class_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int err = 0; - - switch (action) { - case CPU_UP_PREPARE: - err = cpuid_device_create(cpu); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - cpuid_device_destroy(cpu); - break; - } - return notifier_from_errno(err); -} - -static struct notifier_block cpuid_class_cpu_notifier = -{ - .notifier_call = cpuid_class_cpu_callback, -}; - static char *cpuid_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); @@ -160,15 +134,13 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode) static int __init cpuid_init(void) { - int i, err = 0; - i = 0; + int err; if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid", &cpuid_fops)) { printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", CPUID_MAJOR); - err = -EBUSY; - goto out; + return -EBUSY; } cpuid_class = class_create(THIS_MODULE, "cpuid"); if (IS_ERR(cpuid_class)) { @@ -177,45 +149,28 @@ static int __init cpuid_init(void) } cpuid_class->devnode = cpuid_devnode; - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - err = cpuid_device_create(i); - if (err != 0) - goto out_class; - } - __register_hotcpu_notifier(&cpuid_class_cpu_notifier); - cpu_notifier_register_done(); + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online", + cpuid_device_create, cpuid_device_destroy); + if (err < 0) + goto out_class; - err = 0; - goto out; + cpuhp_cpuid_state = err; + return 0; out_class: - i = 0; - for_each_online_cpu(i) { - cpuid_device_destroy(i); - } - cpu_notifier_register_done(); class_destroy(cpuid_class); out_chrdev: __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); -out: return err; } +module_init(cpuid_init); static void __exit cpuid_exit(void) { - int cpu = 0; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - cpuid_device_destroy(cpu); + cpuhp_remove_state(cpuhp_cpuid_state); class_destroy(cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); - __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); - cpu_notifier_register_done(); } - -module_init(cpuid_init); module_exit(cpuid_exit); MODULE_AUTHOR("H. 
Peter Anvin <hpa@zytor.com>"); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 9b7cf5c28f5f..0cfd01d2754c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,7 +22,6 @@ int panic_on_unrecovered_nmi; int panic_on_io_nmi; unsigned int code_bytes = 64; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; bool in_task_stack(unsigned long *stack, struct task_struct *task, @@ -46,14 +45,7 @@ static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { touch_nmi_watchdog(); - printk("%s [<%p>] %s%pB\n", - log_lvl, (void *)address, reliable ? "" : "? ", - (void *)address); -} - -void printk_address(unsigned long address) -{ - pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); + printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, @@ -67,6 +59,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); + stack = stack ? : get_stack_pointer(task, regs); /* * Iterate through the stacks, starting with the current stack pointer. @@ -82,8 +75,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - softirq stack * - hardirq stack */ - for (; stack; stack = stack_info.next_sp) { - const char *str_begin, *str_end; + for (regs = NULL; stack; stack = stack_info.next_sp) { + const char *stack_name; /* * If we overflowed the task stack into a guard page, jump back @@ -95,9 +88,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (get_stack_info(stack, task, &stack_info, &visit_mask)) break; - stack_type_str(stack_info.type, &str_begin, &str_end); - if (str_begin) - printk("%s <%s> ", log_lvl, str_begin); + stack_name = stack_type_name(stack_info.type); + if (stack_name) + printk("%s <%s>\n", log_lvl, stack_name); /* * Scan the stack, printing any text addresses we find. At the @@ -112,13 +105,22 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, for (; stack < stack_info.end; stack++) { unsigned long real_addr; int reliable = 0; - unsigned long addr = *stack; + unsigned long addr = READ_ONCE_NOCHECK(*stack); unsigned long *ret_addr_p = unwind_get_return_address_ptr(&state); if (!__kernel_text_address(addr)) continue; + /* + * Don't print regs->ip again if it was already printed + * by __show_regs() below. + */ + if (regs && stack == &regs->ip) { + unwind_next_frame(&state); + continue; + } + if (stack == ret_addr_p) reliable = 1; @@ -146,10 +148,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * of the addresses will just be printed as unreliable.
*/ unwind_next_frame(&state); + + /* if the frame has entry regs, print them */ + regs = unwind_get_entry_regs(&state); + if (regs) + __show_regs(regs, 0); } - if (str_end) - printk("%s <%s> ", log_lvl, str_end); + if (stack_name) + printk("%s </%s>\n", log_lvl, stack_name); } } @@ -164,12 +171,12 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(task, NULL, sp, ""); + show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT); } void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, NULL, ""); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -261,14 +268,11 @@ int __die(const char *str, struct pt_regs *regs, long err) sp = kernel_stack_pointer(regs); savesegment(ss, ss); } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); + printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n", + (void *)regs->ip, ss, sp); #else /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip); - printk(" RSP <%016lx>\n", regs->sp); + printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp); #endif return 0; } @@ -291,22 +295,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static int __init kstack_setup(char *s) -{ - ssize_t ret; - unsigned long val; - - if (!s) - return -EINVAL; - - ret = kstrtoul(s, 0, &val); - if (ret) - return ret; - kstack_depth_to_print = val; - return 0; -} -early_param("kstack", kstack_setup); - static int __init code_bytes_setup(char *s) { ssize_t ret; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 06eb322b5f9f..bb3b5b9a6899 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,18 +16,15 @@ #include <asm/stacktrace.h> -void stack_type_str(enum stack_type type, const char **begin, const char **end) +const char *stack_type_name(enum stack_type type) { - switch (type) { - case STACK_TYPE_IRQ: - case STACK_TYPE_SOFTIRQ: - *begin = "IRQ"; - *end = "EOI"; - break; - default: - *begin = NULL; - *end = NULL; - } + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + + return NULL; } static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) @@ -109,8 +106,10 @@ recursion_check: * just break out and report an unknown stack type. */ if (visit_mask) { - if (*visit_mask & (1UL << info->type)) + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; + } *visit_mask |= 1UL << info->type; } @@ -121,36 +120,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - sp = sp ? 
: get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %08lx", log_lvl, *stack++); - } else - pr_cont(" %08lx", *stack++); - touch_nmi_watchdog(); - } - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - - void show_regs(struct pt_regs *regs) { int i; @@ -168,8 +137,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - pr_emerg("Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_EMERG); + show_trace_log_lvl(current, regs, NULL, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 36cf1a498227..fac189efcc34 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -28,23 +28,17 @@ static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -void stack_type_str(enum stack_type type, const char **begin, const char **end) +const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); - switch (type) { - case STACK_TYPE_IRQ: - *begin = "IRQ"; - *end = "EOI"; - break; - case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST: - *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION]; - *end = "EOE"; - break; - default: - *begin = NULL; - *end = NULL; - } + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) + return exception_stack_names[type - STACK_TYPE_EXCEPTION]; + + return NULL; } static bool in_exception_stack(unsigned long *stack, struct stack_info *info) @@ -128,8 +122,10 @@ recursion_check: * just break out and report an unknown stack type. */ if (visit_mask) { - if (*visit_mask & (1UL << info->type)) + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; + } *visit_mask |= 1UL << info->type; } @@ -140,56 +136,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *irq_stack_end; - unsigned long *irq_stack; - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); - irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); - - sp = sp ? 
: get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - unsigned long word; - - if (stack >= irq_stack && stack <= irq_stack_end) { - if (stack == irq_stack_end) { - stack = (unsigned long *) (irq_stack_end[-1]); - pr_cont(" <EOI> "); - } - } else { - if (kstack_end(stack)) - break; - } - - if (probe_kernel_address(stack, word)) - break; - - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %016lx", log_lvl, word); - } else - pr_cont(" %016lx", word); - - stack++; - touch_nmi_watchdog(); - } - - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - void show_regs(struct pt_regs *regs) { int i; @@ -207,8 +153,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index aad34aafc0e0..d913047f832c 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -23,17 +23,12 @@ static double __initdata y = 3145727.0; */ void __init fpu__init_check_bugs(void) { - u32 cr0_saved; s32 fdiv_bug; /* kernel_fpu_begin/end() relies on patched alternative instructions. */ if (!boot_cpu_has(X86_FEATURE_FPU)) return; - /* We might have CR0::TS set already, clear it: */ - cr0_saved = read_cr0(); - write_cr0(cr0_saved & ~X86_CR0_TS); - kernel_fpu_begin(); /* @@ -56,8 +51,6 @@ void __init fpu__init_check_bugs(void) kernel_fpu_end(); - write_cr0(cr0_saved); - if (fdiv_bug) { set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); pr_warn("Hmm, FPU with FDIV bug\n"); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 47004010ad5d..e4e97a5355ce 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -58,27 +58,9 @@ static bool kernel_fpu_disabled(void) return this_cpu_read(in_kernel_fpu); } -/* - * Were we in an interrupt that interrupted kernel mode? - * - * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: the thread must not have fpu (so - * that we don't try to save the FPU state), and TS must - * be set (so that the clts/stts pair does nothing that is - * visible in the interrupted kernel thread). - * - * Except for the eagerfpu case when we return true; in the likely case - * the thread has FPU but we are not going to set/clear TS. - */ static bool interrupted_kernel_fpu_idle(void) { - if (kernel_fpu_disabled()) - return false; - - if (use_eager_fpu()) - return true; - - return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS); + return !kernel_fpu_disabled(); } /* @@ -125,8 +107,7 @@ void __kernel_fpu_begin(void) */ copy_fpregs_to_fpstate(fpu); } else { - this_cpu_write(fpu_fpregs_owner_ctx, NULL); - __fpregs_activate_hw(); + __cpu_invalidate_fpregs_state(); } } EXPORT_SYMBOL(__kernel_fpu_begin); @@ -137,8 +118,6 @@ void __kernel_fpu_end(void) if (fpu->fpregs_active) copy_kernel_to_fpregs(&fpu->state); - else - __fpregs_deactivate_hw(); kernel_fpu_enable(); } @@ -159,35 +138,6 @@ void kernel_fpu_end(void) EXPORT_SYMBOL_GPL(kernel_fpu_end); /* - * CR0::TS save/restore functions: - */ -int irq_ts_save(void) -{ - /* - * If in process context and not atomic, we can take a spurious DNA fault. 
- * Otherwise, doing clts() in process context requires disabling preemption - * or some heavy lifting like kernel_fpu_begin() - */ - if (!in_atomic()) - return 0; - - if (read_cr0() & X86_CR0_TS) { - clts(); - return 1; - } - - return 0; -} -EXPORT_SYMBOL_GPL(irq_ts_save); - -void irq_ts_restore(int TS_state) -{ - if (TS_state) - stts(); -} -EXPORT_SYMBOL_GPL(irq_ts_restore); - -/* * Save the FPU state (mark it for reload if necessary): * * This only ever gets called for the current task. @@ -200,10 +150,7 @@ void fpu__save(struct fpu *fpu) trace_x86_fpu_before_save(fpu); if (fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(fpu)) { - if (use_eager_fpu()) - copy_kernel_to_fpregs(&fpu->state); - else - fpregs_deactivate(fpu); + copy_kernel_to_fpregs(&fpu->state); } } trace_x86_fpu_after_save(fpu); @@ -247,7 +194,6 @@ EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -260,8 +206,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Don't let 'init optimized' areas of the XSAVE area * leak into the child task: */ - if (use_eager_fpu()) - memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); + memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* * Save current FPU registers directly into the child @@ -283,10 +228,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); - if (use_eager_fpu()) - copy_kernel_to_fpregs(&src_fpu->state); - else - fpregs_deactivate(src_fpu); + copy_kernel_to_fpregs(&src_fpu->state); } preempt_enable(); @@ -366,7 +308,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) if (fpu->fpstate_active) { /* Invalidate any lazy state: */ - fpu->last_cpu = -1; + __fpu_invalidate_fpregs_state(fpu); } else { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); @@ -409,7 +351,7 @@ void fpu__current_fpstate_write_begin(void) * ensures we will not be lazy and skip a XRSTOR in the * future. */ - fpu->last_cpu = -1; + __fpu_invalidate_fpregs_state(fpu); } /* @@ -459,7 +401,6 @@ void fpu__restore(struct fpu *fpu) trace_x86_fpu_before_restore(fpu); fpregs_activate(fpu); copy_kernel_to_fpregs(&fpu->state); - fpu->counter++; trace_x86_fpu_after_restore(fpu); kernel_fpu_enable(); } @@ -477,7 +418,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ @@ -521,14 +461,14 @@ void fpu__clear(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ - if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) { - /* FPU state will be reallocated lazily at the first use. */ - fpu__drop(fpu); - } else { - if (!fpu->fpstate_active) { - fpu__activate_curr(fpu); - user_fpu_begin(); - } + fpu__drop(fpu); + + /* + * Make sure fpstate is cleared and initialized. 
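With the CR0.TS handling gone, the irq_ts_save()/irq_ts_restore() helpers removed above have nothing left to guard; in-kernel FPU/SIMD users only need the kernel_fpu_begin()/kernel_fpu_end() bracket. A minimal sketch of the calling convention (the checksum helpers are hypothetical, not kernel APIs):

#include <linux/types.h>
#include <asm/fpu/api.h>

u32 checksum_simd(const u8 *buf, size_t len);           /* hypothetical SIMD version */
u32 checksum_generic(const u8 *buf, size_t len);        /* hypothetical scalar fallback */

static u32 checksum(const u8 *buf, size_t len)
{
        u32 sum;

        /* false if the kernel already owns the FPU, e.g. in a nested context */
        if (!irq_fpu_usable())
                return checksum_generic(buf, len);

        kernel_fpu_begin();             /* preemption disabled, XMM/YMM are ours */
        sum = checksum_simd(buf, len);
        kernel_fpu_end();

        return sum;
}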
+ */ + if (static_cpu_has(X86_FEATURE_FPU)) { + fpu__activate_curr(fpu); + user_fpu_begin(); copy_init_fpstate_to_fpregs(); } } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 2f2b8c7ccb85..60dece392b3a 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -10,18 +10,6 @@ #include <linux/init.h> /* - * Initialize the TS bit in CR0 according to the style of context-switches - * we are using: - */ -static void fpu__init_cpu_ctx_switch(void) -{ - if (!boot_cpu_has(X86_FEATURE_EAGER_FPU)) - stts(); - else - clts(); -} - -/* * Initialize the registers found in all CPUs, CR0 and CR4: */ static void fpu__init_cpu_generic(void) @@ -58,7 +46,6 @@ void fpu__init_cpu(void) { fpu__init_cpu_generic(); fpu__init_cpu_xstate(); - fpu__init_cpu_ctx_switch(); } /* @@ -233,82 +220,16 @@ static void __init fpu__init_system_xstate_size_legacy(void) } /* - * FPU context switching strategies: - * - * Against popular belief, we don't do lazy FPU saves, due to the - * task migration complications it brings on SMP - we only do - * lazy FPU restores. - * - * 'lazy' is the traditional strategy, which is based on setting - * CR0::TS to 1 during context-switch (instead of doing a full - * restore of the FPU state), which causes the first FPU instruction - * after the context switch (whenever it is executed) to fault - at - * which point we lazily restore the FPU state into FPU registers. - * - * Tasks are of course under no obligation to execute FPU instructions, - * so it can easily happen that another context-switch occurs without - * a single FPU instruction being executed. If we eventually switch - * back to the original task (that still owns the FPU) then we have - * not only saved the restores along the way, but we also have the - * FPU ready to be used for the original task. - * - * 'lazy' is deprecated because it's almost never a performance win - * and it's much more complicated than 'eager'. - * - * 'eager' switching is by default on all CPUs, there we switch the FPU - * state during every context switch, regardless of whether the task - * has used FPU instructions in that time slice or not. This is done - * because modern FPU context saving instructions are able to optimize - * state saving and restoration in hardware: they can detect both - * unused and untouched FPU state and optimize accordingly. - * - * [ Note that even in 'lazy' mode we might optimize context switches - * to use 'eager' restores, if we detect that a task is using the FPU - * frequently. See the fpu->counter logic in fpu/internal.h for that. ] - */ -static enum { ENABLE, DISABLE } eagerfpu = ENABLE; - -/* * Find supported xfeatures based on cpu features and command-line input. * This must be called after fpu__init_parse_early_param() is called and * xfeatures_mask is enumerated. */ u64 __init fpu__get_supported_xfeatures_mask(void) { - /* Support all xfeatures known to us */ - if (eagerfpu != DISABLE) - return XCNTXT_MASK; - - /* Warning of xfeatures being disabled for no eagerfpu mode */ - if (xfeatures_mask & XFEATURE_MASK_EAGER) { - pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", - xfeatures_mask & XFEATURE_MASK_EAGER); - } - - /* Return a mask that masks out all features requiring eagerfpu mode */ - return ~XFEATURE_MASK_EAGER; + return XCNTXT_MASK; } -/* - * Disable features dependent on eagerfpu. 
- */ -static void __init fpu__clear_eager_fpu_features(void) -{ - setup_clear_cpu_cap(X86_FEATURE_MPX); -} - -/* - * Pick the FPU context switching strategy: - * - * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of - * the following is true: - * - * (1) the cpu has xsaveopt, as it has the optimization and doing eager - * FPU switching has a relatively low cost compared to a plain xsave; - * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU - * switching. Should the kernel boot with noxsaveopt, we support MPX - * with eager FPU switching at a higher cost. - */ +/* Legacy code to initialize eager fpu mode. */ static void __init fpu__init_system_ctx_switch(void) { static bool on_boot_cpu __initdata = 1; @@ -317,17 +238,6 @@ static void __init fpu__init_system_ctx_switch(void) on_boot_cpu = 0; WARN_ON_FPU(current->thread.fpu.fpstate_active); - - if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) - eagerfpu = ENABLE; - - if (xfeatures_mask & XFEATURE_MASK_EAGER) - eagerfpu = ENABLE; - - if (eagerfpu == ENABLE) - setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); - - printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); } /* @@ -336,11 +246,6 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { - if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { - eagerfpu = DISABLE; - fpu__clear_eager_fpu_features(); - } - if (cmdline_find_option_bool(boot_command_line, "no387")) setup_clear_cpu_cap(X86_FEATURE_FPU); @@ -375,14 +280,6 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) */ fpu__init_cpu(); - /* - * But don't leave CR0::TS set yet, as some of the FPU setup - * methods depend on being able to execute FPU instructions - * that will fault on a set TS, such as the FXSAVE in - * fpu__init_system_mxcsr(). 
- */ - clts(); - fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a184c210efba..83c23c230b4c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -340,11 +340,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } fpu->fpstate_active = 1; - if (use_eager_fpu()) { - preempt_disable(); - fpu__restore(fpu); - preempt_enable(); - } + preempt_disable(); + fpu__restore(fpu); + preempt_enable(); return err; } else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 095ef7ddd6ae..1d7770447b3e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -65,6 +65,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX); setup_clear_cpu_cap(X86_FEATURE_AVX2); setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); setup_clear_cpu_cap(X86_FEATURE_AVX512PF); setup_clear_cpu_cap(X86_FEATURE_AVX512ER); setup_clear_cpu_cap(X86_FEATURE_AVX512CD); @@ -73,6 +74,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512VL); setup_clear_cpu_cap(X86_FEATURE_MPX); setup_clear_cpu_cap(X86_FEATURE_XGETBV1); + setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); setup_clear_cpu_cap(X86_FEATURE_PKU); setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); @@ -890,15 +892,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; - /* - * For most XSAVE components, this would be an arduous task: - * brining fpstate up to date with fpregs, updating fpstate, - * then re-populating fpregs. But, for components that are - * never lazily managed, we can just access the fpregs - * directly. PKRU is never managed lazily, so we can just - * manipulate it directly. Make sure it stays that way. - */ - WARN_ON_ONCE(!use_eager_fpu()); /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b6b2f0264af3..4e8577d03372 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -63,6 +63,8 @@ #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif +#define SIZEOF_PTREGS 17*4 + /* * Number of possible pages in the lowmem region. * @@ -248,19 +250,19 @@ page_pde_offset = (__PAGE_OFFSET >> 20); #ifdef CONFIG_PARAVIRT /* This is can only trip for a broken bootloader... */ cmpw $0x207, pa(boot_params + BP_version) - jb default_entry + jb .Ldefault_entry /* Paravirt-compatible boot parameters. Look to see what architecture we're booting under. */ movl pa(boot_params + BP_hardware_subarch), %eax cmpl $num_subarch_entries, %eax - jae bad_subarch + jae .Lbad_subarch movl pa(subarch_entries)(,%eax,4), %eax subl $__PAGE_OFFSET, %eax jmp *%eax -bad_subarch: +.Lbad_subarch: WEAK(lguest_entry) WEAK(xen_entry) /* Unknown implementation; there's really @@ -270,14 +272,14 @@ WEAK(xen_entry) __INITDATA subarch_entries: - .long default_entry /* normal x86/PC */ + .long .Ldefault_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ - .long default_entry /* Moorestown MID */ + .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. 
- subarch_entries) / 4 .previous #else - jmp default_entry + jmp .Ldefault_entry #endif /* CONFIG_PARAVIRT */ #ifdef CONFIG_HOTPLUG_CPU @@ -289,7 +291,8 @@ num_subarch_entries = (. - subarch_entries) / 4 ENTRY(start_cpu0) movl initial_stack, %ecx movl %ecx, %esp - jmp *(initial_code) + call *(initial_code) +1: jmp 1b ENDPROC(start_cpu0) #endif @@ -317,7 +320,7 @@ ENTRY(startup_32_smp) call load_ucode_ap #endif -default_entry: +.Ldefault_entry: #define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ X86_CR0_PG) @@ -347,7 +350,7 @@ default_entry: pushfl popl %eax # get EFLAGS testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set? - jz enable_paging # hw disallowed setting of ID bit + jz .Lenable_paging # hw disallowed setting of ID bit # which means no CPUID and no CR4 xorl %eax,%eax @@ -357,13 +360,13 @@ default_entry: movl $1,%eax cpuid andl $~1,%edx # Ignore CPUID.FPU - jz enable_paging # No flags or only CPUID.FPU = no CR4 + jz .Lenable_paging # No flags or only CPUID.FPU = no CR4 movl pa(mmu_cr4_features),%eax movl %eax,%cr4 testb $X86_CR4_PAE, %al # check if PAE is enabled - jz enable_paging + jz .Lenable_paging /* Check if extended functions are implemented */ movl $0x80000000, %eax @@ -371,7 +374,7 @@ default_entry: /* Value must be in the range 0x80000001 to 0x8000ffff */ subl $0x80000001, %eax cmpl $(0x8000ffff-0x80000001), %eax - ja enable_paging + ja .Lenable_paging /* Clear bogus XD_DISABLE bits */ call verify_cpu @@ -380,7 +383,7 @@ default_entry: cpuid /* Execute Disable bit supported? */ btl $(X86_FEATURE_NX & 31), %edx - jnc enable_paging + jnc .Lenable_paging /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx @@ -390,7 +393,7 @@ default_entry: /* Make changes effective */ wrmsr -enable_paging: +.Lenable_paging: /* * Enable paging @@ -419,7 +422,7 @@ enable_paging: */ movb $4,X86 # at least 486 cmpl $-1,X86_CPUID - je is486 + je .Lis486 /* get vendor info */ xorl %eax,%eax # call CPUID with 0 -> return vendor ID @@ -430,7 +433,7 @@ enable_paging: movl %ecx,X86_VENDOR_ID+8 # last 4 chars orl %eax,%eax # do we have processor info as well? - je is486 + je .Lis486 movl $1,%eax # Use the CPUID instruction to get CPU type cpuid @@ -444,7 +447,7 @@ enable_paging: movb %cl,X86_MASK movl %edx,X86_CAPABILITY -is486: +.Lis486: movl $0x50022,%ecx # set AM, WP, NE and MP movl %cr0,%eax andl $0x80000011,%eax # Save PG,PE,ET @@ -470,8 +473,9 @@ is486: xorl %eax,%eax # Clear LDT lldt %ax - pushl $0 # fake return address for unwinder - jmp *(initial_code) + call *(initial_code) +1: jmp 1b +ENDPROC(startup_32_smp) #include "verify_cpu.S" @@ -665,14 +669,17 @@ __PAGE_ALIGNED_BSS initial_pg_pmd: .fill 1024*KPMDS,4,0 #else -ENTRY(initial_page_table) +.globl initial_page_table +initial_page_table: .fill 1024,4,0 #endif initial_pg_fixmap: .fill 1024,4,0 -ENTRY(empty_zero_page) +.globl empty_zero_page +empty_zero_page: .fill 4096,1,0 -ENTRY(swapper_pg_dir) +.globl swapper_pg_dir +swapper_pg_dir: .fill 1024,4,0 EXPORT_SYMBOL(empty_zero_page) @@ -706,7 +713,12 @@ ENTRY(initial_page_table) .data .balign 4 ENTRY(initial_stack) - .long init_thread_union+THREAD_SIZE + /* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel + * unwinder reliably detect the end of the stack. 
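The new SIZEOF_PTREGS constant mirrors sizeof(struct pt_regs), 17 slots of 4 bytes on 32-bit, so the initial stack leaves room for a fake register frame that the unwinder can recognise as the end of the stack. A compile-time cross-check along these lines (illustrative only, not part of the patch) is one way to keep the asm constant honest:

#include <linux/bug.h>
#include <asm/ptrace.h>

/* must match SIZEOF_PTREGS in head_32.S: 17 slots of 4 bytes on 32-bit */
#define ASM_SIZEOF_PTREGS       (17 * 4)

static inline void check_sizeof_ptregs(void)
{
        BUILD_BUG_ON(sizeof(struct pt_regs) != ASM_SIZEOF_PTREGS);
}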
+ */ + .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \ + TOP_OF_KERNEL_STACK_PADDING; __INITRODATA int_msg: diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b4421cc191b0..90de28841242 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -66,13 +66,8 @@ startup_64: * tables and then reload them. */ - /* - * Setup stack for verify_cpu(). "-8" because initial_stack is defined - * this way, see below. Our best guess is a NULL ptr for stack - * termination heuristics and we don't want to break anything which - * might depend on it (kgdb, ...). - */ - leaq (__end_init_task - 8)(%rip), %rsp + /* Set up the stack for verify_cpu(), similar to initial_stack below */ + leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp /* Sanitize CPU configuration */ call verify_cpu @@ -117,20 +112,20 @@ startup_64: movq %rdi, %rax shrq $PGDIR_SHIFT, %rax - leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx movq %rdx, 0(%rbx,%rax,8) movq %rdx, 8(%rbx,%rax,8) - addq $4096, %rdx + addq $PAGE_SIZE, %rdx movq %rdi, %rax shrq $PUD_SHIFT, %rax andl $(PTRS_PER_PUD-1), %eax - movq %rdx, 4096(%rbx,%rax,8) + movq %rdx, PAGE_SIZE(%rbx,%rax,8) incl %eax andl $(PTRS_PER_PUD-1), %eax - movq %rdx, 4096(%rbx,%rax,8) + movq %rdx, PAGE_SIZE(%rbx,%rax,8) - addq $8192, %rbx + addq $PAGE_SIZE * 2, %rbx movq %rdi, %rax shrq $PMD_SHIFT, %rdi addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax @@ -147,6 +142,9 @@ startup_64: decl %ecx jnz 1b + test %rbp, %rbp + jz .Lskip_fixup + /* * Fixup the kernel text+data virtual addresses. Note that * we might write invalid pmds, when the kernel is relocated @@ -154,9 +152,9 @@ startup_64: * beyond _end. */ leaq level2_kernel_pgt(%rip), %rdi - leaq 4096(%rdi), %r8 + leaq PAGE_SIZE(%rdi), %r8 /* See if it is a valid page table entry */ -1: testb $1, 0(%rdi) +1: testb $_PAGE_PRESENT, 0(%rdi) jz 2f addq %rbp, 0(%rdi) /* Go to the next page */ @@ -167,6 +165,7 @@ startup_64: /* Fixup phys_base */ addq %rbp, phys_base(%rip) +.Lskip_fixup: movq $(early_level4_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) @@ -265,13 +264,17 @@ ENTRY(secondary_startup_64) movl $MSR_GS_BASE,%ecx movl initial_gs(%rip),%eax movl initial_gs+4(%rip),%edx - wrmsr + wrmsr /* rsi is pointer to real mode structure with interesting info. pass it to C */ movq %rsi, %rdi - - /* Finally jump to run C code and to be on real kernel address + jmp start_cpu +ENDPROC(secondary_startup_64) + +ENTRY(start_cpu) + /* + * Jump to run C code and to be on a real kernel address. * Since we are running on identity-mapped space we have to jump * to the full 64bit address, this is only possible as indirect * jump. In addition we need to ensure %cs is set so we make this @@ -295,12 +298,13 @@ ENTRY(secondary_startup_64) * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, * address given in m16:64. */ - movq initial_code(%rip),%rax - pushq $0 # fake return address to stop unwinder + call 1f # put return address on stack for unwinder +1: xorq %rbp, %rbp # clear frame pointer + movq initial_code(%rip), %rax pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq -ENDPROC(secondary_startup_64) +ENDPROC(start_cpu) #include "verify_cpu.S" @@ -308,15 +312,11 @@ ENDPROC(secondary_startup_64) /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set * up already except stack. We just set up stack here. Then call - * start_secondary(). + * start_secondary() via start_cpu(). 
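On the 64-bit side, the same hunks also swap bare 4096/8192 literals for PAGE_SIZE and test _PAGE_PRESENT by name. Rendered in C, the level2_kernel_pgt fixup loop looks roughly like the sketch below; load_delta stands for the relocation offset the asm keeps in %rbp, and none of this code is added by the patch itself:

#include <asm/pgtable.h>

/* illustrative C rendering of the asm fixup loop over level2_kernel_pgt */
static void __init example_fixup_kernel_pmds(unsigned long load_delta)
{
        pmd_t *pmd = level2_kernel_pgt;
        int i;

        for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
                /* only relocate entries that are actually present */
                if (!(pmd_val(*pmd) & _PAGE_PRESENT))
                        continue;
                *pmd = __pmd(pmd_val(*pmd) + load_delta);
        }
}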
*/ ENTRY(start_cpu0) - movq initial_stack(%rip),%rsp - movq initial_code(%rip),%rax - pushq $0 # fake return address to stop unwinder - pushq $__KERNEL_CS # set correct cs - pushq %rax # target address in negative space - lretq + movq initial_stack(%rip), %rsp + jmp start_cpu ENDPROC(start_cpu0) #endif @@ -328,7 +328,11 @@ ENDPROC(start_cpu0) GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) GLOBAL(initial_stack) - .quad init_thread_union+THREAD_SIZE-8 + /* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel + * unwinder reliably detect the end of the stack. + */ + .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS __FINITDATA bad_address: diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9f669fdd2010..7c6e9ffe4424 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -14,7 +14,6 @@ #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/irq.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/hw_irq.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 9ebd0b0e73d9..6b0678a541e2 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -16,7 +16,6 @@ #include <linux/uaccess.h> #include <linux/smp.h> #include <asm/io_apic.h> -#include <asm/idle.h> #include <asm/apic.h> int sysctl_panic_on_stackoverflow; diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c new file mode 100644 index 000000000000..cb9c1ed1d391 --- /dev/null +++ b/arch/x86/kernel/itmt.c @@ -0,0 +1,215 @@ +/* + * itmt.c: Support Intel Turbo Boost Max Technology 3.0 + * + * (C) Copyright 2016 Intel Corporation + * Author: Tim Chen <tim.c.chen@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT), + * the maximum turbo frequencies of some cores in a CPU package may be + * higher than for the other cores in the same package. In that case, + * better performance can be achieved by making the scheduler prefer + * to run tasks on the CPUs with higher max turbo frequencies. + * + * This file provides functions and data structures for enabling the + * scheduler to favor scheduling on cores can be boosted to a higher + * frequency under ITMT. + */ + +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/cpuset.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/sysctl.h> +#include <linux/nodemask.h> + +static DEFINE_MUTEX(itmt_update_mutex); +DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority); + +/* Boolean to track if system has ITMT capabilities */ +static bool __read_mostly sched_itmt_capable; + +/* + * Boolean to control whether we want to move processes to cpu capable + * of higher turbo frequency for cpus supporting Intel Turbo Boost Max + * Technology 3.0. 
+ * + * It can be set via /proc/sys/kernel/sched_itmt_enabled + */ +unsigned int __read_mostly sysctl_sched_itmt_enabled; + +static int sched_itmt_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + unsigned int old_sysctl; + int ret; + + mutex_lock(&itmt_update_mutex); + + if (!sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return -EINVAL; + } + + old_sysctl = sysctl_sched_itmt_enabled; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) { + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); + + return ret; +} + +static unsigned int zero; +static unsigned int one = 1; +static struct ctl_table itmt_kern_table[] = { + { + .procname = "sched_itmt_enabled", + .data = &sysctl_sched_itmt_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_itmt_update_handler, + .extra1 = &zero, + .extra2 = &one, + }, + {} +}; + +static struct ctl_table itmt_root_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = itmt_kern_table, + }, + {} +}; + +static struct ctl_table_header *itmt_sysctl_header; + +/** + * sched_set_itmt_support() - Indicate platform supports ITMT + * + * This function is used by the OS to indicate to scheduler that the platform + * is capable of supporting the ITMT feature. + * + * The current scheme has the pstate driver detects if the system + * is ITMT capable and call sched_set_itmt_support. + * + * This must be done only after sched_set_itmt_core_prio + * has been called to set the cpus' priorities. + * It must not be called with cpu hot plug lock + * held as we need to acquire the lock to rebuild sched domains + * later. + * + * Return: 0 on success + */ +int sched_set_itmt_support(void) +{ + mutex_lock(&itmt_update_mutex); + + if (sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return 0; + } + + itmt_sysctl_header = register_sysctl_table(itmt_root_table); + if (!itmt_sysctl_header) { + mutex_unlock(&itmt_update_mutex); + return -ENOMEM; + } + + sched_itmt_capable = true; + + sysctl_sched_itmt_enabled = 1; + + if (sysctl_sched_itmt_enabled) { + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); + + return 0; +} + +/** + * sched_clear_itmt_support() - Revoke platform's support of ITMT + * + * This function is used by the OS to indicate that it has + * revoked the platform's support of ITMT feature. + * + * It must not be called with cpu hot plug lock + * held as we need to acquire the lock to rebuild sched domains + * later. 
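For context, the expected caller of these hooks is the P-state driver: it records a per-core priority derived from each core's maximum boost ratio and only then announces ITMT support, which registers the sysctl and rebuilds the sched domains. A hedged sketch of that sequence; example_read_max_boost_ratio() is a stand-in, and the prototypes are the ones this series adds to asm/topology.h:

#include <linux/cpumask.h>
#include <linux/printk.h>
#include <asm/topology.h>

int example_read_max_boost_ratio(int cpu);      /* hypothetical, e.g. read from an MSR */

static void example_enable_itmt(void)
{
        int cpu;

        /* priorities first: sched_set_itmt_core_prio() spreads the value over
         * the SMT siblings of each core */
        for_each_online_cpu(cpu)
                sched_set_itmt_core_prio(example_read_max_boost_ratio(cpu), cpu);

        /* then announce support: registers the sysctl and rebuilds the sched
         * domains using the priorities set above */
        if (sched_set_itmt_support())
                pr_warn("example: could not enable ITMT\n");
}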
+ */ +void sched_clear_itmt_support(void) +{ + mutex_lock(&itmt_update_mutex); + + if (!sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return; + } + sched_itmt_capable = false; + + if (itmt_sysctl_header) { + unregister_sysctl_table(itmt_sysctl_header); + itmt_sysctl_header = NULL; + } + + if (sysctl_sched_itmt_enabled) { + /* disable sched_itmt if we are no longer ITMT capable */ + sysctl_sched_itmt_enabled = 0; + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); +} + +int arch_asym_cpu_priority(int cpu) +{ + return per_cpu(sched_core_priority, cpu); +} + +/** + * sched_set_itmt_core_prio() - Set CPU priority based on ITMT + * @prio: Priority of cpu core + * @core_cpu: The cpu number associated with the core + * + * The pstate driver will find out the max boost frequency + * and call this function to set a priority proportional + * to the max boost frequency. CPU with higher boost + * frequency will receive higher priority. + * + * No need to rebuild sched domain after updating + * the CPU priorities. The sched domains have no + * dependency on CPU priorities. + */ +void sched_set_itmt_core_prio(int prio, int core_cpu) +{ + int cpu, i = 1; + + for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { + int smt_prio; + + /* + * Ensure that the siblings are moved to the end + * of the priority chain and only used when + * all other high priority cpus are out of capacity. + */ + smt_prio = prio * smp_num_siblings / i; + per_cpu(sched_core_priority, cpu) = smt_prio; + i++; + } +} diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index edbbfc854e39..36bc66416021 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -42,7 +42,6 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/tlbflush.h> -#include <asm/idle.h> #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> @@ -267,13 +266,11 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ prev_state = exception_enter(); - exit_idle(); kvm_async_pf_task_wait((u32)read_cr2()); exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); - exit_idle(); kvm_async_pf_task_wake((u32)read_cr2()); rcu_irq_exit(); break; @@ -308,7 +305,7 @@ static void kvm_register_steal_time(void) static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; -static void kvm_guest_apic_eoi_write(u32 reg, u32 val) +static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) { /** * This relies on __test_and_clear_bit to modify the memory @@ -319,7 +316,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val) */ if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) return; - apic_write(APIC_EOI, APIC_EOI_ACK); + apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK); } static void kvm_guest_cpu_init(void) @@ -592,6 +589,14 @@ out: local_irq_restore(flags); } +__visible bool __kvm_vcpu_is_preempted(int cpu) +{ + struct kvm_steal_time *src = &per_cpu(steal_time, cpu); + + return !!src->preempted; +} +PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); + /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. 
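The point of the new op is the generic vcpu_is_preempted(cpu) helper that this series wires up for lock code: a waiter should stop spinning once the owner's vCPU has been descheduled by the host, since the owner cannot make progress anyway. The sketch below mirrors the mutex/rwsem owner-spin loops rather than anything in this patch:

#include <linux/sched.h>

static bool example_spin_on_owner(struct task_struct *owner, int owner_cpu)
{
        while (READ_ONCE(owner->on_cpu)) {
                /* on bare metal the native op makes vcpu_is_preempted()
                 * constant false, so the extra check is essentially free */
                if (need_resched() || vcpu_is_preempted(owner_cpu))
                        return false;   /* stop wasting cycles, sleep instead */
                cpu_relax();
        }
        return true;                    /* owner stopped running; re-check the lock */
}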
*/ @@ -608,6 +613,11 @@ void __init kvm_spinlock_init(void) pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = kvm_wait; pv_lock_ops.kick = kvm_kick_cpu; + + if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + pv_lock_ops.vcpu_is_preempted = + PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); + } } static __init int kvm_spinlock_init_jump(void) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 6707039b9032..f09df2ff1bcc 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -34,10 +34,10 @@ static void flush_ldt(void *current_mm) } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ -static struct ldt_struct *alloc_ldt_struct(int size) +static struct ldt_struct *alloc_ldt_struct(unsigned int size) { struct ldt_struct *new_ldt; - int alloc_size; + unsigned int alloc_size; if (size > LDT_ENTRIES) return NULL; @@ -207,11 +207,11 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount) static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; + struct ldt_struct *new_ldt, *old_ldt; + unsigned int oldsize, newsize; + struct user_desc ldt_info; struct desc_struct ldt; int error; - struct user_desc ldt_info; - int oldsize, newsize; - struct ldt_struct *new_ldt, *old_ldt; error = -EINVAL; if (bytecount != sizeof(ldt_info)) @@ -249,7 +249,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) old_ldt = mm->context.ldt; oldsize = old_ldt ? old_ldt->size : 0; - newsize = max((int)(ldt_info.entry_number + 1), oldsize); + newsize = max(ldt_info.entry_number + 1, oldsize); error = -ENOMEM; new_ldt = alloc_ldt_struct(newsize); diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index efe73aacf966..7b0d3da52fb4 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -18,8 +18,10 @@ #ifdef CC_USING_FENTRY # define function_hook __fentry__ +EXPORT_SYMBOL(__fentry__) #else # define function_hook mcount +EXPORT_SYMBOL(mcount) #endif /* All cases save the original rbp (8 bytes) */ @@ -295,7 +297,6 @@ trace: jmp fgraph_trace END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ -EXPORT_SYMBOL(function_hook) #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7f3550acde1b..f5e3ff835cc8 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -44,6 +44,7 @@ #include <asm/msr.h> static struct class *msr_class; +static enum cpuhp_state cpuhp_msr_state; static ssize_t msr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -180,7 +181,7 @@ static const struct file_operations msr_fops = { .compat_ioctl = msr_ioctl, }; -static int msr_device_create(int cpu) +static int msr_device_create(unsigned int cpu) { struct device *dev; @@ -189,34 +190,12 @@ static int msr_device_create(int cpu) return PTR_ERR_OR_ZERO(dev); } -static void msr_device_destroy(int cpu) +static int msr_device_destroy(unsigned int cpu) { device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + return 0; } -static int msr_class_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int err = 0; - - switch (action) { - case CPU_UP_PREPARE: - err = msr_device_create(cpu); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - msr_device_destroy(cpu); - break; - } - return notifier_from_errno(err); -} - -static struct 
notifier_block __refdata msr_class_cpu_notifier = { - .notifier_call = msr_class_cpu_callback, -}; - static char *msr_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); @@ -224,13 +203,11 @@ static char *msr_devnode(struct device *dev, umode_t *mode) static int __init msr_init(void) { - int i, err = 0; - i = 0; + int err; if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { pr_err("unable to get major %d for msr\n", MSR_MAJOR); - err = -EBUSY; - goto out; + return -EBUSY; } msr_class = class_create(THIS_MODULE, "msr"); if (IS_ERR(msr_class)) { @@ -239,44 +216,28 @@ static int __init msr_init(void) } msr_class->devnode = msr_devnode; - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - err = msr_device_create(i); - if (err != 0) - goto out_class; - } - __register_hotcpu_notifier(&msr_class_cpu_notifier); - cpu_notifier_register_done(); - - err = 0; - goto out; + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online", + msr_device_create, msr_device_destroy); + if (err < 0) + goto out_class; + cpuhp_msr_state = err; + return 0; out_class: - i = 0; - for_each_online_cpu(i) - msr_device_destroy(i); - cpu_notifier_register_done(); + cpuhp_remove_state(cpuhp_msr_state); class_destroy(msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); -out: return err; } +module_init(msr_init); static void __exit msr_exit(void) { - int cpu = 0; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - msr_device_destroy(cpu); + cpuhp_remove_state(cpuhp_msr_state); class_destroy(msr_class); __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); - __unregister_hotcpu_notifier(&msr_class_cpu_notifier); - cpu_notifier_register_done(); } - -module_init(msr_init); module_exit(msr_exit) MODULE_AUTHOR("H. 
Peter Anvin <hpa@zytor.com>"); diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 2c55a003b793..6d4bf812af45 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -12,7 +12,6 @@ __visible void __native_queued_spin_unlock(struct qspinlock *lock) { native_queued_spin_unlock(lock); } - PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); bool pv_is_native_spin_unlock(void) @@ -21,12 +20,25 @@ bool pv_is_native_spin_unlock(void) __raw_callee_save___native_queued_spin_unlock; } +__visible bool __native_vcpu_is_preempted(int cpu) +{ + return false; +} +PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted); + +bool pv_is_native_vcpu_is_preempted(void) +{ + return pv_lock_ops.vcpu_is_preempted.func == + __raw_callee_save___native_vcpu_is_preempted; +} + struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .wait = paravirt_nop, .kick = paravirt_nop, + .vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted), #endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bbf3d5933eaa..a1bfba0f7234 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -328,7 +328,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .cpuid = native_cpuid, .get_debugreg = native_get_debugreg, .set_debugreg = native_set_debugreg, - .clts = native_clts, .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, .read_cr4 = native_read_cr4, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 920c6ae08592..d33ef165b1f8 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -8,10 +8,10 @@ DEF_NATIVE(pv_cpu_ops, iret, "iret"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); -DEF_NATIVE(pv_cpu_ops, clts, "clts"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); +DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax"); #endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) @@ -27,6 +27,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) } extern bool pv_is_native_spin_unlock(void); +extern bool pv_is_native_vcpu_is_preempted(void); unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -48,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_cpu_ops, clts); #if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { @@ -56,9 +56,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, end = end_pv_lock_ops_queued_spin_unlock; goto patch_site; } + goto patch_default; + + case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) { + start = start_pv_lock_ops_vcpu_is_preempted; + end = end_pv_lock_ops_vcpu_is_preempted; + goto patch_site; + } + goto patch_default; #endif default: +patch_default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index bb3840cedb4f..f4fcf26c9fce 
100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); -DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); @@ -21,6 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); +DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax"); #endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) @@ -36,6 +36,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) } extern bool pv_is_native_spin_unlock(void); +extern bool pv_is_native_vcpu_is_preempted(void); unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -58,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) @@ -68,9 +68,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, end = end_pv_lock_ops_queued_spin_unlock; goto patch_site; } + goto patch_default; + + case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) { + start = start_pv_lock_ops_vcpu_is_preempted; + end = end_pv_lock_ops_vcpu_is_preempted; + goto patch_site; + } + goto patch_default; #endif default: +patch_default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0888a879120f..43c36d8a6ae2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -23,7 +23,6 @@ #include <asm/cpu.h> #include <asm/apic.h> #include <asm/syscalls.h> -#include <asm/idle.h> #include <asm/uaccess.h> #include <asm/mwait.h> #include <asm/fpu/internal.h> @@ -65,23 +64,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { }; EXPORT_PER_CPU_SYMBOL(cpu_tss); -#ifdef CONFIG_X86_64 -static DEFINE_PER_CPU(unsigned char, is_idle); -static ATOMIC_NOTIFIER_HEAD(idle_notifier); - -void idle_notifier_register(struct notifier_block *n) -{ - atomic_notifier_chain_register(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_unregister); -#endif - /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. 
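Taken together, the kvm.c, paravirt-spinlocks.c and paravirt_patch_32/64.c hunks follow the usual recipe for a hot paravirt op: a plain C implementation wrapped by PV_CALLEE_SAVE_REGS_THUNK() so call sites keep their registers, installation via PV_CALLEE_SAVE(), and a native template ("xor %eax, %eax") plus a pv_is_native_*() predicate for the patcher. Condensed from the hunks above, with steal_time being the per-CPU area kvm.c already defines and example_install() an illustrative wrapper:

#include <linux/kvm_para.h>
#include <linux/percpu.h>
#include <asm/paravirt.h>

/* guest-side implementation: the host flags a vCPU as preempted in its
 * steal-time area when it deschedules it */
__visible bool __kvm_vcpu_is_preempted(int cpu)
{
        struct kvm_steal_time *src = &per_cpu(steal_time, cpu);

        return !!src->preempted;
}
PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);

static void __init example_install(void)
{
        /* only meaningful when the host exports steal time information */
        if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
                pv_lock_ops.vcpu_is_preempted =
                        PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
}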
@@ -251,39 +233,9 @@ static inline void play_dead(void) } #endif -#ifdef CONFIG_X86_64 -void enter_idle(void) -{ - this_cpu_write(is_idle, 1); - atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); -} - -static void __exit_idle(void) -{ - if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) - return; - atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); -} - -/* Called from interrupts to signify idle end */ -void exit_idle(void) -{ - /* idle loop has pid 0 */ - if (current->pid) - return; - __exit_idle(); -} -#endif - void arch_cpu_idle_enter(void) { local_touch_nmi(); - enter_idle(); -} - -void arch_cpu_idle_exit(void) -{ - __exit_idle(); } void arch_cpu_idle_dead(void) @@ -336,59 +288,33 @@ void stop_this_cpu(void *dummy) halt(); } -bool amd_e400_c1e_detected; -EXPORT_SYMBOL(amd_e400_c1e_detected); - -static cpumask_var_t amd_e400_c1e_mask; - -void amd_e400_remove_cpu(int cpu) -{ - if (amd_e400_c1e_mask != NULL) - cpumask_clear_cpu(cpu, amd_e400_c1e_mask); -} - /* - * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt - * pending message MSR. If we detect C1E, then we handle it the same - * way as C3 power states (local apic timer and TSC stop) + * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power + * states (local apic timer and TSC stop). */ static void amd_e400_idle(void) { - if (!amd_e400_c1e_detected) { - u32 lo, hi; - - rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); - - if (lo & K8_INTP_C1E_ACTIVE_MASK) { - amd_e400_c1e_detected = true; - if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) - mark_tsc_unstable("TSC halt in AMD C1E"); - pr_info("System has AMD C1E enabled\n"); - } + /* + * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E + * gets set after static_cpu_has() places have been converted via + * alternatives. + */ + if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { + default_idle(); + return; } - if (amd_e400_c1e_detected) { - int cpu = smp_processor_id(); + tick_broadcast_enter(); - if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { - cpumask_set_cpu(cpu, amd_e400_c1e_mask); - /* Force broadcast so ACPI can not interfere. */ - tick_broadcast_force(); - pr_info("Switch to broadcast mode on CPU%d\n", cpu); - } - tick_broadcast_enter(); - - default_idle(); + default_idle(); - /* - * The switch back from broadcast mode needs to be - * called with interrupts disabled. - */ - local_irq_disable(); - tick_broadcast_exit(); - local_irq_enable(); - } else - default_idle(); + /* + * The switch back from broadcast mode needs to be called with + * interrupts disabled. 
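The comment in amd_e400_idle() deserves emphasis: static_cpu_has_bug() is resolved by alternatives patching early in boot, so it can never observe X86_BUG_AMD_APIC_C1E, which this patch only sets from arch_post_acpi_subsys_init() once ACPI is up. Any feature or bug bit that may be set that late has to be tested with the dynamic form, roughly:

#include <linux/tick.h>
#include <asm/cpufeature.h>

/* runs well after alternatives have been applied (post-ACPI in this patch) */
static void __init example_late_bug_detect(void)
{
        boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);
}

static void example_idle(void)
{
        /*
         * boot_cpu_has_bug() reads the bit at run time and therefore sees the
         * late setting above; a static_cpu_has_bug() check here would already
         * have been patched to its "bug not present" branch.
         */
        if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E))
                tick_broadcast_enter();
}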
+ */ + local_irq_disable(); + tick_broadcast_exit(); + local_irq_enable(); } /* @@ -448,8 +374,7 @@ void select_idle_routine(const struct cpuinfo_x86 *c) if (x86_idle || boot_option_idle_override == IDLE_POLL) return; - if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) { - /* E400: APIC timer interrupt does not wake up CPU from C1e */ + if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { pr_info("using AMD E400 aware idle routine\n"); x86_idle = amd_e400_idle; } else if (prefer_mwait_c1_over_halt(c)) { @@ -459,11 +384,37 @@ void select_idle_routine(const struct cpuinfo_x86 *c) x86_idle = default_idle; } -void __init init_amd_e400_c1e_mask(void) +void amd_e400_c1e_apic_setup(void) +{ + if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { + pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id()); + local_irq_disable(); + tick_broadcast_force(); + local_irq_enable(); + } +} + +void __init arch_post_acpi_subsys_init(void) { - /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */ - if (x86_idle == amd_e400_idle) - zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL); + u32 lo, hi; + + if (!boot_cpu_has_bug(X86_BUG_AMD_E400)) + return; + + /* + * AMD E400 detection needs to happen after ACPI has been enabled. If + * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in + * MSR_K8_INT_PENDING_MSG. + */ + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (!(lo & K8_INTP_C1E_ACTIVE_MASK)) + return; + + boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E); + + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + pr_info("System has AMD C1E enabled\n"); } static int __init idle_setup(char *str) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bd7be8efdc4c..d0d744108594 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -49,7 +49,6 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> -#include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> @@ -72,10 +71,9 @@ void __show_regs(struct pt_regs *regs, int all) savesegment(gs, gs); } - printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", - (u16)regs->cs, regs->ip, regs->flags, - smp_processor_id()); - print_symbol("EIP is at %s\n", regs->ip); + printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip); + printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags, + smp_processor_id()); printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); @@ -232,11 +230,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - fpu_switch_t fpu_switch; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); + switch_fpu_prepare(prev_fpu, cpu); /* * Save away %gs. 
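The dumpstack and __show_regs() conversions in this patch drop print_symbol()/printk_address() in favour of the printk pointer formats: %pS resolves a code address to symbol+offset, and %pB does the same after stepping the address back into the calling instruction, which is the right choice for return addresses pulled off a stack. For instance:

#include <linux/printk.h>

static void example_show_where(unsigned long ip, unsigned long ret_addr)
{
        /* the instruction we are currently at */
        printk(KERN_DEFAULT "IP: %pS\n", (void *)ip);

        /* a saved return address: %pB reports the call site, so an address
         * sitting exactly on a symbol boundary is attributed to the caller */
        printk(KERN_DEFAULT "called from: %pB\n", (void *)ret_addr);
}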
No need to save %fs, as it was saved on the @@ -295,7 +292,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); - switch_fpu_finish(next_fpu, fpu_switch); + switch_fpu_finish(next_fpu, cpu); this_cpu_write(current_task, next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3760b3c1ca0..a76b65e3e615 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -44,7 +44,6 @@ #include <asm/desc.h> #include <asm/proto.h> #include <asm/ia32.h> -#include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> @@ -61,10 +60,15 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); - printk_address(regs->ip); - printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, - regs->sp, regs->flags); + printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, + (void *)regs->ip); + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, + regs->sp, regs->flags); + if (regs->orig_ax != -1) + pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); + else + pr_cont("\n"); + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", @@ -265,9 +269,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); unsigned prev_fsindex, prev_gsindex; - fpu_switch_t fpu_switch; - fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); + switch_fpu_prepare(prev_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -417,7 +420,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) prev->gsbase = 0; prev->gsindex = prev_gsindex; - switch_fpu_finish(next_fpu, fpu_switch); + switch_fpu_finish(next_fpu, cpu); /* * Switch the PDA and FPU contexts. diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 51402a7e4ca6..0bee04d41bed 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -625,8 +625,6 @@ static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, amd_disable_seq_and_redirect_scrub); -#endif - #if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) #include <linux/jump_label.h> #include <asm/string_64.h> @@ -657,3 +655,4 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); #endif +#endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbfbca5fea0c..9c337b0e8ba7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1221,11 +1221,16 @@ void __init setup_arch(char **cmdline_p) */ get_smp_config(); + /* + * Systems w/o ACPI and mptables might not have it mapped the local + * APIC yet, but prefill_possible_map() might need to access it. 
+ */ + init_apic_mappings(); + prefill_possible_map(); init_cpu_to_node(); - init_apic_mappings(); io_apic_init_mappings(); kvm_guest_init(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2bbd27f89802..9820d6d977c6 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -1,7 +1,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/bootmem.h> #include <linux/percpu.h> @@ -12,6 +12,7 @@ #include <linux/pfn.h> #include <asm/sections.h> #include <asm/processor.h> +#include <asm/desc.h> #include <asm/setup.h> #include <asm/mpspec.h> #include <asm/apicdef.h> diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index c00cb64bc0a1..68f8cc222f25 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -261,10 +261,8 @@ static inline void __smp_reschedule_interrupt(void) __visible void smp_reschedule_interrupt(struct pt_regs *regs) { - irq_enter(); ack_APIC_irq(); __smp_reschedule_interrupt(); - irq_exit(); /* * KVM uses this interrupt to force a cpu out of guest mode */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 42f5eb7b4f6c..0c37d4fd01b2 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -58,7 +58,6 @@ #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> -#include <asm/idle.h> #include <asm/realmode.h> #include <asm/cpu.h> #include <asm/numa.h> @@ -109,6 +108,17 @@ static bool logical_packages_frozen __read_mostly; /* Maximum number of SMT threads on any online core */ int __max_smt_threads __read_mostly; +/* Flag to indicate if a complete sched domain rebuild is required */ +bool x86_topology_update; + +int arch_update_cpu_topology(void) +{ + int retval = x86_topology_update; + + x86_topology_update = false; + return retval; +} + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; @@ -471,22 +481,42 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +static inline int x86_sched_itmt_flags(void) +{ + return sysctl_sched_itmt_enabled ? 
SD_ASYM_PACKING : 0; +} + +#ifdef CONFIG_SCHED_MC +static int x86_core_flags(void) +{ + return cpu_core_flags() | x86_sched_itmt_flags(); +} +#endif +#ifdef CONFIG_SCHED_SMT +static int x86_smt_flags(void) +{ + return cpu_smt_flags() | x86_sched_itmt_flags(); +} +#endif +#endif + static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif { NULL, }, }; static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, @@ -821,14 +851,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -void smp_announce(void) -{ - int num_nodes = num_online_nodes(); - - printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n", - num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus()); -} - /* reduce the number of lines printed when booting a large cpu count system */ static void announce_cpu(int cpu, int apicid) { @@ -964,9 +986,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int cpu0_nmi_registered = 0; unsigned long timeout; - idle->thread.sp = (unsigned long) (((struct pt_regs *) - (THREAD_SIZE + task_stack_page(idle))) - 1); - + idle->thread.sp = (unsigned long)task_pt_regs(idle); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; @@ -1111,7 +1131,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) return err; /* the FPU context is blank, nobody can own it */ - __cpu_disable_lazy_restore(cpu); + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; common_cpu_up(cpu, tidle); @@ -1331,7 +1351,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) default_setup_apic_routing(); cpu0_logical_apicid = apic_bsp_setup(false); - pr_info("CPU%d: ", 0); + pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); if (is_uv_system()) @@ -1575,7 +1595,6 @@ void play_dead_common(void) { idle_task_exit(); reset_lazy_tlbstate(); - amd_e400_remove_cpu(raw_smp_processor_id()); /* Ack it */ (void)cpu_report_death(); diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c index 764a29f84de7..85195d447a92 100644 --- a/arch/x86/kernel/sysfb_simplefb.c +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -66,13 +66,36 @@ __init int create_simplefb(const struct screen_info *si, { struct platform_device *pd; struct resource res; - unsigned long len; + u64 base, size; + u32 length; - /* don't use lfb_size as it may contain the whole VMEM instead of only - * the part that is occupied by the framebuffer */ - len = mode->height * mode->stride; - len = PAGE_ALIGN(len); - if (len > (u64)si->lfb_size << 16) { + /* + * If the 64BIT_BASE capability is set, ext_lfb_base will contain the + * upper half of the base address. Assemble the address, then make sure + * it is valid and we can actually access it. 
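To make the comment above concrete, here is a small worked example of the base assembly and the truncation check, using hypothetical firmware values that are not taken from this patch:

static void lfb_base_example(void)
{
	u32 lfb_base = 0xe0000000;	/* si->lfb_base (hypothetical) */
	u32 ext_lfb_base = 0x1;		/* si->ext_lfb_base (hypothetical) */
	u64 base = lfb_base;

	/* Only done when VIDEO_CAPABILITY_64BIT_BASE is advertised. */
	base |= (u64)ext_lfb_base << 32;

	/*
	 * base == 0x1e0000000.  On a configuration where resource_size_t is
	 * only 32 bits wide, (u64)(resource_size_t)base != base, so
	 * create_simplefb() returns -EINVAL rather than mapping a truncated
	 * address.
	 */
	(void)base;
}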
+ */ + base = si->lfb_base; + if (si->capabilities & VIDEO_CAPABILITY_64BIT_BASE) + base |= (u64)si->ext_lfb_base << 32; + if (!base || (u64)(resource_size_t)base != base) { + printk(KERN_DEBUG "sysfb: inaccessible VRAM base\n"); + return -EINVAL; + } + + /* + * Don't use lfb_size as IORESOURCE size, since it may contain the + * entire VMEM, and thus require huge mappings. Use just the part we + * need, that is, the part where the framebuffer is located. But verify + * that it does not exceed the advertised VMEM. + * Note that in case of VBE, the lfb_size is shifted by 16 bits for + * historical reasons. + */ + size = si->lfb_size; + if (si->orig_video_isVGA == VIDEO_TYPE_VLFB) + size <<= 16; + length = mode->height * mode->stride; + length = PAGE_ALIGN(length); + if (length > size) { printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n"); return -EINVAL; } @@ -81,8 +104,8 @@ __init int create_simplefb(const struct screen_info *si, memset(&res, 0, sizeof(res)); res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; res.name = simplefb_resname; - res.start = si->lfb_base; - res.end = si->lfb_base + len - 1; + res.start = base; + res.end = res.start + length - 1; if (res.end <= res.start) return -EINVAL; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bd4e3d4d3625..bf0c6d049080 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -853,6 +853,8 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { + unsigned long cr0; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_MATH_EMULATION @@ -866,10 +868,20 @@ do_device_not_available(struct pt_regs *regs, long error_code) return; } #endif - fpu__restore(¤t->thread.fpu); /* interrupts still off */ -#ifdef CONFIG_X86_32 - cond_local_irq_enable(regs); -#endif + + /* This should not happen. */ + cr0 = read_cr0(); + if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { + /* Try to fix it up and carry on. */ + write_cr0(cr0 & ~X86_CR0_TS); + } else { + /* + * Something terrible happened, and we're better off trying + * to kill the task than getting stuck in a never-ending + * loop of #NM faults. + */ + die("unexpected #NM exception", regs, error_code); + } } NOKPROBE_SYMBOL(do_device_not_available); diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index a2456d4d286a..ea7b7f9a3b9e 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -14,13 +14,55 @@ unsigned long unwind_get_return_address(struct unwind_state *state) if (unwind_done(state)) return 0; + if (state->regs && user_mode(state->regs)) + return 0; + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, addr_p); - return __kernel_text_address(addr) ? 
addr : 0; + if (!__kernel_text_address(addr)) { + printk_deferred_once(KERN_WARNING + "WARNING: unrecognized kernel stack return address %p at %p in %s:%d\n", + (void *)addr, addr_p, state->task->comm, + state->task->pid); + return 0; + } + + return addr; } EXPORT_SYMBOL_GPL(unwind_get_return_address); +static size_t regs_size(struct pt_regs *regs) +{ + /* x86_32 regs from kernel mode are two words shorter: */ + if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs)) + return sizeof(*regs) - 2*sizeof(long); + + return sizeof(*regs); +} + +static bool is_last_task_frame(struct unwind_state *state) +{ + unsigned long bp = (unsigned long)state->bp; + unsigned long regs = (unsigned long)task_pt_regs(state->task); + + return bp == regs - FRAME_HEADER_SIZE; +} + +/* + * This determines if the frame pointer actually contains an encoded pointer to + * pt_regs on the stack. See ENCODE_FRAME_POINTER. + */ +static struct pt_regs *decode_frame_pointer(unsigned long *bp) +{ + unsigned long regs = (unsigned long)bp; + + if (!(regs & 0x1)) + return NULL; + + return (struct pt_regs *)(regs & ~0x1); +} + static bool update_stack_state(struct unwind_state *state, void *addr, size_t len) { @@ -43,26 +85,117 @@ static bool update_stack_state(struct unwind_state *state, void *addr, bool unwind_next_frame(struct unwind_state *state) { - unsigned long *next_bp; + struct pt_regs *regs; + unsigned long *next_bp, *next_frame; + size_t next_len; + enum stack_type prev_type = state->stack_info.type; if (unwind_done(state)) return false; - next_bp = (unsigned long *)*state->bp; + /* have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto the_end; + + if (is_last_task_frame(state)) { + regs = task_pt_regs(state->task); + + /* + * kthreads (other than the boot CPU's idle thread) have some + * partial regs at the end of their stack which were placed + * there by copy_thread_tls(). But the regs don't have any + * useful information, so we can skip them. + * + * This user_mode() check is slightly broader than a PF_KTHREAD + * check because it also catches the awkward situation where a + * newly forked kthread transitions into a user task by calling + * do_execve(), which eventually clears PF_KTHREAD. + */ + if (!user_mode(regs)) + goto the_end; + + /* + * We're almost at the end, but not quite: there's still the + * syscall regs frame. Entry code doesn't encode the regs + * pointer for syscalls, so we have to set it manually. + */ + state->regs = regs; + state->bp = NULL; + return true; + } + + /* get the next frame pointer */ + if (state->regs) + next_bp = (unsigned long *)state->regs->bp; + else + next_bp = (unsigned long *)*state->bp; + + /* is the next frame pointer an encoded pointer to pt_regs? */ + regs = decode_frame_pointer(next_bp); + if (regs) { + next_frame = (unsigned long *)regs; + next_len = sizeof(*regs); + } else { + next_frame = next_bp; + next_len = FRAME_HEADER_SIZE; + } /* make sure the next frame's data is accessible */ - if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE)) - return false; + if (!update_stack_state(state, next_frame, next_len)) { + /* + * Don't warn on bad regs->bp. An interrupt in entry code + * might cause a false positive warning. 
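The decode_frame_pointer() helper above is one half of a one-bit tagging scheme. A minimal sketch of both directions follows; the encode side is an assumption about what ENCODE_FRAME_POINTER in the entry code is expected to do, since this patch only shows the decoder:

/*
 * pt_regs is word aligned, so bit 0 of a pointer to it is always clear and
 * can carry a flag: set it when the saved frame pointer actually points at
 * a pt_regs block rather than at a two-word frame header.
 */
static inline unsigned long encode_frame_pointer_sketch(struct pt_regs *regs)
{
	return (unsigned long)regs | 0x1;
}

static inline struct pt_regs *decode_frame_pointer_sketch(unsigned long bp)
{
	if (!(bp & 0x1))
		return NULL;	/* ordinary frame pointer */

	return (struct pt_regs *)(bp & ~0x1UL);
}

The tag is free because a real frame pointer is always at least word aligned, so a set low bit can never be mistaken for a normal saved %bp.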
+ */ + if (state->regs) + goto the_end; + + goto bad_address; + } + + /* Make sure it only unwinds up and doesn't overlap the last frame: */ + if (state->stack_info.type == prev_type) { + if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs)) + goto bad_address; + + if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE) + goto bad_address; + } /* move to the next frame */ - state->bp = next_bp; + if (regs) { + state->regs = regs; + state->bp = NULL; + } else { + state->bp = next_bp; + state->regs = NULL; + } + return true; + +bad_address: + if (state->regs) { + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", + state->regs, state->task->comm, + state->task->pid, next_frame); + } else { + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", + state->bp, state->task->comm, + state->task->pid, next_frame); + } +the_end: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; } EXPORT_SYMBOL_GPL(unwind_next_frame); void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { + unsigned long *bp, *frame; + size_t len; + memset(state, 0, sizeof(*state)); state->task = task; @@ -73,12 +206,22 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, } /* set up the starting stack frame */ - state->bp = get_frame_pointer(task, regs); + bp = get_frame_pointer(task, regs); + regs = decode_frame_pointer(bp); + if (regs) { + state->regs = regs; + frame = (unsigned long *)regs; + len = sizeof(*regs); + } else { + state->bp = bp; + frame = bp; + len = FRAME_HEADER_SIZE; + } /* initialize stack info and make sure the frame data is accessible */ - get_stack_info(state->bp, state->task, &state->stack_info, + get_stack_info(frame, state->task, &state->stack_info, &state->stack_mask); - update_stack_state(state, state->bp, FRAME_HEADER_SIZE); + update_stack_state(state, frame, len); /* * The caller can provide the address of the first frame directly diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 9298993dc8b7..22881ddcbb9f 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -7,11 +7,15 @@ unsigned long unwind_get_return_address(struct unwind_state *state) { + unsigned long addr; + if (unwind_done(state)) return 0; + addr = READ_ONCE_NOCHECK(*state->sp); + return ftrace_graph_ret_addr(state->task, &state->graph_idx, - *state->sp, state->sp); + addr, state->sp); } EXPORT_SYMBOL_GPL(unwind_get_return_address); @@ -23,9 +27,12 @@ bool unwind_next_frame(struct unwind_state *state) return false; do { - for (state->sp++; state->sp < info->end; state->sp++) - if (__kernel_text_address(*state->sp)) + for (state->sp++; state->sp < info->end; state->sp++) { + unsigned long addr = READ_ONCE_NOCHECK(*state->sp); + + if (__kernel_text_address(addr)) return true; + } state->sp = info->next_sp; @@ -47,7 +54,14 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, get_stack_info(first_frame, state->task, &state->stack_info, &state->stack_mask); - if (!__kernel_text_address(*first_frame)) + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. 
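For context on how the state machine above is consumed: callers drive it through the wrappers in asm/unwind.h. A minimal sketch of such a loop, assuming a kernel context and modeled on the stack dumping code:

static void print_return_addresses_sketch(struct task_struct *task,
					  struct pt_regs *regs)
{
	struct unwind_state state;
	unsigned long addr;

	for (unwind_start(&state, task, regs, NULL);
	     !unwind_done(&state);
	     unwind_next_frame(&state)) {
		addr = unwind_get_return_address(&state);
		if (!addr)
			break;
		printk(KERN_INFO "  %pS\n", (void *)addr);
	}
}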
+ */ + if (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + !__kernel_text_address(*first_frame))) unwind_next_frame(state); } EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index dbf67f64d5ec..e79f15f108a8 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -91,10 +91,10 @@ SECTIONS /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; + _stext = .; /* bootstrapping code */ HEAD_TEXT . = ALIGN(8); - _stext = .; TEXT_TEXT SCHED_TEXT CPUIDLE_TEXT |
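The vmlinux.lds.S hunk moves _stext so that it now precedes HEAD_TEXT. The visible effect is on range tests built from these linker symbols; a simplified sketch of such a test follows (modeled on core_kernel_text(), which additionally handles init text), and presumably the bootstrapping code now falls inside this range:

extern char _stext[], _etext[];

static inline bool in_core_text_sketch(unsigned long addr)
{
	return addr >= (unsigned long)_stext &&
	       addr <  (unsigned long)_etext;
}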