Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 1
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 3
-rw-r--r--  arch/x86/kernel/amd_nb.c | 164
-rw-r--r--  arch/x86/kernel/apic/apic.c | 4
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 1
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 4
-rw-r--r--  arch/x86/kernel/apm_32.c | 9
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 55
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 26
-rw-r--r--  arch/x86/kernel/cpu/bugs_64.c | 33
-rw-r--r--  arch/x86/kernel/cpu/common.c | 93
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 200
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 357
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 41
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 57
-rw-r--r--  arch/x86/kernel/cpu/mcheck/threshold.c | 1
-rw-r--r--  arch/x86/kernel/cpu/microcode/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 421
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 83
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 834
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel_lib.c | 184
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 1
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 57
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 86
-rw-r--r--  arch/x86/kernel/cpuid.c | 73
-rw-r--r--  arch/x86/kernel/dumpstack.c | 70
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 56
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 79
-rw-r--r--  arch/x86/kernel/fpu/bugs.c | 7
-rw-r--r--  arch/x86/kernel/fpu/core.c | 90
-rw-r--r--  arch/x86/kernel/fpu/init.c | 107
-rw-r--r--  arch/x86/kernel/fpu/signal.c | 8
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 11
-rw-r--r--  arch/x86/kernel/head_32.S | 58
-rw-r--r--  arch/x86/kernel/head_64.S | 60
-rw-r--r--  arch/x86/kernel/irq.c | 1
-rw-r--r--  arch/x86/kernel/irq_64.c | 1
-rw-r--r--  arch/x86/kernel/itmt.c | 215
-rw-r--r--  arch/x86/kernel/kvm.c | 20
-rw-r--r--  arch/x86/kernel/ldt.c | 12
-rw-r--r--  arch/x86/kernel/mcount_64.S | 3
-rw-r--r--  arch/x86/kernel/msr.c | 69
-rw-r--r--  arch/x86/kernel/paravirt-spinlocks.c | 14
-rw-r--r--  arch/x86/kernel/paravirt.c | 1
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c | 14
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c | 14
-rw-r--r--  arch/x86/kernel/process.c | 149
-rw-r--r--  arch/x86/kernel/process_32.c | 13
-rw-r--r--  arch/x86/kernel/process_64.c | 19
-rw-r--r--  arch/x86/kernel/quirks.c | 3
-rw-r--r--  arch/x86/kernel/setup.c | 7
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 3
-rw-r--r--  arch/x86/kernel/smp.c | 2
-rw-r--r--  arch/x86/kernel/smpboot.c | 57
-rw-r--r--  arch/x86/kernel/sysfb_simplefb.c | 39
-rw-r--r--  arch/x86/kernel/traps.c | 20
-rw-r--r--  arch/x86/kernel/unwind_frame.c | 161
-rw-r--r--  arch/x86/kernel/unwind_guess.c | 22
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 2
61 files changed, 2223 insertions, 1980 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 79076d75bdbf..05110c1097ae 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
+obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
ifdef CONFIG_FRAME_POINTER
obj-y += unwind_frame.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8a5abaa7d453..4764fa56924d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -76,6 +76,7 @@ int acpi_fix_pin2_polarity __initdata;
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#endif
+#ifdef CONFIG_X86_IO_APIC
/*
* Locks related to IOAPIC hotplug
* Hotplug side:
@@ -88,6 +89,7 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
* ->ioapic_lock
*/
static DEFINE_MUTEX(acpi_ioapic_lock);
+#endif
/* --------------------------------------------------------------------------
Boot-time Configuration
@@ -454,6 +456,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+ acpi_penalize_sci_irq(bus_irq, trigger, polarity);
/*
* stash over-ride to indicate we've been here
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4fdf6230d93c..458da8509b75 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -13,8 +13,20 @@
#include <linux/spinlock.h>
#include <asm/amd_nb.h>
+#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
+#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
+#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
+
+/* Protect the PCI config register pairs used for SMN and DF indirect access. */
+static DEFINE_MUTEX(smn_mutex);
+
static u32 *flush_words;
+static const struct pci_device_id amd_root_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
+ {}
+};
+
const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
@@ -24,9 +36,10 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{}
};
-EXPORT_SYMBOL(amd_nb_misc_ids);
+EXPORT_SYMBOL_GPL(amd_nb_misc_ids);
static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
@@ -34,6 +47,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
{}
};
@@ -44,8 +58,25 @@ const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
{ }
};
-struct amd_northbridge_info amd_northbridges;
-EXPORT_SYMBOL(amd_northbridges);
+static struct amd_northbridge_info amd_northbridges;
+
+u16 amd_nb_num(void)
+{
+ return amd_northbridges.num;
+}
+EXPORT_SYMBOL_GPL(amd_nb_num);
+
+bool amd_nb_has_feature(unsigned int feature)
+{
+ return ((amd_northbridges.flags & feature) == feature);
+}
+EXPORT_SYMBOL_GPL(amd_nb_has_feature);
+
+struct amd_northbridge *node_to_amd_nb(int node)
+{
+ return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
+}
+EXPORT_SYMBOL_GPL(node_to_amd_nb);
static struct pci_dev *next_northbridge(struct pci_dev *dev,
const struct pci_device_id *ids)
@@ -58,13 +89,106 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev,
return dev;
}
+static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write)
+{
+ struct pci_dev *root;
+ int err = -ENODEV;
+
+ if (node >= amd_northbridges.num)
+ goto out;
+
+ root = node_to_amd_nb(node)->root;
+ if (!root)
+ goto out;
+
+ mutex_lock(&smn_mutex);
+
+ err = pci_write_config_dword(root, 0x60, address);
+ if (err) {
+ pr_warn("Error programming SMN address 0x%x.\n", address);
+ goto out_unlock;
+ }
+
+ err = (write ? pci_write_config_dword(root, 0x64, *value)
+ : pci_read_config_dword(root, 0x64, value));
+ if (err)
+ pr_warn("Error %s SMN address 0x%x.\n",
+ (write ? "writing to" : "reading from"), address);
+
+out_unlock:
+ mutex_unlock(&smn_mutex);
+
+out:
+ return err;
+}
+
+int amd_smn_read(u16 node, u32 address, u32 *value)
+{
+ return __amd_smn_rw(node, address, value, false);
+}
+EXPORT_SYMBOL_GPL(amd_smn_read);
+
+int amd_smn_write(u16 node, u32 address, u32 value)
+{
+ return __amd_smn_rw(node, address, &value, true);
+}
+EXPORT_SYMBOL_GPL(amd_smn_write);
+
+/*
+ * Data Fabric Indirect Access uses FICAA/FICAD.
+ *
+ * Fabric Indirect Configuration Access Address (FICAA): Constructed based
+ * on the device's Instance Id and the PCI function and register offset of
+ * the desired register.
+ *
+ * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
+ * and FICAD HI registers but so far we only need the LO register.
+ */
+int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
+{
+ struct pci_dev *F4;
+ u32 ficaa;
+ int err = -ENODEV;
+
+ if (node >= amd_northbridges.num)
+ goto out;
+
+ F4 = node_to_amd_nb(node)->link;
+ if (!F4)
+ goto out;
+
+ ficaa = 1;
+ ficaa |= reg & 0x3FC;
+ ficaa |= (func & 0x7) << 11;
+ ficaa |= instance_id << 16;
+
+ mutex_lock(&smn_mutex);
+
+ err = pci_write_config_dword(F4, 0x5C, ficaa);
+ if (err) {
+ pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
+ goto out_unlock;
+ }
+
+ err = pci_read_config_dword(F4, 0x98, lo);
+ if (err)
+ pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
+
+out_unlock:
+ mutex_unlock(&smn_mutex);
+
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(amd_df_indirect_read);
+
int amd_cache_northbridges(void)
{
u16 i = 0;
struct amd_northbridge *nb;
- struct pci_dev *misc, *link;
+ struct pci_dev *root, *misc, *link;
- if (amd_nb_num())
+ if (amd_northbridges.num)
return 0;
misc = NULL;
@@ -74,15 +198,17 @@ int amd_cache_northbridges(void)
if (!i)
return -ENODEV;
- nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
+ nb = kcalloc(i, sizeof(struct amd_northbridge), GFP_KERNEL);
if (!nb)
return -ENOMEM;
amd_northbridges.nb = nb;
amd_northbridges.num = i;
- link = misc = NULL;
- for (i = 0; i != amd_nb_num(); i++) {
+ link = misc = root = NULL;
+ for (i = 0; i != amd_northbridges.num; i++) {
+ node_to_amd_nb(i)->root = root =
+ next_northbridge(root, amd_root_ids);
node_to_amd_nb(i)->misc = misc =
next_northbridge(misc, amd_nb_misc_ids);
node_to_amd_nb(i)->link = link =
@@ -139,13 +265,13 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
{
u32 address;
u64 base, msr;
- unsigned segn_busn_bits;
+ unsigned int segn_busn_bits;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return NULL;
/* assume all cpus from fam10h have mmconfig */
- if (boot_cpu_data.x86 < 0x10)
+ if (boot_cpu_data.x86 < 0x10)
return NULL;
address = MSR_FAM10H_MMIO_CONF_BASE;
@@ -226,14 +352,14 @@ static void amd_cache_gart(void)
if (!amd_nb_has_feature(AMD_NB_GART))
return;
- flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
+ flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL);
if (!flush_words) {
amd_northbridges.flags &= ~AMD_NB_GART;
pr_notice("Cannot initialize GART flush words, GART support disabled\n");
return;
}
- for (i = 0; i != amd_nb_num(); i++)
+ for (i = 0; i != amd_northbridges.num; i++)
pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]);
}
@@ -246,18 +372,20 @@ void amd_flush_garts(void)
if (!amd_nb_has_feature(AMD_NB_GART))
return;
- /* Avoid races between AGP and IOMMU. In theory it's not needed
- but I'm not sure if the hardware won't lose flush requests
- when another is pending. This whole thing is so expensive anyways
- that it doesn't matter to serialize more. -AK */
+ /*
+ * Avoid races between AGP and IOMMU. In theory it's not needed
+ * but I'm not sure if the hardware won't lose flush requests
+ * when another is pending. This whole thing is so expensive anyways
+ * that it doesn't matter to serialize more. -AK
+ */
spin_lock_irqsave(&gart_lock, flags);
flushed = 0;
- for (i = 0; i < amd_nb_num(); i++) {
+ for (i = 0; i < amd_northbridges.num; i++) {
pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
flush_words[i] | 1);
flushed++;
}
- for (i = 0; i < amd_nb_num(); i++) {
+ for (i = 0; i < amd_northbridges.num; i++) {
u32 w;
/* Make sure the hardware actually executed the flush*/
for (;;) {
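The amd_smn_read()/amd_smn_write() and amd_df_indirect_read() helpers added above are exported for other drivers. A minimal sketch of a consumer, not part of the patch: the SMN offset 0x5012c is a made-up placeholder, while the DF read mirrors the D18F0x110 (DramBaseAddress) access done later in mce_amd.c.

#include <linux/printk.h>
#include <asm/amd_nb.h>

/* Illustrative consumer of the new accessors -- not part of the patch. */
static int example_read_node0(void)
{
	u32 smn_val, df_lo;
	int err;

	err = amd_smn_read(0, 0x5012c, &smn_val);	/* placeholder SMN address */
	if (err)
		return err;

	/* D18F0x110 (DramBaseAddress), function 0, instance 0 */
	err = amd_df_indirect_read(0, 0, 0x110, 0, &df_lo);
	if (err)
		return err;

	pr_info("SMN 0x5012c: 0x%x, DF D18F0x110: 0x%x\n", smn_val, df_lo);
	return 0;
}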
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 88c657b057e2..bb47e5eacd44 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -48,7 +48,6 @@
#include <asm/io_apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
-#include <asm/idle.h>
#include <asm/mtrr.h>
#include <asm/time.h>
#include <asm/smp.h>
@@ -894,11 +893,13 @@ void __init setup_boot_APIC_clock(void)
/* Setup the lapic or request the broadcast */
setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
}
void setup_secondary_APIC_clock(void)
{
setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
}
/*
@@ -2263,6 +2264,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
/* Should happen once for each apic */
WARN_ON((*drv)->eoi_write == eoi_write);
+ (*drv)->native_eoi_write = (*drv)->eoi_write;
(*drv)->eoi_write = eoi_write;
}
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 48e6d84f173e..945e512a112a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -48,7 +48,6 @@
#include <linux/bootmem.h>
#include <asm/irqdomain.h>
-#include <asm/idle.h>
#include <asm/io.h>
#include <asm/smp.h>
#include <asm/cpu.h>
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index aeef53ce93e1..35690a168cf7 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -815,9 +815,9 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
l = li;
}
addr1 = (base << shift) +
- f * (unsigned long)(1 << m_io);
+ f * (1ULL << m_io);
addr2 = (base << shift) +
- (l + 1) * (unsigned long)(1 << m_io);
+ (l + 1) * (1ULL << m_io);
pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n",
id, fi, li, lnasid, addr1, addr2);
if (max_io < l)
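The two address calculations above replace (unsigned long)(1 << m_io) with (1ULL << m_io). A short illustration of the difference, based on C's integer-promotion rules rather than anything stated in the patch: the shift of a plain 1 is performed in 32-bit int, and the cast only widens the already-overflowed result, which matters once m_io can reach 31 or more.

#include <linux/types.h>

static u64 example_widths(unsigned int m_io)	/* e.g. m_io = 33 */
{
	u64 bad  = (u64)(1 << m_io);	/* shift evaluated in 32-bit int: overflows before the cast */
	u64 good = 1ULL << m_io;	/* shift evaluated in 64 bits: 0x200000000 for m_io == 33 */

	return good - bad;		/* non-zero whenever the narrow shift went wrong */
}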
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index c7364bd633e1..643818a7688b 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -906,14 +906,14 @@ static int apm_cpu_idle(struct cpuidle_device *dev,
static int use_apm_idle; /* = 0 */
static unsigned int last_jiffies; /* = 0 */
static unsigned int last_stime; /* = 0 */
- cputime_t stime;
+ cputime_t stime, utime;
int apm_idle_done = 0;
unsigned int jiffies_since_last_check = jiffies - last_jiffies;
unsigned int bucket;
recalc:
- task_cputime(current, NULL, &stime);
+ task_cputime(current, &utime, &stime);
if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
use_apm_idle = 0;
} else if (jiffies_since_last_check > idle_period) {
@@ -1042,8 +1042,11 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
if (apm_info.get_power_status_broken)
return APM_32_UNSUPPORTED;
- if (apm_bios_call(&call))
+ if (apm_bios_call(&call)) {
+ if (!call.err)
+ return APM_NO_ERROR;
return call.err;
+ }
*status = call.ebx;
*bat = call.ecx;
if (apm_info.get_power_status_swabinminutes) {
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4a8697f7d4ef..33b63670bf09 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -20,13 +20,11 @@ obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
+obj-y += bugs.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
-obj-$(CONFIG_X86_32) += bugs.o
-obj-$(CONFIG_X86_64) += bugs_64.o
-
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index b81fe2d63e15..71cae73a5076 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -20,6 +20,10 @@
#include "cpu.h"
+static const int amd_erratum_383[];
+static const int amd_erratum_400[];
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+
/*
* nodes_per_socket: Stores the number of nodes per socket.
* Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
@@ -314,11 +318,30 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
smp_num_siblings = ((ebx >> 8) & 3) + 1;
c->x86_max_cores /= smp_num_siblings;
c->cpu_core_id = ebx & 0xff;
+
+ /*
+ * We may have multiple LLCs if L3 caches exist, so check if we
+ * have an L3 cache by looking at the L3 cache CPUID leaf.
+ */
+ if (cpuid_edx(0x80000006)) {
+ if (c->x86 == 0x17) {
+ /*
+ * LLC is at the core complex level.
+ * Core complex id is ApicId[3].
+ */
+ per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ } else {
+ /* LLC is at the node level. */
+ per_cpu(cpu_llc_id, cpu) = node_id;
+ }
+ }
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
node_id = value & 7;
+
+ per_cpu(cpu_llc_id, cpu) = node_id;
} else
return;
@@ -329,9 +352,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_AMD_DCM);
cus_per_node = c->x86_max_cores / nodes_per_socket;
- /* store NodeID, use llc_shared_map to store sibling info */
- per_cpu(cpu_llc_id, cpu) = node_id;
-
/* core id has to be in the [0 .. cores_per_node - 1] range */
c->cpu_core_id %= cus_per_node;
}
@@ -347,7 +367,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
#ifdef CONFIG_SMP
unsigned bits;
int cpu = smp_processor_id();
- unsigned int socket_id, core_complex_id;
bits = c->x86_coreid_bits;
/* Low order bits define the core id (index of core in socket) */
@@ -357,18 +376,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
amd_get_topology(c);
-
- /*
- * Fix percpu cpu_llc_id here as LLC topology is different
- * for Fam17h systems.
- */
- if (c->x86 != 0x17 || !cpuid_edx(0x80000006))
- return;
-
- socket_id = (c->apicid >> bits) - 1;
- core_complex_id = (c->apicid & ((1 << bits) - 1)) >> 3;
-
- per_cpu(cpu_llc_id, cpu) = (socket_id << 3) | core_complex_id;
#endif
}
@@ -589,11 +596,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
/* F16h erratum 793, CVE-2013-6885 */
if (c->x86 == 0x16 && c->x86_model <= 0xf)
msr_set_bit(MSR_AMD64_LS_CFG, 15);
-}
-static const int amd_erratum_383[];
-static const int amd_erratum_400[];
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+ /*
+ * Check whether the machine is affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the check
+ * whether the machine is affected in arch_post_acpi_init(), which
+ * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (cpu_has_amd_erratum(c, amd_erratum_400))
+ set_cpu_bug(c, X86_BUG_AMD_E400);
+}
static void init_amd_k8(struct cpuinfo_x86 *c)
{
@@ -774,9 +786,6 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
- if (cpu_has_amd_erratum(c, amd_erratum_400))
- set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
-
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
/* 3DNow or LM implies PREFETCHW */
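The topology hunks above move the Fam17h last-level-cache fixup into amd_get_topology(): when CPUID 0x80000006 reports an L3, the LLC follows the core complex, whose id starts at ApicId bit 3, while older families keep using the node id. A hedged sketch of that selection; the concrete APIC id in the comment is hypothetical.

#include <asm/processor.h>

/* Illustrative only: apicid 0x1b (0b11011) -> llc_id = 0x1b >> 3 = 3 on Fam17h. */
static u16 example_llc_id(struct cpuinfo_x86 *c, u16 node_id)
{
	/*
	 * Only relevant when an L3 is reported; otherwise the patch keeps the
	 * previously assigned per-socket value.
	 */
	if (c->x86 == 0x17)
		return c->apicid >> 3;	/* LLC per core complex, id starts at ApicId[3] */

	return node_id;			/* LLC at the node level */
}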
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bd17db15a2c1..a44ef52184df 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -16,15 +16,19 @@
#include <asm/msr.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
void __init check_bugs(void)
{
identify_boot_cpu();
-#ifndef CONFIG_SMP
- pr_info("CPU: ");
- print_cpu_info(&boot_cpu_data);
-#endif
+ if (!IS_ENABLED(CONFIG_SMP)) {
+ pr_info("CPU: ");
+ print_cpu_info(&boot_cpu_data);
+ }
+
+#ifdef CONFIG_X86_32
/*
* Check whether we are able to run this kernel safely on SMP.
*
@@ -40,4 +44,18 @@ void __init check_bugs(void)
alternative_instructions();
fpu__init_check_bugs();
+#else /* CONFIG_X86_64 */
+ alternative_instructions();
+
+ /*
+ * Make sure the first 2MB area is not mapped by huge pages
+ * There are typically fixed size MTRRs in there and overlapping
+ * MTRRs into large pages causes slow downs.
+ *
+ * Right now we don't do that with gbpages because there seems
+ * very little benefit for that case.
+ */
+ if (!direct_gbpages)
+ set_memory_4k((unsigned long)__va(0), 1);
+#endif
}
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
deleted file mode 100644
index a972ac4c7e7d..000000000000
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- * Copyright (C) 2000 SuSE
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/alternative.h>
-#include <asm/bugs.h>
-#include <asm/processor.h>
-#include <asm/mtrr.h>
-#include <asm/cacheflush.h>
-
-void __init check_bugs(void)
-{
- identify_boot_cpu();
-#if !defined(CONFIG_SMP)
- pr_info("CPU: ");
- print_cpu_info(&boot_cpu_data);
-#endif
- alternative_instructions();
-
- /*
- * Make sure the first 2MB area is not mapped by huge pages
- * There are typically fixed size MTRRs in there and overlapping
- * MTRRs into large pages causes slow downs.
- *
- * Right now we don't do that with gbpages because there seems
- * very little benefit for that case.
- */
- if (!direct_gbpages)
- set_memory_4k((unsigned long)__va(0), 1);
-}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9bd910a7dd0a..729f92ba8224 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -979,6 +979,35 @@ static void x86_init_cache_qos(struct cpuinfo_x86 *c)
}
/*
+ * The physical to logical package id mapping is initialized from the
+ * acpi/mptables information. Make sure that CPUID actually agrees with
+ * that.
+ */
+static void sanitize_package_id(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned int pkg, apicid, cpu = smp_processor_id();
+
+ apicid = apic->cpu_present_to_apicid(cpu);
+ pkg = apicid >> boot_cpu_data.x86_coreid_bits;
+
+ if (apicid != c->initial_apicid) {
+ pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x CPUID: %x\n",
+ cpu, apicid, c->initial_apicid);
+ c->initial_apicid = apicid;
+ }
+ if (pkg != c->phys_proc_id) {
+ pr_err(FW_BUG "CPU%u: Using firmware package id %u instead of %u\n",
+ cpu, pkg, c->phys_proc_id);
+ c->phys_proc_id = pkg;
+ }
+ c->logical_proc_id = topology_phys_to_logical_pkg(pkg);
+#else
+ c->logical_proc_id = 0;
+#endif
+}
+
+/*
* This does the hard work of actually picking apart the CPU stuff...
*/
static void identify_cpu(struct cpuinfo_x86 *c)
@@ -1103,8 +1132,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#ifdef CONFIG_NUMA
numa_add_cpu(smp_processor_id());
#endif
- /* The boot/hotplug time assigment got cleared, restore it */
- c->logical_proc_id = topology_phys_to_logical_pkg(c->phys_proc_id);
+ sanitize_package_id(c);
}
/*
@@ -1144,7 +1172,6 @@ void enable_sep_cpu(void)
void __init identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
- init_amd_e400_c1e_mask();
#ifdef CONFIG_X86_32
sysenter_setup();
enable_sep_cpu();
@@ -1162,51 +1189,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
mtrr_ap_init();
}
-struct msr_range {
- unsigned min;
- unsigned max;
-};
-
-static const struct msr_range msr_range_array[] = {
- { 0x00000000, 0x00000418},
- { 0xc0000000, 0xc000040b},
- { 0xc0010000, 0xc0010142},
- { 0xc0011000, 0xc001103b},
-};
-
-static void __print_cpu_msr(void)
-{
- unsigned index_min, index_max;
- unsigned index;
- u64 val;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
- index_min = msr_range_array[i].min;
- index_max = msr_range_array[i].max;
-
- for (index = index_min; index < index_max; index++) {
- if (rdmsrl_safe(index, &val))
- continue;
- pr_info(" MSR%08x: %016llx\n", index, val);
- }
- }
-}
-
-static int show_msr;
-
-static __init int setup_show_msr(char *arg)
-{
- int num;
-
- get_option(&arg, &num);
-
- if (num > 0)
- show_msr = num;
- return 1;
-}
-__setup("show_msr=", setup_show_msr);
-
static __init int setup_noclflush(char *arg)
{
setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
@@ -1240,14 +1222,6 @@ void print_cpu_info(struct cpuinfo_x86 *c)
pr_cont(", stepping: 0x%x)\n", c->x86_mask);
else
pr_cont(")\n");
-
- print_cpu_msr(c);
-}
-
-void print_cpu_msr(struct cpuinfo_x86 *c)
-{
- if (c->cpu_index < show_msr)
- __print_cpu_msr();
}
static __init int setup_disablecpuid(char *arg)
@@ -1462,11 +1436,8 @@ void cpu_init(void)
*/
cr4_init_shadow();
- /*
- * Load microcode on this cpu if a valid microcode is available.
- * This is early microcode loading procedure.
- */
- load_ucode_ap();
+ if (cpu)
+ load_ucode_ap();
t = &per_cpu(cpu_tss, cpu);
oist = &per_cpu(orig_ist, cpu);
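sanitize_package_id() above cross-checks CPUID against the APIC id that ACPI/MP tables handed out, prefers the firmware value, and derives the package id by shifting away x86_coreid_bits. A worked example with hypothetical numbers:

static unsigned int example_pkg_id(void)
{
	/* Hypothetical: 16 cores per package => x86_coreid_bits == 4. */
	unsigned int apicid = 0x23;	/* as returned by cpu_present_to_apicid() */

	return apicid >> 4;		/* 0x23 >> 4 == 2; core index is apicid & 0xf == 3 */
}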
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 631356c8cca4..c7efbcfbeda6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -311,7 +311,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e
*msg = s->msg;
s->covered = 1;
if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
- if (panic_on_oops || tolerant < 1)
+ if (tolerant < 1)
return MCE_PANIC_SEVERITY;
}
return s->sev;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a7fdf453d895..132e1ec67da0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,7 @@
#include <linux/export.h>
#include <linux/jump_label.h>
+#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
@@ -135,6 +136,9 @@ void mce_setup(struct mce *m)
m->socketid = cpu_data(m->extcpu).phys_proc_id;
m->apicid = cpu_data(m->extcpu).initial_apicid;
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+
+ if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
+ rdmsrl(MSR_PPIN, m->ppin);
}
DEFINE_PER_CPU(struct mce, injectm);
@@ -207,8 +211,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log);
static struct notifier_block mce_srao_nb;
+static atomic_t num_notifiers;
+
void mce_register_decode_chain(struct notifier_block *nb)
{
+ atomic_inc(&num_notifiers);
+
/* Ensure SRAO notifier has the highest priority in the decode chain. */
if (nb != &mce_srao_nb && nb->priority == INT_MAX)
nb->priority -= 1;
@@ -219,6 +227,8 @@ EXPORT_SYMBOL_GPL(mce_register_decode_chain);
void mce_unregister_decode_chain(struct notifier_block *nb)
{
+ atomic_dec(&num_notifiers);
+
atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
@@ -270,17 +280,17 @@ struct mca_msr_regs msr_ops = {
.misc = misc_reg
};
-static void print_mce(struct mce *m)
+static void __print_mce(struct mce *m)
{
- int ret = 0;
-
- pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
- m->extcpu, m->mcgstatus, m->bank, m->status);
+ pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
+ m->extcpu,
+ (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
+ m->mcgstatus, m->bank, m->status);
if (m->ip) {
pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
- m->cs, m->ip);
+ m->cs, m->ip);
if (m->cs == __KERNEL_CS)
print_symbol("{%s}", m->ip);
@@ -308,6 +318,13 @@ static void print_mce(struct mce *m)
pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
cpu_data(m->extcpu).microcode);
+}
+
+static void print_mce(struct mce *m)
+{
+ int ret = 0;
+
+ __print_mce(m);
/*
* Print out human-readable details about the MCE error,
@@ -569,6 +586,32 @@ static struct notifier_block mce_srao_nb = {
.priority = INT_MAX,
};
+static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ /*
+ * Run the default notifier if we have only the SRAO
+ * notifier and us registered.
+ */
+ if (atomic_read(&num_notifiers) > 2)
+ return NOTIFY_DONE;
+
+ __print_mce(m);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mce_default_nb = {
+ .notifier_call = mce_default_notifier,
+ /* lowest prio, we want it to run last. */
+ .priority = 0,
+};
+
/*
* Read ADDR and MISC registers.
*/
@@ -667,6 +710,15 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
mce_gather_info(&m, NULL);
+ /*
+ * m.tsc was set in mce_setup(). Clear it if not requested.
+ *
+ * FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid
+ * that dance.
+ */
+ if (!(flags & MCP_TIMESTAMP))
+ m.tsc = 0;
+
for (i = 0; i < mca_cfg.banks; i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
@@ -674,14 +726,12 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
m.misc = 0;
m.addr = 0;
m.bank = i;
- m.tsc = 0;
barrier();
m.status = mce_rdmsrl(msr_ops.status(i));
if (!(m.status & MCI_STATUS_VAL))
continue;
-
/*
* Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here.
@@ -696,9 +746,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
mce_read_aux(&m, i);
- if (!(flags & MCP_TIMESTAMP))
- m.tsc = 0;
-
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
@@ -1355,7 +1402,7 @@ static void mce_timer_fn(unsigned long data)
iv = __this_cpu_read(mce_next_interval);
if (mce_available(this_cpu_ptr(&cpu_info))) {
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
if (mce_intel_cmci_poll()) {
iv = mce_adjust_timer(iv);
@@ -1745,6 +1792,14 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
add_timer_on(t, cpu);
}
+static void __mcheck_cpu_setup_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+ unsigned int cpu = smp_processor_id();
+
+ setup_pinned_timer(t, mce_timer_fn, cpu);
+}
+
static void __mcheck_cpu_init_timer(void)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
@@ -1796,7 +1851,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_clear_banks();
- __mcheck_cpu_init_timer();
+ __mcheck_cpu_setup_timer();
}
/*
@@ -2138,6 +2193,7 @@ int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&mce_srao_nb);
+ mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
INIT_WORK(&mce_work, mce_process_work);
@@ -2255,8 +2311,6 @@ static struct bus_type mce_subsys = {
DEFINE_PER_CPU(struct device *, mce_device);
-void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
return container_of(attr, struct mce_bank, attr);
@@ -2409,6 +2463,10 @@ static int mce_device_create(unsigned int cpu)
if (!mce_available(&boot_cpu_data))
return -EIO;
+ dev = per_cpu(mce_device, cpu);
+ if (dev)
+ return 0;
+
dev = kzalloc(sizeof *dev, GFP_KERNEL);
if (!dev)
return -ENOMEM;
@@ -2468,28 +2526,25 @@ static void mce_device_remove(unsigned int cpu)
}
/* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
+static void mce_disable_cpu(void)
{
- unsigned long action = *(unsigned long *)h;
-
if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
- if (!(action & CPU_TASKS_FROZEN))
+ if (!cpuhp_tasks_frozen)
cmci_clear();
vendor_disable_error_reporting();
}
-static void mce_reenable_cpu(void *h)
+static void mce_reenable_cpu(void)
{
- unsigned long action = *(unsigned long *)h;
int i;
if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
- if (!(action & CPU_TASKS_FROZEN))
+ if (!cpuhp_tasks_frozen)
cmci_reenable();
for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
@@ -2499,45 +2554,43 @@ static void mce_reenable_cpu(void *h)
}
}
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
-mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+static int mce_cpu_dead(unsigned int cpu)
+{
+ mce_intel_hcpu_update(cpu);
+
+ /* intentionally ignoring frozen here */
+ if (!cpuhp_tasks_frozen)
+ cmci_rediscover();
+ return 0;
+}
+
+static int mce_cpu_online(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
struct timer_list *t = &per_cpu(mce_timer, cpu);
+ int ret;
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- mce_device_create(cpu);
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- break;
- case CPU_DEAD:
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- mce_device_remove(cpu);
- mce_intel_hcpu_update(cpu);
+ mce_device_create(cpu);
- /* intentionally ignoring frozen here */
- if (!(action & CPU_TASKS_FROZEN))
- cmci_rediscover();
- break;
- case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
- del_timer_sync(t);
- break;
- case CPU_DOWN_FAILED:
- smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
- mce_start_timer(cpu, t);
- break;
+ ret = mce_threshold_create_device(cpu);
+ if (ret) {
+ mce_device_remove(cpu);
+ return ret;
}
-
- return NOTIFY_OK;
+ mce_reenable_cpu();
+ mce_start_timer(cpu, t);
+ return 0;
}
-static struct notifier_block mce_cpu_notifier = {
- .notifier_call = mce_cpu_callback,
-};
+static int mce_cpu_pre_down(unsigned int cpu)
+{
+ struct timer_list *t = &per_cpu(mce_timer, cpu);
+
+ mce_disable_cpu();
+ del_timer_sync(t);
+ mce_threshold_remove_device(cpu);
+ mce_device_remove(cpu);
+ return 0;
+}
static __init void mce_init_banks(void)
{
@@ -2559,8 +2612,8 @@ static __init void mce_init_banks(void)
static __init int mcheck_init_device(void)
{
+ enum cpuhp_state hp_online;
int err;
- int i = 0;
if (!mce_available(&boot_cpu_data)) {
err = -EIO;
@@ -2578,23 +2631,16 @@ static __init int mcheck_init_device(void)
if (err)
goto err_out_mem;
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- err = mce_device_create(i);
- if (err) {
- /*
- * Register notifier anyway (and do not unreg it) so
- * that we don't leave undeleted timers, see notifier
- * callback above.
- */
- __register_hotcpu_notifier(&mce_cpu_notifier);
- cpu_notifier_register_done();
- goto err_device_create;
- }
- }
+ err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
+ mce_cpu_dead);
+ if (err)
+ goto err_out_mem;
- __register_hotcpu_notifier(&mce_cpu_notifier);
- cpu_notifier_register_done();
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
+ mce_cpu_online, mce_cpu_pre_down);
+ if (err < 0)
+ goto err_out_online;
+ hp_online = err;
register_syscore_ops(&mce_syscore_ops);
@@ -2607,16 +2653,10 @@ static __init int mcheck_init_device(void)
err_register:
unregister_syscore_ops(&mce_syscore_ops);
+ cpuhp_remove_state(hp_online);
-err_device_create:
- /*
- * We didn't keep track of which devices were created above, but
- * even if we had, the set of online cpus might have changed.
- * Play safe and remove for every possible cpu, since
- * mce_device_remove() will do the right thing.
- */
- for_each_possible_cpu(i)
- mce_device_remove(i);
+err_out_online:
+ cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
err_out_mem:
free_cpumask_var(mce_device_initialized);
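The hunks above convert MCE CPU hotplug handling from the old notifier to the cpuhp state machine: a static CPUHP_X86_MCE_DEAD state for the dead callback plus a dynamically allocated online state with paired online/pre_down callbacks. A generic sketch of the dynamic-state pattern, with placeholder callbacks:

#include <linux/init.h>
#include <linux/cpuhotplug.h>

static int example_online(unsigned int cpu)
{
	/* Bring per-CPU state up; a non-zero return rolls the hotplug back. */
	return 0;
}

static int example_prepare_down(unsigned int cpu)
{
	/* Quiesce per-CPU state before the CPU goes offline. */
	return 0;
}

static int __init example_hp_init(void)
{
	int state;

	/*
	 * For CPUHP_AP_ONLINE_DYN a positive return value is the allocated
	 * state number (keep it for cpuhp_remove_state()); the online
	 * callback is also invoked for CPUs that are already up.
	 */
	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/example:online",
				  example_online, example_prepare_down);
	return state < 0 ? state : 0;
}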
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 9b5403462936..ffacfdcacb85 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -24,7 +24,6 @@
#include <asm/amd_nb.h>
#include <asm/apic.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/trace/irq_vectors.h>
@@ -55,6 +54,8 @@
/* Threshold LVT offset is at MSR0xC0000410[15:12] */
#define SMCA_THR_LVT_OFF 0xF000
+static bool thresholding_en;
+
static const char * const th_names[] = {
"load_store",
"insn_fetch",
@@ -69,7 +70,12 @@ static const char * const smca_umc_block_names[] = {
"misc_umc"
};
-struct smca_bank_name smca_bank_names[] = {
+struct smca_bank_name {
+ const char *name; /* Short name for sysfs */
+ const char *long_name; /* Long name for pretty-printing */
+};
+
+static struct smca_bank_name smca_names[] = {
[SMCA_LS] = { "load_store", "Load Store Unit" },
[SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
[SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
@@ -84,9 +90,25 @@ struct smca_bank_name smca_bank_names[] = {
[SMCA_PSP] = { "psp", "Platform Security Processor" },
[SMCA_SMU] = { "smu", "System Management Unit" },
};
-EXPORT_SYMBOL_GPL(smca_bank_names);
-static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
+const char *smca_get_name(enum smca_bank_types t)
+{
+ if (t >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ return smca_names[t].name;
+}
+
+const char *smca_get_long_name(enum smca_bank_types t)
+{
+ if (t >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ return smca_names[t].long_name;
+}
+EXPORT_SYMBOL_GPL(smca_get_long_name);
+
+static struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype, xec_bitmap } */
/* ZN Core (HWID=0xB0) MCA types */
@@ -116,7 +138,7 @@ static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
{ SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
};
-struct smca_bank_info smca_banks[MAX_NR_BANKS];
+struct smca_bank smca_banks[MAX_NR_BANKS];
EXPORT_SYMBOL_GPL(smca_banks);
/*
@@ -142,35 +164,34 @@ static void default_deferred_error_interrupt(void)
}
void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
-/*
- * CPU Initialization
- */
-
static void get_smca_bank_info(unsigned int bank)
{
unsigned int i, hwid_mcatype, cpu = smp_processor_id();
- struct smca_hwid_mcatype *type;
- u32 high, instanceId;
- u16 hwid, mcatype;
+ struct smca_hwid *s_hwid;
+ u32 high, instance_id;
/* Collect bank_info using CPU 0 for now. */
if (cpu)
return;
- if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) {
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
}
- hwid = high & MCI_IPID_HWID;
- mcatype = (high & MCI_IPID_MCATYPE) >> 16;
- hwid_mcatype = HWID_MCATYPE(hwid, mcatype);
+ hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
+ (high & MCI_IPID_MCATYPE) >> 16);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
- type = &smca_hwid_mcatypes[i];
- if (hwid_mcatype == type->hwid_mcatype) {
- smca_banks[bank].type = type;
- smca_banks[bank].type_instance = instanceId;
+ s_hwid = &smca_hwid_mcatypes[i];
+ if (hwid_mcatype == s_hwid->hwid_mcatype) {
+
+ WARN(smca_banks[bank].hwid,
+ "Bank %s already initialized!\n",
+ smca_get_name(s_hwid->bank_type));
+
+ smca_banks[bank].hwid = s_hwid;
+ smca_banks[bank].id = instance_id;
break;
}
}
@@ -533,6 +554,206 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
deferred_error_interrupt_enable(c);
}
+int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
+{
+ u64 dram_base_addr, dram_limit_addr, dram_hole_base;
+ /* We start from the normalized address */
+ u64 ret_addr = norm_addr;
+
+ u32 tmp;
+
+ u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
+ u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
+ u8 intlv_addr_sel, intlv_addr_bit;
+ u8 num_intlv_bits, hashed_bit;
+ u8 lgcy_mmio_hole_en, base = 0;
+ u8 cs_mask, cs_id = 0;
+ bool hash_enabled = false;
+
+ /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
+ if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
+ goto out_err;
+
+ /* Remove HiAddrOffset from normalized address, if enabled: */
+ if (tmp & BIT(0)) {
+ u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
+
+ if (norm_addr >= hi_addr_offset) {
+ ret_addr -= hi_addr_offset;
+ base = 1;
+ }
+ }
+
+ /* Read D18F0x110 (DramBaseAddress). */
+ if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
+ goto out_err;
+
+ /* Check if address range is valid. */
+ if (!(tmp & BIT(0))) {
+ pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
+ __func__, tmp);
+ goto out_err;
+ }
+
+ lgcy_mmio_hole_en = tmp & BIT(1);
+ intlv_num_chan = (tmp >> 4) & 0xF;
+ intlv_addr_sel = (tmp >> 8) & 0x7;
+ dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16;
+
+ /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
+ if (intlv_addr_sel > 3) {
+ pr_err("%s: Invalid interleave address select %d.\n",
+ __func__, intlv_addr_sel);
+ goto out_err;
+ }
+
+ /* Read D18F0x114 (DramLimitAddress). */
+ if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
+ goto out_err;
+
+ intlv_num_sockets = (tmp >> 8) & 0x1;
+ intlv_num_dies = (tmp >> 10) & 0x3;
+ dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
+
+ intlv_addr_bit = intlv_addr_sel + 8;
+
+ /* Re-use intlv_num_chan by setting it equal to log2(#channels) */
+ switch (intlv_num_chan) {
+ case 0: intlv_num_chan = 0; break;
+ case 1: intlv_num_chan = 1; break;
+ case 3: intlv_num_chan = 2; break;
+ case 5: intlv_num_chan = 3; break;
+ case 7: intlv_num_chan = 4; break;
+
+ case 8: intlv_num_chan = 1;
+ hash_enabled = true;
+ break;
+ default:
+ pr_err("%s: Invalid number of interleaved channels %d.\n",
+ __func__, intlv_num_chan);
+ goto out_err;
+ }
+
+ num_intlv_bits = intlv_num_chan;
+
+ if (intlv_num_dies > 2) {
+ pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
+ __func__, intlv_num_dies);
+ goto out_err;
+ }
+
+ num_intlv_bits += intlv_num_dies;
+
+ /* Add a bit if sockets are interleaved. */
+ num_intlv_bits += intlv_num_sockets;
+
+ /* Assert num_intlv_bits <= 4 */
+ if (num_intlv_bits > 4) {
+ pr_err("%s: Invalid interleave bits %d.\n",
+ __func__, num_intlv_bits);
+ goto out_err;
+ }
+
+ if (num_intlv_bits > 0) {
+ u64 temp_addr_x, temp_addr_i, temp_addr_y;
+ u8 die_id_bit, sock_id_bit, cs_fabric_id;
+
+ /*
+ * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
+ * This is the fabric id for this coherent slave. Use
+ * umc/channel# as instance id of the coherent slave
+ * for FICAA.
+ */
+ if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
+ goto out_err;
+
+ cs_fabric_id = (tmp >> 8) & 0xFF;
+ die_id_bit = 0;
+
+ /* If interleaved over more than 1 channel: */
+ if (intlv_num_chan) {
+ die_id_bit = intlv_num_chan;
+ cs_mask = (1 << die_id_bit) - 1;
+ cs_id = cs_fabric_id & cs_mask;
+ }
+
+ sock_id_bit = die_id_bit;
+
+ /* Read D18F1x208 (SystemFabricIdMask). */
+ if (intlv_num_dies || intlv_num_sockets)
+ if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
+ goto out_err;
+
+ /* If interleaved over more than 1 die. */
+ if (intlv_num_dies) {
+ sock_id_bit = die_id_bit + intlv_num_dies;
+ die_id_shift = (tmp >> 24) & 0xF;
+ die_id_mask = (tmp >> 8) & 0xFF;
+
+ cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
+ }
+
+ /* If interleaved over more than 1 socket. */
+ if (intlv_num_sockets) {
+ socket_id_shift = (tmp >> 28) & 0xF;
+ socket_id_mask = (tmp >> 16) & 0xFF;
+
+ cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
+ }
+
+ /*
+ * The pre-interleaved address consists of XXXXXXIIIYYYYY
+ * where III is the ID for this CS, and XXXXXXYYYYY are the
+ * address bits from the post-interleaved address.
+ * "num_intlv_bits" has been calculated to tell us how many "I"
+ * bits there are. "intlv_addr_bit" tells us how many "Y" bits
+ * there are (where "I" starts).
+ */
+ temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
+ temp_addr_i = (cs_id << intlv_addr_bit);
+ temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
+ ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
+ }
+
+ /* Add dram base address */
+ ret_addr += dram_base_addr;
+
+ /* If legacy MMIO hole enabled */
+ if (lgcy_mmio_hole_en) {
+ if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
+ goto out_err;
+
+ dram_hole_base = tmp & GENMASK(31, 24);
+ if (ret_addr >= dram_hole_base)
+ ret_addr += (BIT_ULL(32) - dram_hole_base);
+ }
+
+ if (hash_enabled) {
+ /* Save some parentheses and grab ls-bit at the end. */
+ hashed_bit = (ret_addr >> 12) ^
+ (ret_addr >> 18) ^
+ (ret_addr >> 21) ^
+ (ret_addr >> 30) ^
+ cs_id;
+
+ hashed_bit &= BIT(0);
+
+ if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
+ ret_addr ^= BIT(intlv_addr_bit);
+ }
+
+ /* Is calculated system address is above DRAM limit address? */
+ if (ret_addr > dram_limit_addr)
+ goto out_err;
+
+ *sys_addr = ret_addr;
+ return 0;
+
+out_err:
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
+
static void
__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
{
@@ -645,6 +866,7 @@ static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
unsigned int bank, block, cpu = smp_processor_id();
+ struct thresh_restart tr;
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
@@ -681,6 +903,11 @@ static void amd_threshold_interrupt(void)
log:
__log_error(bank, false, true, ((u64)high << 32) | low);
+
+ /* Reset threshold block after logging error. */
+ memset(&tr, 0, sizeof(tr));
+ tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
+ threshold_restart_bank(&tr);
}
/*
@@ -826,10 +1053,10 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return th_names[bank];
}
- if (!smca_banks[bank].type)
+ if (!smca_banks[bank].hwid)
return NULL;
- bank_type = smca_banks[bank].type->bank_type;
+ bank_type = smca_banks[bank].hwid->bank_type;
if (b && bank_type == SMCA_UMC) {
if (b->block < ARRAY_SIZE(smca_umc_block_names))
@@ -838,8 +1065,8 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
}
snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
- "%s_%x", smca_bank_names[bank_type].name,
- smca_banks[bank].type_instance);
+ "%s_%x", smca_get_name(bank_type),
+ smca_banks[bank].id);
return buf_mcatype;
}
@@ -1010,31 +1237,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
return err;
}
-/* create dir/files for all valid threshold banks */
-static int threshold_create_device(unsigned int cpu)
-{
- unsigned int bank;
- struct threshold_bank **bp;
- int err = 0;
-
- bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
- GFP_KERNEL);
- if (!bp)
- return -ENOMEM;
-
- per_cpu(threshold_banks, cpu) = bp;
-
- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- err = threshold_create_bank(cpu, bank);
- if (err)
- return err;
- }
-
- return err;
-}
-
static void deallocate_threshold_block(unsigned int cpu,
unsigned int bank)
{
@@ -1102,48 +1304,71 @@ free_out:
per_cpu(threshold_banks, cpu)[bank] = NULL;
}
-static void threshold_remove_device(unsigned int cpu)
+int mce_threshold_remove_device(unsigned int cpu)
{
unsigned int bank;
+ if (!thresholding_en)
+ return 0;
+
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
threshold_remove_bank(cpu, bank);
}
kfree(per_cpu(threshold_banks, cpu));
+ per_cpu(threshold_banks, cpu) = NULL;
+ return 0;
}
-/* get notified when a cpu comes on/off */
-static void
-amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
+/* create dir/files for all valid threshold banks */
+int mce_threshold_create_device(unsigned int cpu)
{
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- threshold_create_device(cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- threshold_remove_device(cpu);
- break;
- default:
- break;
+ unsigned int bank;
+ struct threshold_bank **bp;
+ int err = 0;
+
+ if (!thresholding_en)
+ return 0;
+
+ bp = per_cpu(threshold_banks, cpu);
+ if (bp)
+ return 0;
+
+ bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
+ GFP_KERNEL);
+ if (!bp)
+ return -ENOMEM;
+
+ per_cpu(threshold_banks, cpu) = bp;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ continue;
+ err = threshold_create_bank(cpu, bank);
+ if (err)
+ goto err;
}
+ return err;
+err:
+ mce_threshold_remove_device(cpu);
+ return err;
}
static __init int threshold_init_device(void)
{
unsigned lcpu = 0;
+ if (mce_threshold_vector == amd_threshold_interrupt)
+ thresholding_en = true;
+
/* to hit CPUs online before the notifier is up */
for_each_online_cpu(lcpu) {
- int err = threshold_create_device(lcpu);
+ int err = mce_threshold_create_device(lcpu);
if (err)
return err;
}
- threshold_cpu_callback = amd_64_threshold_cpu_callback;
return 0;
}
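umc_normaddr_to_sysaddr(), introduced above, converts a UMC-normalized address back into a system physical address and is exported for callers such as an EDAC driver. A minimal, hypothetical call site; the node, channel and address values are made up.

static void example_translate(void)
{
	/* Hypothetical values: node 0, UMC/channel 1, normalized address 0x1234000. */
	u64 norm_addr = 0x1234000, sys_addr;

	/* Returns 0 on success, -EINVAL otherwise. */
	if (!umc_normaddr_to_sysaddr(norm_addr, 0, 1, &sys_addr))
		pr_info("normalized 0x%llx -> system 0x%llx\n", norm_addr, sys_addr);
	else
		pr_warn("normalized 0x%llx: translation failed\n", norm_addr);
}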
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 1defb8ea882c..190b3e6cef4d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -11,6 +11,8 @@
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -130,7 +132,7 @@ bool mce_intel_cmci_poll(void)
* Reset the counter if we've logged an error in the last poll
* during the storm.
*/
- if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
+ if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
else
this_cpu_dec(cmci_backoff_cnt);
@@ -342,7 +344,7 @@ void cmci_recheck(void)
return;
local_irq_save(flags);
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+ machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
local_irq_restore(flags);
}
@@ -464,11 +466,46 @@ static void intel_clear_lmce(void)
wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
}
+static void intel_ppin_init(struct cpuinfo_x86 *c)
+{
+ unsigned long long val;
+
+ /*
+ * Even if testing the presence of the MSR would be enough, we don't
+ * want to risk the situation where other models reuse this MSR for
+ * other purposes.
+ */
+ switch (c->x86_model) {
+ case INTEL_FAM6_IVYBRIDGE_X:
+ case INTEL_FAM6_HASWELL_X:
+ case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_X:
+ case INTEL_FAM6_SKYLAKE_X:
+ if (rdmsrl_safe(MSR_PPIN_CTL, &val))
+ return;
+
+ if ((val & 3UL) == 1UL) {
+ /* PPIN available but disabled: */
+ return;
+ }
+
+ /* If PPIN is disabled, but not locked, try to enable: */
+ if (!(val & 3UL)) {
+ wrmsrl_safe(MSR_PPIN_CTL, val | 2UL);
+ rdmsrl_safe(MSR_PPIN_CTL, &val);
+ }
+
+ if ((val & 3UL) == 2UL)
+ set_cpu_cap(c, X86_FEATURE_INTEL_PPIN);
+ }
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
intel_init_lmce();
+ intel_ppin_init(c);
}
void mce_intel_feature_clear(struct cpuinfo_x86 *c)
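intel_ppin_init() above keys entirely off the low two bits of MSR_PPIN_CTL. A hedged summary of the four states as the code handles them; the lock/enable naming is my reading of the code, not quoted from the SDM.

/*
 *   MSR_PPIN_CTL[1:0]   meaning            action in intel_ppin_init()
 *   00                  unlocked, off      write bit 1 to try to enable, re-read
 *   01                  locked, off        return -- PPIN can no longer be enabled
 *   10                  unlocked, on       set X86_FEATURE_INTEL_PPIN
 *   11                  locked, on         fails the "== 2UL" test, flag stays clear
 */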
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 6b9dc4d18ccc..465aca8be009 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -26,7 +26,6 @@
#include <asm/processor.h>
#include <asm/apic.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/trace/irq_vectors.h>
@@ -271,58 +270,32 @@ static void thermal_throttle_remove_dev(struct device *dev)
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
-thermal_throttle_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int thermal_throttle_online(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
- struct device *dev;
- int err = 0;
-
- dev = get_cpu_device(cpu);
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- err = thermal_throttle_add_dev(dev, cpu);
- WARN_ON(err);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- thermal_throttle_remove_dev(dev);
- break;
- }
- return notifier_from_errno(err);
+ struct device *dev = get_cpu_device(cpu);
+
+ return thermal_throttle_add_dev(dev, cpu);
}
-static struct notifier_block thermal_throttle_cpu_notifier =
+static int thermal_throttle_offline(unsigned int cpu)
{
- .notifier_call = thermal_throttle_cpu_callback,
-};
+ struct device *dev = get_cpu_device(cpu);
+
+ thermal_throttle_remove_dev(dev);
+ return 0;
+}
static __init int thermal_throttle_init_device(void)
{
- unsigned int cpu = 0;
- int err;
+ int ret;
if (!atomic_read(&therm_throt_en))
return 0;
- cpu_notifier_register_begin();
-
- /* connect live CPUs to sysfs */
- for_each_online_cpu(cpu) {
- err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
- WARN_ON(err);
- }
-
- __register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
- cpu_notifier_register_done();
-
- return 0;
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
+ thermal_throttle_online,
+ thermal_throttle_offline);
+ return ret < 0 ? ret : 0;
}
device_initcall(thermal_throttle_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index fcf9ae9384f4..9beb092d68a5 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -6,7 +6,6 @@
#include <asm/irq_vectors.h>
#include <asm/apic.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
index 220b1a508513..ba12e8aa4a45 100644
--- a/arch/x86/kernel/cpu/microcode/Makefile
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -1,4 +1,4 @@
microcode-y := core.o
obj-$(CONFIG_MICROCODE) += microcode.o
-microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
+microcode-$(CONFIG_MICROCODE_INTEL) += intel.o
microcode-$(CONFIG_MICROCODE_AMD) += amd.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 620ab06bcf45..6f353bdb3a25 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -5,6 +5,7 @@
* CPUs and later.
*
* Copyright (C) 2008-2011 Advanced Micro Devices Inc.
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
*
* Author: Peter Oruba <peter.oruba@amd.com>
*
@@ -39,64 +40,25 @@
static struct equiv_cpu_entry *equiv_cpu_table;
-struct ucode_patch {
- struct list_head plist;
- void *data;
- u32 patch_id;
- u16 equiv_cpu;
-};
-
-static LIST_HEAD(pcache);
-
/*
* This points to the current valid container of microcode patches which we will
- * save from the initrd before jettisoning its contents.
+ * save from the initrd/builtin before jettisoning its contents.
*/
-static u8 *container;
-static size_t container_size;
-static bool ucode_builtin;
+struct container {
+ u8 *data;
+ size_t size;
+} cont;
static u32 ucode_new_rev;
static u8 amd_ucode_patch[PATCH_MAX_SIZE];
static u16 this_equiv_id;
-static struct cpio_data ucode_cpio;
-
-static struct cpio_data __init find_ucode_in_initrd(void)
-{
-#ifdef CONFIG_BLK_DEV_INITRD
- char *path;
- void *start;
- size_t size;
-
- /*
- * Microcode patch container file is prepended to the initrd in cpio
- * format. See Documentation/x86/early-microcode.txt
- */
- static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
-
-#ifdef CONFIG_X86_32
- struct boot_params *p;
-
- /*
- * On 32-bit, early load occurs before paging is turned on so we need
- * to use physical addresses.
- */
- p = (struct boot_params *)__pa_nodebug(&boot_params);
- path = (char *)__pa_nodebug(ucode_path);
- start = (void *)p->hdr.ramdisk_image;
- size = p->hdr.ramdisk_size;
-#else
- path = ucode_path;
- start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
- size = boot_params.hdr.ramdisk_size;
-#endif /* !CONFIG_X86_32 */
-
- return find_cpio_data(path, start, size, NULL);
-#else
- return (struct cpio_data){ NULL, 0, "" };
-#endif
-}
+/*
+ * Microcode patch container file is prepended to the initrd in cpio
+ * format. See Documentation/x86/early-microcode.txt
+ */
+static const char
+ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
static size_t compute_container_size(u8 *data, u32 total_size)
{
@@ -135,48 +97,48 @@ static size_t compute_container_size(u8 *data, u32 total_size)
return size;
}
+static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table,
+ unsigned int sig)
+{
+ int i = 0;
+
+ if (!equiv_cpu_table)
+ return 0;
+
+ while (equiv_cpu_table[i].installed_cpu != 0) {
+ if (sig == equiv_cpu_table[i].installed_cpu)
+ return equiv_cpu_table[i].equiv_cpu;
+
+ i++;
+ }
+ return 0;
+}
+
/*
- * Early load occurs before we can vmalloc(). So we look for the microcode
- * patch container file in initrd, traverse equivalent cpu table, look for a
- * matching microcode patch, and update, all in initrd memory in place.
- * When vmalloc() is available for use later -- on 64-bit during first AP load,
- * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
- * load_microcode_amd() to save equivalent cpu table and microcode patches in
- * kernel heap memory.
+ * This scans the ucode blob for the proper container as we can have multiple
+ * containers glued together.
*/
-static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
+static struct container
+find_proper_container(u8 *ucode, size_t size, u16 *ret_id)
{
+ struct container ret = { NULL, 0 };
+ u32 eax, ebx, ecx, edx;
struct equiv_cpu_entry *eq;
- size_t *cont_sz;
- u32 *header;
- u8 *data, **cont;
- u8 (*patch)[PATCH_MAX_SIZE];
- u16 eq_id = 0;
int offset, left;
- u32 rev, eax, ebx, ecx, edx;
- u32 *new_rev;
-
-#ifdef CONFIG_X86_32
- new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
- cont_sz = (size_t *)__pa_nodebug(&container_size);
- cont = (u8 **)__pa_nodebug(&container);
- patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
-#else
- new_rev = &ucode_new_rev;
- cont_sz = &container_size;
- cont = &container;
- patch = &amd_ucode_patch;
-#endif
+ u16 eq_id = 0;
+ u32 *header;
+ u8 *data;
data = ucode;
left = size;
header = (u32 *)data;
+
/* find equiv cpu table */
if (header[0] != UCODE_MAGIC ||
header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
header[2] == 0) /* size */
- return;
+ return ret;
eax = 0x00000001;
ecx = 0;
@@ -185,7 +147,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
while (left > 0) {
eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
- *cont = data;
+ ret.data = data;
/* Advance past the container header */
offset = header[2] + CONTAINER_HDR_SZ;
@@ -194,15 +156,15 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
eq_id = find_equiv_id(eq, eax);
if (eq_id) {
- this_equiv_id = eq_id;
- *cont_sz = compute_container_size(*cont, left + offset);
+ ret.size = compute_container_size(ret.data, left + offset);
/*
* truncate how much we need to iterate over in the
* ucode update loop below
*/
- left = *cont_sz - offset;
- break;
+ left = ret.size - offset;
+ *ret_id = eq_id;
+ return ret;
}
/*
@@ -212,6 +174,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
*/
while (left > 0) {
header = (u32 *)data;
+
if (header[0] == UCODE_MAGIC &&
header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
break;
@@ -226,14 +189,64 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
ucode = data;
}
- if (!eq_id) {
- *cont = NULL;
- *cont_sz = 0;
- return;
- }
+ return ret;
+}
+
+static int __apply_microcode_amd(struct microcode_amd *mc_amd)
+{
+ u32 rev, dummy;
+
+ native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
+
+ /* verify patch application was successful */
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+ if (rev != mc_amd->hdr.patch_id)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ */
+static struct container
+apply_microcode_early_amd(void *ucode, size_t size, bool save_patch)
+{
+ struct container ret = { NULL, 0 };
+ u8 (*patch)[PATCH_MAX_SIZE];
+ int offset, left;
+ u32 rev, *header;
+ u8 *data;
+ u16 eq_id = 0;
+ u32 *new_rev;
+
+#ifdef CONFIG_X86_32
+ new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+ patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
+#else
+ new_rev = &ucode_new_rev;
+ patch = &amd_ucode_patch;
+#endif
if (check_current_patch_level(&rev, true))
- return;
+ return (struct container){ NULL, 0 };
+
+ ret = find_proper_container(ucode, size, &eq_id);
+ if (!eq_id)
+ return (struct container){ NULL, 0 };
+
+ this_equiv_id = eq_id;
+ header = (u32 *)ret.data;
+
+ /* We're pointing to an equiv table, skip over it. */
+ data = ret.data + header[2] + CONTAINER_HDR_SZ;
+ left = ret.size - (header[2] + CONTAINER_HDR_SZ);
while (left > 0) {
struct microcode_amd *mc;
@@ -252,8 +265,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
*new_rev = rev;
if (save_patch)
- memcpy(patch, mc,
- min_t(u32, header[1], PATCH_MAX_SIZE));
+ memcpy(patch, mc, min_t(u32, header[1], PATCH_MAX_SIZE));
}
}
@@ -261,10 +273,10 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
data += offset;
left -= offset;
}
+ return ret;
}
-static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
- unsigned int family)
+static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
{
#ifdef CONFIG_X86_64
char fw_name[36] = "amd-ucode/microcode_amd.bin";
@@ -281,47 +293,45 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
void __init load_ucode_amd_bsp(unsigned int family)
{
+ struct ucode_cpu_info *uci;
struct cpio_data cp;
- bool *builtin;
- void **data;
- size_t *size;
+ const char *path;
+ bool use_pa;
-#ifdef CONFIG_X86_32
- data = (void **)__pa_nodebug(&ucode_cpio.data);
- size = (size_t *)__pa_nodebug(&ucode_cpio.size);
- builtin = (bool *)__pa_nodebug(&ucode_builtin);
-#else
- data = &ucode_cpio.data;
- size = &ucode_cpio.size;
- builtin = &ucode_builtin;
-#endif
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info);
+ path = (const char *)__pa_nodebug(ucode_path);
+ use_pa = true;
+ } else {
+ uci = ucode_cpu_info;
+ path = ucode_path;
+ use_pa = false;
+ }
- *builtin = load_builtin_amd_microcode(&cp, family);
- if (!*builtin)
- cp = find_ucode_in_initrd();
+ if (!get_builtin_microcode(&cp, family))
+ cp = find_microcode_in_initrd(path, use_pa);
if (!(cp.data && cp.size))
return;
- *data = cp.data;
- *size = cp.size;
+ /* Get BSP's CPUID.EAX(1), needed in load_microcode_amd() */
+ uci->cpu_sig.sig = cpuid_eax(1);
- apply_ucode_in_initrd(cp.data, cp.size, true);
+ apply_microcode_early_amd(cp.data, cp.size, true);
}
#ifdef CONFIG_X86_32
/*
* On 32-bit, since AP's early load occurs before paging is turned on, we
- * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
- * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
- * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
+ * cannot traverse cpu_equiv_table and microcode_cache in kernel heap memory.
+ * So during cold boot, the AP will call apply_microcode_early_amd() just like the BSP.
+ * In save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
* which is used upon resume from suspend.
*/
-void load_ucode_amd_ap(void)
+void load_ucode_amd_ap(unsigned int family)
{
struct microcode_amd *mc;
- size_t *usize;
- void **ucode;
+ struct cpio_data cp;
mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
@@ -329,60 +339,63 @@ void load_ucode_amd_ap(void)
return;
}
- ucode = (void *)__pa_nodebug(&container);
- usize = (size_t *)__pa_nodebug(&container_size);
+ if (!get_builtin_microcode(&cp, family))
+ cp = find_microcode_in_initrd((const char *)__pa_nodebug(ucode_path), true);
- if (!*ucode || !*usize)
+ if (!(cp.data && cp.size))
return;
- apply_ucode_in_initrd(*ucode, *usize, false);
-}
-
-static void __init collect_cpu_sig_on_bsp(void *arg)
-{
- unsigned int cpu = smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- uci->cpu_sig.sig = cpuid_eax(0x00000001);
-}
-
-static void __init get_bsp_sig(void)
-{
- unsigned int bsp = boot_cpu_data.cpu_index;
- struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
-
- if (!uci->cpu_sig.sig)
- smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
+ /*
+ * This would set amd_ucode_patch above so that the following APs can
+ * use it directly instead of going down this path again.
+ */
+ apply_microcode_early_amd(cp.data, cp.size, true);
}
#else
-void load_ucode_amd_ap(void)
+void load_ucode_amd_ap(unsigned int family)
{
- unsigned int cpu = smp_processor_id();
struct equiv_cpu_entry *eq;
struct microcode_amd *mc;
- u8 *cont = container;
u32 rev, eax;
u16 eq_id;
- /* Exit if called on the BSP. */
- if (!cpu)
+ /* 64-bit runs with paging enabled, thus early==false. */
+ if (check_current_patch_level(&rev, false))
return;
- if (!container)
- return;
+ /* First AP hasn't cached it yet, go through the blob. */
+ if (!cont.data) {
+ struct cpio_data cp = { NULL, 0, "" };
- /*
- * 64-bit runs with paging enabled, thus early==false.
- */
- if (check_current_patch_level(&rev, false))
- return;
+ if (cont.size == -1)
+ return;
- /* Add CONFIG_RANDOMIZE_MEMORY offset. */
- if (!ucode_builtin)
- cont += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+reget:
+ if (!get_builtin_microcode(&cp, family)) {
+#ifdef CONFIG_BLK_DEV_INITRD
+ cp = find_cpio_data(ucode_path, (void *)initrd_start,
+ initrd_end - initrd_start, NULL);
+#endif
+ if (!(cp.data && cp.size)) {
+ /*
+ * Mark it so that other APs do not scan again
+ * for no real reason and slow down boot
+ * needlessly.
+ */
+ cont.size = -1;
+ return;
+ }
+ }
+
+ cont = apply_microcode_early_amd(cp.data, cp.size, false);
+ if (!(cont.data && cont.size)) {
+ cont.size = -1;
+ return;
+ }
+ }
eax = cpuid_eax(0x00000001);
- eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ);
+ eq = (struct equiv_cpu_entry *)(cont.data + CONTAINER_HDR_SZ);
eq_id = find_equiv_id(eq, eax);
if (!eq_id)
@@ -397,61 +410,50 @@ void load_ucode_amd_ap(void)
}
} else {
- if (!ucode_cpio.data)
- return;
/*
* AP has a different equivalence ID than BSP, looks like
* mixed-steppings silicon so go through the ucode blob anew.
*/
- apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
+ goto reget;
}
}
-#endif
+#endif /* CONFIG_X86_32 */
+
+static enum ucode_state
+load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size);
-int __init save_microcode_in_initrd_amd(void)
+int __init save_microcode_in_initrd_amd(unsigned int fam)
{
- unsigned long cont;
- int retval = 0;
enum ucode_state ret;
- u8 *cont_va;
- u32 eax;
+ int retval = 0;
+ u16 eq_id;
- if (!container)
- return -EINVAL;
+ if (!cont.data) {
+ if (IS_ENABLED(CONFIG_X86_32) && (cont.size != -1)) {
+ struct cpio_data cp = { NULL, 0, "" };
-#ifdef CONFIG_X86_32
- get_bsp_sig();
- cont = (unsigned long)container;
- cont_va = __va(container);
-#else
- /*
- * We need the physical address of the container for both bitness since
- * boot_params.hdr.ramdisk_image is a physical address.
- */
- cont = __pa(container);
- cont_va = container;
+#ifdef CONFIG_BLK_DEV_INITRD
+ cp = find_cpio_data(ucode_path, (void *)initrd_start,
+ initrd_end - initrd_start, NULL);
#endif
- /*
- * Take into account the fact that the ramdisk might get relocated and
- * therefore we need to recompute the container's position in virtual
- * memory space.
- */
- if (relocated_ramdisk)
- container = (u8 *)(__va(relocated_ramdisk) +
- (cont - boot_params.hdr.ramdisk_image));
- else
- container = cont_va;
+ if (!(cp.data && cp.size)) {
+ cont.size = -1;
+ return -EINVAL;
+ }
- /* Add CONFIG_RANDOMIZE_MEMORY offset. */
- if (!ucode_builtin)
- container += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+ cont = find_proper_container(cp.data, cp.size, &eq_id);
+ if (!eq_id) {
+ cont.size = -1;
+ return -EINVAL;
+ }
- eax = cpuid_eax(0x00000001);
- eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+ } else
+ return -EINVAL;
+ }
- ret = load_microcode_amd(smp_processor_id(), eax, container, container_size);
+ ret = load_microcode_amd(smp_processor_id(), fam, cont.data, cont.size);
if (ret != UCODE_OK)
retval = -EINVAL;
@@ -459,8 +461,8 @@ int __init save_microcode_in_initrd_amd(void)
* This will be freed any msec now, stash patches for the current
* family and switch to patch cache for cpu hotplug, etc later.
*/
- container = NULL;
- container_size = 0;
+ cont.data = NULL;
+ cont.size = 0;
return retval;
}
@@ -478,8 +480,10 @@ void reload_ucode_amd(void)
return;
mc = (struct microcode_amd *)amd_ucode_patch;
+ if (!mc)
+ return;
- if (mc && rev < mc->hdr.patch_id) {
+ if (rev < mc->hdr.patch_id) {
if (!__apply_microcode_amd(mc)) {
ucode_new_rev = mc->hdr.patch_id;
pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
@@ -513,7 +517,7 @@ static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
{
struct ucode_patch *p;
- list_for_each_entry(p, &pcache, plist)
+ list_for_each_entry(p, &microcode_cache, plist)
if (p->equiv_cpu == equiv_cpu)
return p;
return NULL;
@@ -523,7 +527,7 @@ static void update_cache(struct ucode_patch *new_patch)
{
struct ucode_patch *p;
- list_for_each_entry(p, &pcache, plist) {
+ list_for_each_entry(p, &microcode_cache, plist) {
if (p->equiv_cpu == new_patch->equiv_cpu) {
if (p->patch_id >= new_patch->patch_id)
/* we already have the latest patch */
@@ -536,14 +540,14 @@ static void update_cache(struct ucode_patch *new_patch)
}
}
/* no patch found, add it */
- list_add_tail(&new_patch->plist, &pcache);
+ list_add_tail(&new_patch->plist, &microcode_cache);
}
static void free_cache(void)
{
struct ucode_patch *p, *tmp;
- list_for_each_entry_safe(p, tmp, &pcache, plist) {
+ list_for_each_entry_safe(p, tmp, &microcode_cache, plist) {
__list_del(p->plist.prev, p->plist.next);
kfree(p->data);
kfree(p);
@@ -663,21 +667,7 @@ bool check_current_patch_level(u32 *rev, bool early)
return ret;
}
-int __apply_microcode_amd(struct microcode_amd *mc_amd)
-{
- u32 rev, dummy;
-
- native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
-
- /* verify patch application was successful */
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- if (rev != mc_amd->hdr.patch_id)
- return -1;
-
- return 0;
-}
-
-int apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct microcode_amd *mc_amd;
@@ -860,7 +850,8 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
return UCODE_OK;
}
-enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
+static enum ucode_state
+load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
{
enum ucode_state ret;
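
As an aside, the container walk performed by find_proper_container() and the section-skipping loop above can be sketched in plain user-space C. This is only an illustration: the helper names are made up, the constants (UCODE_MAGIC, UCODE_EQUIV_CPU_TABLE_TYPE, CONTAINER_HDR_SZ, SECTION_HDR_SIZE) and the equiv_cpu_entry layout are assumed from this era's AMD microcode definitions, and the input is assumed to be a well-formed concatenation of containers, so no bounds hardening is included.

#include <stdint.h>
#include <stddef.h>

#define UCODE_MAGIC                     0x00414d44
#define UCODE_EQUIV_CPU_TABLE_TYPE      0x00000000
#define CONTAINER_HDR_SZ                12
#define SECTION_HDR_SIZE                8

struct equiv_cpu_entry {
        uint32_t installed_cpu;
        uint32_t fixed_errata_mask;
        uint32_t fixed_errata_compare;
        uint16_t equiv_cpu;
        uint16_t res;
} __attribute__((packed));

/* Walk the equivalence table until its terminating all-zero entry. */
static uint16_t equiv_id_for(const struct equiv_cpu_entry *eq, uint32_t sig)
{
        for (int i = 0; eq[i].installed_cpu; i++)
                if (eq[i].installed_cpu == sig)
                        return eq[i].equiv_cpu;
        return 0;
}

/*
 * Scan a blob of glued-together containers; return the equivalence ID of the
 * first container whose table knows about @sig, or 0 if none does.
 */
static uint16_t scan_containers(const uint8_t *data, size_t left, uint32_t sig)
{
        while (left > CONTAINER_HDR_SZ) {
                const uint32_t *hdr = (const uint32_t *)data;
                uint16_t id;

                if (hdr[0] != UCODE_MAGIC ||
                    hdr[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !hdr[2])
                        return 0;

                id = equiv_id_for((const struct equiv_cpu_entry *)
                                  (data + CONTAINER_HDR_SZ), sig);
                if (id)
                        return id;

                /* Skip this container's equivalence table ... */
                data += hdr[2] + CONTAINER_HDR_SZ;
                left -= hdr[2] + CONTAINER_HDR_SZ;

                /* ... and its patch sections, up to the next container header. */
                while (left > SECTION_HDR_SIZE) {
                        hdr = (const uint32_t *)data;
                        if (hdr[0] == UCODE_MAGIC &&
                            hdr[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
                                break;

                        data += hdr[1] + SECTION_HDR_SIZE;
                        left -= hdr[1] + SECTION_HDR_SIZE;
                }
        }
        return 0;
}

The kernel variant additionally records the matching container's start address and computed size (struct container above) so that load_microcode_amd() can later be handed exactly that range.
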
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 5ce5155f0695..6996413c78c3 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -3,7 +3,7 @@
*
* Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* 2006 Shaohua Li <shaohua.li@intel.com>
- * 2013-2015 Borislav Petkov <bp@alien8.de>
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
*
* X86 CPU microcode early update for Linux:
*
@@ -39,12 +39,15 @@
#include <asm/microcode.h>
#include <asm/processor.h>
#include <asm/cmdline.h>
+#include <asm/setup.h>
-#define MICROCODE_VERSION "2.01"
+#define DRIVER_VERSION "2.2"
static struct microcode_ops *microcode_ops;
static bool dis_ucode_ldr;
+LIST_HEAD(microcode_cache);
+
/*
* Synchronization.
*
@@ -167,7 +170,7 @@ void load_ucode_ap(void)
break;
case X86_VENDOR_AMD:
if (family >= 0x10)
- load_ucode_amd_ap();
+ load_ucode_amd_ap(family);
break;
default:
break;
@@ -185,7 +188,7 @@ static int __init save_microcode_in_initrd(void)
break;
case X86_VENDOR_AMD:
if (c->x86 >= 0x10)
- return save_microcode_in_initrd_amd();
+ return save_microcode_in_initrd_amd(c->x86);
break;
default:
break;
@@ -194,6 +197,58 @@ static int __init save_microcode_in_initrd(void)
return -EINVAL;
}
+struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+ unsigned long start = 0;
+ size_t size;
+
+#ifdef CONFIG_X86_32
+ struct boot_params *params;
+
+ if (use_pa)
+ params = (struct boot_params *)__pa_nodebug(&boot_params);
+ else
+ params = &boot_params;
+
+ size = params->hdr.ramdisk_size;
+
+ /*
+ * Set start only if we have an initrd image. We cannot use initrd_start
+ * because it is not set that early yet.
+ */
+ if (size)
+ start = params->hdr.ramdisk_image;
+
+# else /* CONFIG_X86_64 */
+ size = (unsigned long)boot_params.ext_ramdisk_size << 32;
+ size |= boot_params.hdr.ramdisk_size;
+
+ if (size) {
+ start = (unsigned long)boot_params.ext_ramdisk_image << 32;
+ start |= boot_params.hdr.ramdisk_image;
+
+ start += PAGE_OFFSET;
+ }
+# endif
+
+ /*
+ * Did we relocate the ramdisk?
+ *
+ * The ramdisk may get relocated *after* microcode has been applied on the
+ * BSP, so we rely on use_pa (use physical addresses) - even if it is not
+ * absolutely correct - to determine whether the ramdisk relocation has
+ * already happened.
+ */
+ if (!use_pa && relocated_ramdisk)
+ start = initrd_start;
+
+ return find_cpio_data(path, (void *)start, size, NULL);
+#else /* !CONFIG_BLK_DEV_INITRD */
+ return (struct cpio_data){ NULL, 0, "" };
+#endif
+}
+
void reload_early_microcode(void)
{
int vendor, family;
@@ -453,16 +508,17 @@ static struct attribute_group mc_attr_group = {
static void microcode_fini_cpu(int cpu)
{
- microcode_ops->microcode_fini_cpu(cpu);
+ if (microcode_ops->microcode_fini_cpu)
+ microcode_ops->microcode_fini_cpu(cpu);
}
static enum ucode_state microcode_resume_cpu(int cpu)
{
- pr_debug("CPU%d updated upon resume\n", cpu);
-
if (apply_microcode_on_target(cpu))
return UCODE_ERROR;
+ pr_debug("CPU%d updated upon resume\n", cpu);
+
return UCODE_OK;
}
@@ -496,6 +552,9 @@ static enum ucode_state microcode_update_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ /* Refresh CPU microcode revision after resume. */
+ collect_cpu_info(cpu);
+
if (uci->valid)
return microcode_resume_cpu(cpu);
@@ -579,12 +638,7 @@ static int mc_cpu_down_prep(unsigned int cpu)
/* Suspend is in progress, only remove the interface */
sysfs_remove_group(&dev->kobj, &mc_attr_group);
pr_debug("CPU%d removed\n", cpu);
- /*
- * When a CPU goes offline, don't free up or invalidate the copy of
- * the microcode in kernel memory, so that we can reuse it when the
- * CPU comes back online without unnecessarily requesting the userspace
- * for it again.
- */
+
return 0;
}
@@ -649,8 +703,7 @@ int __init microcode_init(void)
cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
mc_cpu_online, mc_cpu_down_prep);
- pr_info("Microcode Update Driver: v" MICROCODE_VERSION
- " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
+ pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);
return 0;
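
One detail of find_microcode_in_initrd() above worth spelling out: on 64-bit, the ramdisk address and size are each split across two boot_params fields and have to be stitched back together before PAGE_OFFSET is added. A tiny stand-alone illustration of that composition (the field values are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Hypothetical boot_params contents. */
        uint32_t ramdisk_image     = 0x7f6a2000;  /* hdr.ramdisk_image       */
        uint32_t ext_ramdisk_image = 0x00000001;  /* bits 63:32 of the addr  */
        uint32_t ramdisk_size      = 0x005f0000;  /* hdr.ramdisk_size        */
        uint32_t ext_ramdisk_size  = 0x00000000;  /* bits 63:32 of the size  */

        uint64_t start = ((uint64_t)ext_ramdisk_image << 32) | ramdisk_image;
        uint64_t size  = ((uint64_t)ext_ramdisk_size  << 32) | ramdisk_size;

        printf("initrd at %#llx, %llu bytes\n",
               (unsigned long long)start, (unsigned long long)size);
        return 0;
}
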
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index cdc0deab00c9..54d50c3694d8 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -39,125 +39,83 @@
#include <asm/setup.h>
#include <asm/msr.h>
-/*
- * Temporary microcode blobs pointers storage. We note here during early load
- * the pointers to microcode blobs we've got from whatever storage (detached
- * initrd, builtin). Later on, we put those into final storage
- * mc_saved_data.mc_saved.
- *
- * Important: those are offsets from the beginning of initrd or absolute
- * addresses within the kernel image when built-in.
- */
-static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT];
-
-static struct mc_saved_data {
- unsigned int num_saved;
- struct microcode_intel **mc_saved;
-} mc_saved_data;
+static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
-/* Microcode blobs within the initrd. 0 if builtin. */
-static struct ucode_blobs {
- unsigned long start;
- bool valid;
-} blobs;
+/* Current microcode patch used in early patching */
+struct microcode_intel *intel_ucode_patch;
-/* Go through saved patches and find the one suitable for the current CPU. */
-static enum ucode_state
-find_microcode_patch(struct microcode_intel **saved,
- unsigned int num_saved, struct ucode_cpu_info *uci)
+static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
+ unsigned int s2, unsigned int p2)
{
- struct microcode_intel *ucode_ptr, *new_mc = NULL;
- struct microcode_header_intel *mc_hdr;
- int new_rev, ret, i;
-
- new_rev = uci->cpu_sig.rev;
-
- for (i = 0; i < num_saved; i++) {
- ucode_ptr = saved[i];
- mc_hdr = (struct microcode_header_intel *)ucode_ptr;
-
- ret = has_newer_microcode(ucode_ptr,
- uci->cpu_sig.sig,
- uci->cpu_sig.pf,
- new_rev);
- if (!ret)
- continue;
-
- new_rev = mc_hdr->rev;
- new_mc = ucode_ptr;
- }
+ if (s1 != s2)
+ return false;
- if (!new_mc)
- return UCODE_NFOUND;
+ /* Processor flags are either both 0 ... */
+ if (!p1 && !p2)
+ return true;
- uci->mc = (struct microcode_intel *)new_mc;
- return UCODE_OK;
+ /* ... or they intersect. */
+ return p1 & p2;
}
-static inline void
-copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs,
- unsigned long off, int num_saved)
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int find_matching_signature(void *mc, unsigned int csig, int cpf)
{
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_sigtable *ext_hdr;
+ struct extended_signature *ext_sig;
int i;
- for (i = 0; i < num_saved; i++)
- mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off);
-}
-
-#ifdef CONFIG_X86_32
-static void
-microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs)
-{
- int i;
- struct microcode_intel ***mc_saved;
+ if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+ return 1;
- mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved);
+ /* Look for ext. headers: */
+ if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
+ return 0;
- for (i = 0; i < mcs->num_saved; i++) {
- struct microcode_intel *p;
+ ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
- p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i);
- mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+ return 1;
+ ext_sig++;
}
+ return 0;
}
-#endif
-static enum ucode_state
-load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- unsigned long offset, struct ucode_cpu_info *uci)
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
{
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int count = mcs->num_saved;
+ struct microcode_header_intel *mc_hdr = mc;
- if (!mcs->mc_saved) {
- copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count);
+ if (mc_hdr->rev <= new_rev)
+ return 0;
- return find_microcode_patch(mc_saved_tmp, count, uci);
- } else {
-#ifdef CONFIG_X86_32
- microcode_phys(mc_saved_tmp, mcs);
- return find_microcode_patch(mc_saved_tmp, count, uci);
-#else
- return find_microcode_patch(mcs->mc_saved, count, uci);
-#endif
- }
+ return find_matching_signature(mc, csig, cpf);
}
/*
* Given CPU signature and a microcode patch, this function finds if the
* microcode patch has matching family and model with the CPU.
+ *
+ * %true - if there's a match
+ * %false - otherwise
*/
-static enum ucode_state
-matching_model_microcode(struct microcode_header_intel *mc_header,
- unsigned long sig)
+static bool microcode_matches(struct microcode_header_intel *mc_header,
+ unsigned long sig)
{
- unsigned int fam, model;
- unsigned int fam_ucode, model_ucode;
- struct extended_sigtable *ext_header;
unsigned long total_size = get_totalsize(mc_header);
unsigned long data_size = get_datasize(mc_header);
- int ext_sigcount, i;
+ struct extended_sigtable *ext_header;
+ unsigned int fam_ucode, model_ucode;
struct extended_signature *ext_sig;
+ unsigned int fam, model;
+ int ext_sigcount, i;
fam = x86_family(sig);
model = x86_model(sig);
@@ -166,11 +124,11 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
model_ucode = x86_model(mc_header->sig);
if (fam == fam_ucode && model == model_ucode)
- return UCODE_OK;
+ return true;
/* Look for ext. headers: */
if (total_size <= data_size + MC_HEADER_SIZE)
- return UCODE_NFOUND;
+ return false;
ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
@@ -181,192 +139,242 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
model_ucode = x86_model(ext_sig->sig);
if (fam == fam_ucode && model == model_ucode)
- return UCODE_OK;
+ return true;
ext_sig++;
}
- return UCODE_NFOUND;
+ return false;
}
-static int
-save_microcode(struct mc_saved_data *mcs,
- struct microcode_intel **mc_saved_src,
- unsigned int num_saved)
+static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
{
- int i, j;
- struct microcode_intel **saved_ptr;
- int ret;
+ struct ucode_patch *p;
- if (!num_saved)
- return -EINVAL;
+ p = kzalloc(size, GFP_KERNEL);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
- /*
- * Copy new microcode data.
- */
- saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL);
- if (!saved_ptr)
- return -ENOMEM;
-
- for (i = 0; i < num_saved; i++) {
- struct microcode_header_intel *mc_hdr;
- struct microcode_intel *mc;
- unsigned long size;
-
- if (!mc_saved_src[i]) {
- ret = -EINVAL;
- goto err;
- }
+ p->data = kmemdup(data, size, GFP_KERNEL);
+ if (!p->data) {
+ kfree(p);
+ return ERR_PTR(-ENOMEM);
+ }
- mc = mc_saved_src[i];
- mc_hdr = &mc->hdr;
- size = get_totalsize(mc_hdr);
+ return p;
+}
- saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL);
- if (!saved_ptr[i]) {
- ret = -ENOMEM;
- goto err;
+static void save_microcode_patch(void *data, unsigned int size)
+{
+ struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+ struct ucode_patch *iter, *tmp, *p;
+ bool prev_found = false;
+ unsigned int sig, pf;
+
+ mc_hdr = (struct microcode_header_intel *)data;
+
+ list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
+ mc_saved_hdr = (struct microcode_header_intel *)iter->data;
+ sig = mc_saved_hdr->sig;
+ pf = mc_saved_hdr->pf;
+
+ if (find_matching_signature(data, sig, pf)) {
+ prev_found = true;
+
+ if (mc_hdr->rev <= mc_saved_hdr->rev)
+ continue;
+
+ p = __alloc_microcode_buf(data, size);
+ if (IS_ERR(p))
+ pr_err("Error allocating buffer %p\n", data);
+ else
+ list_replace(&iter->plist, &p->plist);
}
}
/*
- * Point to newly saved microcode.
+ * There weren't any previous patches found in the list cache; save the
+ * newly found one.
*/
- mcs->mc_saved = saved_ptr;
- mcs->num_saved = num_saved;
-
- return 0;
-
-err:
- for (j = 0; j <= i; j++)
- kfree(saved_ptr[j]);
- kfree(saved_ptr);
-
- return ret;
+ if (!prev_found) {
+ p = __alloc_microcode_buf(data, size);
+ if (IS_ERR(p))
+ pr_err("Error allocating buffer for %p\n", data);
+ else
+ list_add_tail(&p->plist, &microcode_cache);
+ }
}
-/*
- * A microcode patch in ucode_ptr is saved into mc_saved
- * - if it has matching signature and newer revision compared to an existing
- * patch mc_saved.
- * - or if it is a newly discovered microcode patch.
- *
- * The microcode patch should have matching model with CPU.
- *
- * Returns: The updated number @num_saved of saved microcode patches.
- */
-static unsigned int _save_mc(struct microcode_intel **mc_saved,
- u8 *ucode_ptr, unsigned int num_saved)
+static int microcode_sanity_check(void *mc, int print_err)
{
- struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
- unsigned int sig, pf;
- int found = 0, i;
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ u32 sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
- mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
- for (i = 0; i < num_saved; i++) {
- mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
- sig = mc_saved_hdr->sig;
- pf = mc_saved_hdr->pf;
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("Error: bad microcode data file size.\n");
+ return -EINVAL;
+ }
- if (!find_matching_signature(ucode_ptr, sig, pf))
- continue;
+ if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+ if (print_err)
+ pr_err("Error: invalid/unknown microcode update format.\n");
+ return -EINVAL;
+ }
- found = 1;
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ u32 ext_table_sum = 0;
+ u32 *ext_tablep;
- if (mc_hdr->rev <= mc_saved_hdr->rev)
- continue;
+ if ((ext_table_size < EXT_HEADER_SIZE)
+ || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("Error: truncated extended signature table.\n");
+ return -EINVAL;
+ }
+
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("Error: extended signature table size mismatch.\n");
+ return -EFAULT;
+ }
+
+ ext_sigcount = ext_header->count;
/*
- * Found an older ucode saved earlier. Replace it with
- * this newer one.
+ * Check extended table checksum: the sum of all dwords that
+ * comprise a valid table must be 0.
*/
- mc_saved[i] = (struct microcode_intel *)ucode_ptr;
- break;
+ ext_tablep = (u32 *)ext_header;
+
+ i = ext_table_size / sizeof(u32);
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("Bad extended signature table checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Calculate the checksum of update data and header. The checksum of
+ * valid update data and header including the extended signature table
+ * must be 0.
+ */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+ while (i--)
+ orig_sum += ((u32 *)mc)[i];
+
+ if (orig_sum) {
+ if (print_err)
+ pr_err("Bad microcode data checksum, aborting.\n");
+ return -EINVAL;
}
- /* Newly detected microcode, save it to memory. */
- if (i >= num_saved && !found)
- mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
+ if (!ext_table_size)
+ return 0;
- return num_saved;
+ /*
+ * Check extended signature checksum: 0 => valid.
+ */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+
+ sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("Bad extended signature checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
}
/*
* Get microcode matching with BSP's model. Only CPUs with the same model as
* BSP can stay in the platform.
*/
-static enum ucode_state __init
-get_matching_model_microcode(unsigned long start, void *data, size_t size,
- struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- struct ucode_cpu_info *uci)
+static struct microcode_intel *
+scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
{
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
struct microcode_header_intel *mc_header;
- unsigned int num_saved = mcs->num_saved;
- enum ucode_state state = UCODE_OK;
- unsigned int leftover = size;
- u8 *ucode_ptr = data;
+ struct microcode_intel *patch = NULL;
unsigned int mc_size;
- int i;
-
- while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) {
- if (leftover < sizeof(mc_header))
+ while (size) {
+ if (size < sizeof(struct microcode_header_intel))
break;
- mc_header = (struct microcode_header_intel *)ucode_ptr;
+ mc_header = (struct microcode_header_intel *)data;
mc_size = get_totalsize(mc_header);
- if (!mc_size || mc_size > leftover ||
- microcode_sanity_check(ucode_ptr, 0) < 0)
+ if (!mc_size ||
+ mc_size > size ||
+ microcode_sanity_check(data, 0) < 0)
break;
- leftover -= mc_size;
+ size -= mc_size;
- /*
- * Since APs with same family and model as the BSP may boot in
- * the platform, we need to find and save microcode patches
- * with the same family and model as the BSP.
- */
- if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) {
- ucode_ptr += mc_size;
+ if (!microcode_matches(mc_header, uci->cpu_sig.sig)) {
+ data += mc_size;
continue;
}
- num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved);
+ if (save) {
+ save_microcode_patch(data, mc_size);
+ goto next;
+ }
- ucode_ptr += mc_size;
- }
- if (leftover) {
- state = UCODE_ERROR;
- return state;
- }
+ if (!patch) {
+ if (!has_newer_microcode(data,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf,
+ uci->cpu_sig.rev))
+ goto next;
- if (!num_saved) {
- state = UCODE_NFOUND;
- return state;
- }
+ } else {
+ struct microcode_header_intel *phdr = &patch->hdr;
- for (i = 0; i < num_saved; i++)
- mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start;
+ if (!has_newer_microcode(data,
+ phdr->sig,
+ phdr->pf,
+ phdr->rev))
+ goto next;
+ }
+
+ /* We have a newer patch, save it. */
+ patch = data;
- mcs->num_saved = num_saved;
+next:
+ data += mc_size;
+ }
- return state;
+ if (size)
+ return NULL;
+
+ return patch;
}
static int collect_cpu_info_early(struct ucode_cpu_info *uci)
{
unsigned int val[2];
unsigned int family, model;
- struct cpu_signature csig;
+ struct cpu_signature csig = { 0 };
unsigned int eax, ebx, ecx, edx;
- csig.sig = 0;
- csig.pf = 0;
- csig.rev = 0;
-
memset(uci, 0, sizeof(*uci));
eax = 0x00000001;
@@ -374,8 +382,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
native_cpuid(&eax, &ebx, &ecx, &edx);
csig.sig = eax;
- family = x86_family(csig.sig);
- model = x86_model(csig.sig);
+ family = x86_family(eax);
+ model = x86_model(eax);
if ((model >= 5) || (family > 6)) {
/* get processor flags from MSR 0x17 */
@@ -401,40 +409,41 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
static void show_saved_mc(void)
{
#ifdef DEBUG
- int i, j;
+ int i = 0, j;
unsigned int sig, pf, rev, total_size, data_size, date;
struct ucode_cpu_info uci;
+ struct ucode_patch *p;
- if (!mc_saved_data.num_saved) {
+ if (list_empty(&microcode_cache)) {
pr_debug("no microcode data saved.\n");
return;
}
- pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved);
collect_cpu_info_early(&uci);
- sig = uci.cpu_sig.sig;
- pf = uci.cpu_sig.pf;
- rev = uci.cpu_sig.rev;
+ sig = uci.cpu_sig.sig;
+ pf = uci.cpu_sig.pf;
+ rev = uci.cpu_sig.rev;
pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
- for (i = 0; i < mc_saved_data.num_saved; i++) {
+ list_for_each_entry(p, &microcode_cache, plist) {
struct microcode_header_intel *mc_saved_header;
struct extended_sigtable *ext_header;
- int ext_sigcount;
struct extended_signature *ext_sig;
+ int ext_sigcount;
+
+ mc_saved_header = (struct microcode_header_intel *)p->data;
- mc_saved_header = (struct microcode_header_intel *)
- mc_saved_data.mc_saved[i];
- sig = mc_saved_header->sig;
- pf = mc_saved_header->pf;
- rev = mc_saved_header->rev;
- total_size = get_totalsize(mc_saved_header);
- data_size = get_datasize(mc_saved_header);
- date = mc_saved_header->date;
+ sig = mc_saved_header->sig;
+ pf = mc_saved_header->pf;
+ rev = mc_saved_header->rev;
+ date = mc_saved_header->date;
+
+ total_size = get_totalsize(mc_saved_header);
+ data_size = get_datasize(mc_saved_header);
pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n",
- i, sig, pf, rev, total_size,
+ i++, sig, pf, rev, total_size,
date & 0xffff,
date >> 24,
(date >> 16) & 0xff);
@@ -443,7 +452,7 @@ static void show_saved_mc(void)
if (total_size <= data_size + MC_HEADER_SIZE)
continue;
- ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
@@ -456,85 +465,43 @@ static void show_saved_mc(void)
ext_sig++;
}
-
}
#endif
}
/*
- * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
- * hot added or resumes.
- *
- * Please make sure this mc should be a valid microcode patch before calling
- * this function.
+ * Save this microcode patch. It will be loaded early when a CPU is
+ * hot-added or resumes.
*/
-static void save_mc_for_early(u8 *mc)
+static void save_mc_for_early(u8 *mc, unsigned int size)
{
#ifdef CONFIG_HOTPLUG_CPU
/* Synchronization during CPU hotplug. */
static DEFINE_MUTEX(x86_cpu_microcode_mutex);
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int mc_saved_count_init;
- unsigned int num_saved;
- struct microcode_intel **mc_saved;
- int ret, i;
-
mutex_lock(&x86_cpu_microcode_mutex);
- mc_saved_count_init = mc_saved_data.num_saved;
- num_saved = mc_saved_data.num_saved;
- mc_saved = mc_saved_data.mc_saved;
-
- if (mc_saved && num_saved)
- memcpy(mc_saved_tmp, mc_saved,
- num_saved * sizeof(struct microcode_intel *));
- /*
- * Save the microcode patch mc in mc_save_tmp structure if it's a newer
- * version.
- */
- num_saved = _save_mc(mc_saved_tmp, mc, num_saved);
-
- /*
- * Save the mc_save_tmp in global mc_saved_data.
- */
- ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved);
- if (ret) {
- pr_err("Cannot save microcode patch.\n");
- goto out;
- }
-
+ save_microcode_patch(mc, size);
show_saved_mc();
- /*
- * Free old saved microcode data.
- */
- if (mc_saved) {
- for (i = 0; i < mc_saved_count_init; i++)
- kfree(mc_saved[i]);
- kfree(mc_saved);
- }
-
-out:
mutex_unlock(&x86_cpu_microcode_mutex);
#endif
}
-static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
+static bool load_builtin_intel_microcode(struct cpio_data *cp)
{
-#ifdef CONFIG_X86_64
- unsigned int eax = 0x00000001, ebx, ecx = 0, edx;
+ unsigned int eax = 1, ebx, ecx = 0, edx;
char name[30];
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
native_cpuid(&eax, &ebx, &ecx, &edx);
sprintf(name, "intel-ucode/%02x-%02x-%02x",
x86_family(eax), x86_model(eax), x86_stepping(eax));
return get_builtin_firmware(cp, name);
-#else
- return false;
-#endif
}
/*
@@ -570,8 +537,7 @@ void show_ucode_info_early(void)
}
/*
- * At this point, we can not call printk() yet. Keep microcode patch number in
- * mc_saved_data.mc_saved and delay printing microcode info in
+ * At this point, we can not call printk() yet. Delay printing microcode info in
* show_ucode_info_early() until printk() works.
*/
static void print_ucode(struct ucode_cpu_info *uci)
@@ -648,206 +614,140 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
return 0;
}
-/*
- * This function converts microcode patch offsets previously stored in
- * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data.
- */
int __init save_microcode_in_initrd_intel(void)
{
- struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
- unsigned int count = mc_saved_data.num_saved;
- unsigned long offset = 0;
- int ret;
-
- if (!count)
- return 0;
+ struct ucode_cpu_info uci;
+ struct cpio_data cp;
/*
- * We have found a valid initrd but it might've been relocated in the
- * meantime so get its updated address.
+ * AP loading didn't find any microcode patch, so there is nothing to save.
*/
- if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid)
- offset = initrd_start;
-
- copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count);
+ if (!intel_ucode_patch || IS_ERR(intel_ucode_patch))
+ return 0;
- ret = save_microcode(&mc_saved_data, mc_saved, count);
- if (ret)
- pr_err("Cannot save microcode patches from initrd.\n");
- else
- show_saved_mc();
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(ucode_path, false);
- return ret;
-}
+ if (!(cp.data && cp.size))
+ return 0;
-static __init enum ucode_state
-__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp)
-{
-#ifdef CONFIG_BLK_DEV_INITRD
- static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
- char *p = IS_ENABLED(CONFIG_X86_32) ? (char *)__pa_nodebug(ucode_name)
- : ucode_name;
-# ifdef CONFIG_X86_32
- unsigned long start = 0, size;
- struct boot_params *params;
+ collect_cpu_info_early(&uci);
- params = (struct boot_params *)__pa_nodebug(&boot_params);
- size = params->hdr.ramdisk_size;
+ scan_microcode(cp.data, cp.size, &uci, true);
- /*
- * Set start only if we have an initrd image. We cannot use initrd_start
- * because it is not set that early yet.
- */
- start = (size ? params->hdr.ramdisk_image : 0);
+ show_saved_mc();
-# else /* CONFIG_X86_64 */
- unsigned long start = 0, size;
+ return 0;
+}
- size = (u64)boot_params.ext_ramdisk_size << 32;
- size |= boot_params.hdr.ramdisk_size;
- if (size) {
- start = (u64)boot_params.ext_ramdisk_image << 32;
- start |= boot_params.hdr.ramdisk_image;
+/*
+ * Returns a pointer to the microcode patch we found, or NULL otherwise.
+ */
+static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
+{
+ static const char *path;
+ struct cpio_data cp;
+ bool use_pa;
- start += PAGE_OFFSET;
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ path = (const char *)__pa_nodebug(ucode_path);
+ use_pa = true;
+ } else {
+ path = ucode_path;
+ use_pa = false;
}
-# endif
-
- *cd = find_cpio_data(p, (void *)start, size, NULL);
- if (cd->data) {
- blbp->start = start;
- blbp->valid = true;
- return UCODE_OK;
- } else
-#endif /* CONFIG_BLK_DEV_INITRD */
- return UCODE_ERROR;
-}
+ /* try built-in microcode first */
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(path, use_pa);
-static __init enum ucode_state
-scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- struct ucode_cpu_info *uci, struct ucode_blobs *blbp)
-{
- struct cpio_data cd = { NULL, 0, "" };
- enum ucode_state ret;
+ if (!(cp.data && cp.size))
+ return NULL;
- /* try built-in microcode first */
- if (load_builtin_intel_microcode(&cd))
- /*
- * Invalidate blobs as we might've gotten an initrd too,
- * supplied by the boot loader, by mistake or simply forgotten
- * there. That's fine, we ignore it since we've found builtin
- * microcode already.
- */
- blbp->valid = false;
- else {
- ret = __scan_microcode_initrd(&cd, blbp);
- if (ret != UCODE_OK)
- return ret;
- }
+ collect_cpu_info_early(uci);
- return get_matching_model_microcode(blbp->start, cd.data, cd.size,
- mcs, mc_ptrs, uci);
+ return scan_microcode(cp.data, cp.size, uci, false);
}
-static void __init
-_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- struct ucode_blobs *blbp)
+void __init load_ucode_intel_bsp(void)
{
+ struct microcode_intel *patch;
struct ucode_cpu_info uci;
- enum ucode_state ret;
-
- collect_cpu_info_early(&uci);
- ret = scan_microcode(mcs, mc_ptrs, &uci, blbp);
- if (ret != UCODE_OK)
+ patch = __load_ucode_intel(&uci);
+ if (!patch)
return;
- ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci);
- if (ret != UCODE_OK)
- return;
+ uci.mc = patch;
apply_microcode_early(&uci, true);
}
-void __init load_ucode_intel_bsp(void)
+void load_ucode_intel_ap(void)
{
- struct ucode_blobs *blobs_p;
- struct mc_saved_data *mcs;
- unsigned long *ptrs;
+ struct microcode_intel *patch, **iup;
+ struct ucode_cpu_info uci;
-#ifdef CONFIG_X86_32
- mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
- ptrs = (unsigned long *)__pa_nodebug(&mc_tmp_ptrs);
- blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs);
-#else
- mcs = &mc_saved_data;
- ptrs = mc_tmp_ptrs;
- blobs_p = &blobs;
-#endif
+ if (IS_ENABLED(CONFIG_X86_32))
+ iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch);
+ else
+ iup = &intel_ucode_patch;
+
+reget:
+ if (!*iup) {
+ patch = __load_ucode_intel(&uci);
+ if (!patch)
+ return;
+
+ *iup = patch;
+ }
- _load_ucode_intel_bsp(mcs, ptrs, blobs_p);
+ uci.mc = *iup;
+
+ if (apply_microcode_early(&uci, true)) {
+ /* Mixed-silicon system? Try to refetch the proper patch: */
+ *iup = NULL;
+
+ goto reget;
+ }
}
-void load_ucode_intel_ap(void)
+static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
{
- struct ucode_blobs *blobs_p;
- unsigned long *ptrs, start = 0;
- struct mc_saved_data *mcs;
- struct ucode_cpu_info uci;
- enum ucode_state ret;
+ struct microcode_header_intel *phdr;
+ struct ucode_patch *iter, *tmp;
-#ifdef CONFIG_X86_32
- mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
- ptrs = (unsigned long *)__pa_nodebug(mc_tmp_ptrs);
- blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs);
-#else
- mcs = &mc_saved_data;
- ptrs = mc_tmp_ptrs;
- blobs_p = &blobs;
-#endif
-
- /*
- * If there is no valid ucode previously saved in memory, no need to
- * update ucode on this AP.
- */
- if (!mcs->num_saved)
- return;
+ list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
- if (blobs_p->valid) {
- start = blobs_p->start;
+ phdr = (struct microcode_header_intel *)iter->data;
- /*
- * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles
- * physmem mapping too and there we have the initrd.
- */
- start += PAGE_OFFSET - __PAGE_OFFSET_BASE;
- }
+ if (phdr->rev <= uci->cpu_sig.rev)
+ continue;
- collect_cpu_info_early(&uci);
- ret = load_microcode(mcs, ptrs, start, &uci);
- if (ret != UCODE_OK)
- return;
+ if (!find_matching_signature(phdr,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf))
+ continue;
- apply_microcode_early(&uci, true);
+ return iter->data;
+ }
+ return NULL;
}
void reload_ucode_intel(void)
{
+ struct microcode_intel *p;
struct ucode_cpu_info uci;
- enum ucode_state ret;
-
- if (!mc_saved_data.num_saved)
- return;
collect_cpu_info_early(&uci);
- ret = find_microcode_patch(mc_saved_data.mc_saved,
- mc_saved_data.num_saved, &uci);
- if (ret != UCODE_OK)
+ p = find_patch(&uci);
+ if (!p)
return;
+ uci.mc = p;
+
apply_microcode_early(&uci, false);
}
@@ -879,24 +779,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
return 0;
}
-/*
- * return 0 - no update found
- * return 1 - found update
- */
-static int get_matching_mc(struct microcode_intel *mc, int cpu)
-{
- struct cpu_signature cpu_sig;
- unsigned int csig, cpf, crev;
-
- collect_cpu_info(cpu, &cpu_sig);
-
- csig = cpu_sig.sig;
- cpf = cpu_sig.pf;
- crev = cpu_sig.rev;
-
- return has_newer_microcode(mc, csig, cpf, crev);
-}
-
static int apply_microcode_intel(int cpu)
{
struct microcode_intel *mc;
@@ -911,16 +793,12 @@ static int apply_microcode_intel(int cpu)
uci = ucode_cpu_info + cpu;
mc = uci->mc;
- if (!mc)
- return 0;
-
- /*
- * Microcode on this CPU could be updated earlier. Only apply the
- * microcode patch in mc when it is newer than the one on this
- * CPU.
- */
- if (!get_matching_mc(mc, cpu))
- return 0;
+ if (!mc) {
+ /* Look for a newer patch in our cache: */
+ mc = find_patch(uci);
+ if (!mc)
+ return 0;
+ }
/* write microcode via MSR 0x79 */
wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
@@ -962,7 +840,6 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
int new_rev = uci->cpu_sig.rev;
unsigned int leftover = size;
- enum ucode_state state = UCODE_OK;
unsigned int curr_mc_size = 0;
unsigned int csig, cpf;
@@ -1015,14 +892,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
if (leftover) {
vfree(new_mc);
- state = UCODE_ERROR;
- goto out;
+ return UCODE_ERROR;
}
- if (!new_mc) {
- state = UCODE_NFOUND;
- goto out;
- }
+ if (!new_mc)
+ return UCODE_NFOUND;
vfree(uci->mc);
uci->mc = (struct microcode_intel *)new_mc;
@@ -1032,12 +906,12 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
* permanent memory. So it will be loaded early when a CPU is hot added
* or resumes.
*/
- save_mc_for_early(new_mc);
+ save_mc_for_early(new_mc, curr_mc_size);
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
-out:
- return state;
+
+ return UCODE_OK;
}
static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -1081,20 +955,11 @@ request_microcode_user(int cpu, const void __user *buf, size_t size)
return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
}
-static void microcode_fini_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- vfree(uci->mc);
- uci->mc = NULL;
-}
-
static struct microcode_ops microcode_intel_ops = {
.request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_fw,
.collect_cpu_info = collect_cpu_info,
.apply_microcode = apply_microcode_intel,
- .microcode_fini_cpu = microcode_fini_cpu,
};
struct microcode_ops * __init init_intel_microcode(void)
@@ -1109,4 +974,3 @@ struct microcode_ops * __init init_intel_microcode(void)
return &microcode_intel_ops;
}
-
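
A property that microcode_sanity_check(), now carried in intel.c itself, relies on is that a valid Intel update is self-checksumming: the 32-bit words of the header plus data sum to zero, and so do the words of the extended signature table when one is present. A minimal user-space helper expressing just that invariant (helper names are made up; the caller is assumed to have already computed the two region sizes from the header fields):

#include <stdint.h>
#include <stddef.h>

/* Sum a region as 32-bit words; bytes is assumed to be a multiple of four,
 * as the microcode format guarantees. */
static uint32_t dword_sum(const void *buf, size_t bytes)
{
        const uint32_t *p = buf;
        uint32_t sum = 0;

        for (size_t i = 0; i < bytes / sizeof(uint32_t); i++)
                sum += p[i];
        return sum;
}

/*
 * Returns 1 if both checksummed regions of the update are valid, 0 otherwise.
 * header_plus_data covers MC_HEADER_SIZE + data_size bytes; ext_table may be
 * NULL when the update carries no extended signature table.
 */
static int intel_ucode_checksums_ok(const void *header_plus_data, size_t hd_bytes,
                                    const void *ext_table, size_t ext_bytes)
{
        if (dword_sum(header_plus_data, hd_bytes))
                return 0;
        if (ext_table && dword_sum(ext_table, ext_bytes))
                return 0;
        return 1;
}
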
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
deleted file mode 100644
index 406cb6c0d9dd..000000000000
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Intel CPU Microcode Update Driver for Linux
- *
- * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
- * H Peter Anvin" <hpa@zytor.com>
- *
- * This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
- * Software Developer's Manual
- * Order Number 253668 or free download from:
- *
- * http://developer.intel.com/Assets/PDF/manual/253668.pdf
- *
- * For more information, go to http://www.urbanmyth.org/microcode
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-#include <linux/firmware.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-
-#include <asm/microcode_intel.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-
-static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
- unsigned int s2, unsigned int p2)
-{
- if (s1 != s2)
- return false;
-
- /* Processor flags are either both 0 ... */
- if (!p1 && !p2)
- return true;
-
- /* ... or they intersect. */
- return p1 & p2;
-}
-
-int microcode_sanity_check(void *mc, int print_err)
-{
- unsigned long total_size, data_size, ext_table_size;
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header = NULL;
- u32 sum, orig_sum, ext_sigcount = 0, i;
- struct extended_signature *ext_sig;
-
- total_size = get_totalsize(mc_header);
- data_size = get_datasize(mc_header);
-
- if (data_size + MC_HEADER_SIZE > total_size) {
- if (print_err)
- pr_err("Error: bad microcode data file size.\n");
- return -EINVAL;
- }
-
- if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
- if (print_err)
- pr_err("Error: invalid/unknown microcode update format.\n");
- return -EINVAL;
- }
-
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- u32 ext_table_sum = 0;
- u32 *ext_tablep;
-
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- if (print_err)
- pr_err("Error: truncated extended signature table.\n");
- return -EINVAL;
- }
-
- ext_header = mc + MC_HEADER_SIZE + data_size;
- if (ext_table_size != exttable_size(ext_header)) {
- if (print_err)
- pr_err("Error: extended signature table size mismatch.\n");
- return -EFAULT;
- }
-
- ext_sigcount = ext_header->count;
-
- /*
- * Check extended table checksum: the sum of all dwords that
- * comprise a valid table must be 0.
- */
- ext_tablep = (u32 *)ext_header;
-
- i = ext_table_size / sizeof(u32);
- while (i--)
- ext_table_sum += ext_tablep[i];
-
- if (ext_table_sum) {
- if (print_err)
- pr_warn("Bad extended signature table checksum, aborting.\n");
- return -EINVAL;
- }
- }
-
- /*
- * Calculate the checksum of update data and header. The checksum of
- * valid update data and header including the extended signature table
- * must be 0.
- */
- orig_sum = 0;
- i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
- while (i--)
- orig_sum += ((u32 *)mc)[i];
-
- if (orig_sum) {
- if (print_err)
- pr_err("Bad microcode data checksum, aborting.\n");
- return -EINVAL;
- }
-
- if (!ext_table_size)
- return 0;
-
- /*
- * Check extended signature checksum: 0 => valid.
- */
- for (i = 0; i < ext_sigcount; i++) {
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
- EXT_SIGNATURE_SIZE * i;
-
- sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
- (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
- if (sum) {
- if (print_err)
- pr_err("Bad extended signature checksum, aborting.\n");
- return -EINVAL;
- }
- }
- return 0;
-}
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int find_matching_signature(void *mc, unsigned int csig, int cpf)
-{
- struct microcode_header_intel *mc_hdr = mc;
- struct extended_sigtable *ext_hdr;
- struct extended_signature *ext_sig;
- int i;
-
- if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
- return 1;
-
- /* Look for ext. headers: */
- if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
- return 0;
-
- ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
- ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
-
- for (i = 0; i < ext_hdr->count; i++) {
- if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
- return 1;
- ext_sig++;
- }
- return 0;
-}
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
-{
- struct microcode_header_intel *mc_hdr = mc;
-
- if (mc_hdr->rev <= new_rev)
- return 0;
-
- return find_matching_signature(mc, csig, cpf);
-}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 8f44c5a50ab8..6c044543545e 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -25,7 +25,6 @@
#include <asm/hyperv.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
-#include <asm/idle.h>
#include <asm/irq_regs.h>
#include <asm/i8259.h>
#include <asm/apic.h>
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 1db8dc490b66..d1316f9c8329 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -17,11 +17,17 @@ struct cpuid_bit {
u32 sub_leaf;
};
-enum cpuid_regs {
- CR_EAX = 0,
- CR_ECX,
- CR_EDX,
- CR_EBX
+/* Please keep the leaf sorted by cpuid_bit.level for faster search. */
+static const struct cpuid_bit cpuid_bits[] = {
+ { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 },
+ { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 },
+ { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 },
+ { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
+ { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { 0, 0, 0, 0, 0 }
};
void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
@@ -30,18 +36,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
u32 regs[4];
const struct cpuid_bit *cb;
- static const struct cpuid_bit cpuid_bits[] = {
- { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 },
- { X86_FEATURE_AVX512_4VNNIW, CR_EDX, 2, 0x00000007, 0 },
- { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 },
- { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
- { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
- { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
- { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
- { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 },
- { 0, 0, 0, 0, 0 }
- };
-
for (cb = cpuid_bits; cb->feature; cb++) {
/* Verify that the level is valid */
@@ -50,10 +44,35 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
max_level > (cb->level | 0xffff))
continue;
- cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
- &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
+ cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX],
+ &regs[CPUID_EBX], &regs[CPUID_ECX],
+ &regs[CPUID_EDX]);
if (regs[cb->reg] & (1 << cb->bit))
set_cpu_cap(c, cb->feature);
}
}
+
+u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf,
+ enum cpuid_regs_idx reg)
+{
+ const struct cpuid_bit *cb;
+ u32 cpuid_val = 0;
+
+ for (cb = cpuid_bits; cb->feature; cb++) {
+
+ if (level > cb->level)
+ continue;
+
+ if (level < cb->level)
+ break;
+
+ if (reg == cb->reg && sub_leaf == cb->sub_leaf) {
+ if (cpu_has(&boot_cpu_data, cb->feature))
+ cpuid_val |= BIT(cb->bit);
+ }
+ }
+
+ return cpuid_val;
+}
+EXPORT_SYMBOL_GPL(get_scattered_cpuid_leaf);
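
For completeness, the per-entry test in init_scattered_cpuid_features() boils down to "execute the leaf, then check one bit in one output register". A user-space approximation using the compiler's cpuid.h helper (the three entries are copied from the table above; __get_cpuid_count() is a GCC/clang helper and needs a reasonably recent compiler):

#include <stdio.h>
#include <stddef.h>
#include <cpuid.h>      /* __get_cpuid_count() */

struct scattered_bit {
        const char *name;
        unsigned int reg;       /* 0=EAX, 1=EBX, 2=ECX, 3=EDX */
        unsigned int bit;
        unsigned int leaf;
        unsigned int subleaf;
};

static const struct scattered_bit bits[] = {
        { "aperfmperf",    2,  0, 0x00000006, 0 },
        { "intel_pt",      1, 25, 0x00000007, 0 },
        { "proc_feedback", 3, 11, 0x80000007, 0 },
};

int main(void)
{
        unsigned int regs[4];

        for (size_t i = 0; i < sizeof(bits) / sizeof(bits[0]); i++) {
                const struct scattered_bit *b = &bits[i];

                if (!__get_cpuid_count(b->leaf, b->subleaf, &regs[0],
                                       &regs[1], &regs[2], &regs[3]))
                        continue;       /* leaf not supported on this CPU */

                printf("%-13s: %s\n", b->name,
                       (regs[b->reg] >> b->bit) & 1 ? "yes" : "no");
        }
        return 0;
}
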
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 5130985b758b..891f4dad7b2c 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,11 +24,16 @@
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/export.h>
+#include <linux/clocksource.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
#include <asm/timer.h>
#include <asm/apic.h>
+#include <asm/timer.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "vmware: " fmt
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -48,6 +53,8 @@
"2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
"memory");
+static unsigned long vmware_tsc_khz __ro_after_init;
+
static inline int __vmware_platform(void)
{
uint32_t eax, ebx, ecx, edx;
@@ -57,35 +64,80 @@ static inline int __vmware_platform(void)
static unsigned long vmware_get_tsc_khz(void)
{
- uint64_t tsc_hz, lpj;
- uint32_t eax, ebx, ecx, edx;
+ return vmware_tsc_khz;
+}
- VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+#ifdef CONFIG_PARAVIRT
+static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
+static int vmw_sched_clock __initdata = 1;
- tsc_hz = eax | (((uint64_t)ebx) << 32);
- do_div(tsc_hz, 1000);
- BUG_ON(tsc_hz >> 32);
- pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
- (unsigned long) tsc_hz / 1000,
- (unsigned long) tsc_hz % 1000);
-
- if (!preset_lpj) {
- lpj = ((u64)tsc_hz * 1000);
- do_div(lpj, HZ);
- preset_lpj = lpj;
- }
+static __init int setup_vmw_sched_clock(char *s)
+{
+ vmw_sched_clock = 0;
+ return 0;
+}
+early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
+
+static unsigned long long vmware_sched_clock(void)
+{
+ unsigned long long ns;
- return tsc_hz;
+ ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
+ vmware_cyc2ns.cyc2ns_shift);
+ ns -= vmware_cyc2ns.cyc2ns_offset;
+ return ns;
}
+static void __init vmware_sched_clock_setup(void)
+{
+ struct cyc2ns_data *d = &vmware_cyc2ns;
+ unsigned long long tsc_now = rdtsc();
+
+ clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift,
+ vmware_tsc_khz, NSEC_PER_MSEC, 0);
+ d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
+ d->cyc2ns_shift);
+
+ pv_time_ops.sched_clock = vmware_sched_clock;
+ pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset);
+}
+
+static void __init vmware_paravirt_ops_setup(void)
+{
+ pv_info.name = "VMware hypervisor";
+ pv_cpu_ops.io_delay = paravirt_nop;
+
+ if (vmware_tsc_khz && vmw_sched_clock)
+ vmware_sched_clock_setup();
+}
+#else
+#define vmware_paravirt_ops_setup() do {} while (0)
+#endif
+
static void __init vmware_platform_setup(void)
{
uint32_t eax, ebx, ecx, edx;
+ uint64_t lpj, tsc_khz;
VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
if (ebx != UINT_MAX) {
+ lpj = tsc_khz = eax | (((uint64_t)ebx) << 32);
+ do_div(tsc_khz, 1000);
+ WARN_ON(tsc_khz >> 32);
+ pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
+ (unsigned long) tsc_khz / 1000,
+ (unsigned long) tsc_khz % 1000);
+
+ if (!preset_lpj) {
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+ }
+
+ vmware_tsc_khz = tsc_khz;
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
+ x86_platform.calibrate_cpu = vmware_get_tsc_khz;
+
#ifdef CONFIG_X86_LOCAL_APIC
/* Skip lapic calibration since we know the bus frequency. */
lapic_timer_frequency = ecx / HZ;
@@ -96,6 +148,8 @@ static void __init vmware_platform_setup(void)
pr_warn("Failed to get TSC freq from the hypervisor\n");
}
+ vmware_paravirt_ops_setup();
+
#ifdef CONFIG_X86_IO_APIC
no_timer_check = 1;
#endif
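
With the TSC frequency reported by the hypervisor, the new vmware_sched_clock() converts raw TSC cycles to nanoseconds using a fixed-point multiply-and-shift pair computed once by clocks_calc_mult_shift(), plus an offset so the clock starts near zero. The sketch below models that conversion in plain C; calc_mult_shift() is a simplified stand-in for the kernel helper, the __int128 product stands in for mul_u64_u32_shr(), and the 2.496 GHz frequency is made up. For a 2,496,000 kHz TSC the chosen mult/shift turns exactly 2,496,000 cycles into 1,000,000 ns, i.e. one millisecond, which is the property the scheduler clock relies on.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

/*
 * Simplified stand-in for clocks_calc_mult_shift(): pick the largest shift
 * for which mult = (to << shift) / from still fits in 32 bits, so that
 *   ns = (cycles * mult) >> shift
 * approximates cycles * (to / from).  Here "from" is the TSC rate in kHz
 * (cycles per millisecond) and "to" is NSEC_PER_MSEC.
 */
static void calc_mult_shift(uint32_t *mult, uint32_t *shift,
			    uint64_t from, uint64_t to)
{
	uint32_t sft;

	for (sft = 32; sft > 0; sft--) {
		uint64_t m = (to << sft) + from / 2;   /* round to nearest */

		m /= from;
		if (m >> 32 == 0) {
			*mult = (uint32_t)m;
			*shift = sft;
			return;
		}
	}
	*mult = (uint32_t)(to / from);
	*shift = 0;
}

static uint64_t cycles_to_ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	/* The kernel uses mul_u64_u32_shr() to get the 96-bit intermediate;
	 * __int128 keeps this sketch simple. */
	return (uint64_t)(((unsigned __int128)cycles * mult) >> shift);
}

int main(void)
{
	uint64_t tsc_khz = 2496000;            /* e.g. a 2.496 GHz TSC */
	uint32_t mult, shift;

	calc_mult_shift(&mult, &shift, tsc_khz, NSEC_PER_MSEC);

	/* 2496000 cycles at 2.496 GHz is exactly 1 ms. */
	printf("mult=%u shift=%u  1ms -> %llu ns\n", mult, shift,
	       (unsigned long long)cycles_to_ns(tsc_khz, mult, shift));
	return 0;
}
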
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2836de390f95..0931a105ffe1 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -45,10 +45,7 @@
#include <asm/msr.h>
static struct class *cpuid_class;
-
-struct cpuid_regs {
- u32 eax, ebx, ecx, edx;
-};
+static enum cpuhp_state cpuhp_cpuid_state;
static void cpuid_smp_cpuid(void *cmd_block)
{
@@ -115,7 +112,7 @@ static const struct file_operations cpuid_fops = {
.open = cpuid_open,
};
-static int cpuid_device_create(int cpu)
+static int cpuid_device_create(unsigned int cpu)
{
struct device *dev;
@@ -124,35 +121,12 @@ static int cpuid_device_create(int cpu)
return PTR_ERR_OR_ZERO(dev);
}
-static void cpuid_device_destroy(int cpu)
+static int cpuid_device_destroy(unsigned int cpu)
{
device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+ return 0;
}
-static int cpuid_class_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- int err = 0;
-
- switch (action) {
- case CPU_UP_PREPARE:
- err = cpuid_device_create(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- cpuid_device_destroy(cpu);
- break;
- }
- return notifier_from_errno(err);
-}
-
-static struct notifier_block cpuid_class_cpu_notifier =
-{
- .notifier_call = cpuid_class_cpu_callback,
-};
-
static char *cpuid_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
@@ -160,15 +134,13 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode)
static int __init cpuid_init(void)
{
- int i, err = 0;
- i = 0;
+ int err;
if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
"cpu/cpuid", &cpuid_fops)) {
printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
CPUID_MAJOR);
- err = -EBUSY;
- goto out;
+ return -EBUSY;
}
cpuid_class = class_create(THIS_MODULE, "cpuid");
if (IS_ERR(cpuid_class)) {
@@ -177,45 +149,28 @@ static int __init cpuid_init(void)
}
cpuid_class->devnode = cpuid_devnode;
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- err = cpuid_device_create(i);
- if (err != 0)
- goto out_class;
- }
- __register_hotcpu_notifier(&cpuid_class_cpu_notifier);
- cpu_notifier_register_done();
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online",
+ cpuid_device_create, cpuid_device_destroy);
+ if (err < 0)
+ goto out_class;
- err = 0;
- goto out;
+ cpuhp_cpuid_state = err;
+ return 0;
out_class:
- i = 0;
- for_each_online_cpu(i) {
- cpuid_device_destroy(i);
- }
- cpu_notifier_register_done();
class_destroy(cpuid_class);
out_chrdev:
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
-out:
return err;
}
+module_init(cpuid_init);
static void __exit cpuid_exit(void)
{
- int cpu = 0;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- cpuid_device_destroy(cpu);
+ cpuhp_remove_state(cpuhp_cpuid_state);
class_destroy(cpuid_class);
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
- __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
- cpu_notifier_register_done();
}
-
-module_init(cpuid_init);
module_exit(cpuid_exit);
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 9b7cf5c28f5f..0cfd01d2754c 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -22,7 +22,6 @@
int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
unsigned int code_bytes = 64;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
bool in_task_stack(unsigned long *stack, struct task_struct *task,
@@ -46,14 +45,7 @@ static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl)
{
touch_nmi_watchdog();
- printk("%s [<%p>] %s%pB\n",
- log_lvl, (void *)address, reliable ? "" : "? ",
- (void *)address);
-}
-
-void printk_address(unsigned long address)
-{
- pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address);
+ printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
@@ -67,6 +59,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%sCall Trace:\n", log_lvl);
unwind_start(&state, task, regs, stack);
+ stack = stack ? : get_stack_pointer(task, regs);
/*
* Iterate through the stacks, starting with the current stack pointer.
@@ -82,8 +75,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - softirq stack
* - hardirq stack
*/
- for (; stack; stack = stack_info.next_sp) {
- const char *str_begin, *str_end;
+ for (regs = NULL; stack; stack = stack_info.next_sp) {
+ const char *stack_name;
/*
* If we overflowed the task stack into a guard page, jump back
@@ -95,9 +88,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
- stack_type_str(stack_info.type, &str_begin, &str_end);
- if (str_begin)
- printk("%s <%s> ", log_lvl, str_begin);
+ stack_name = stack_type_name(stack_info.type);
+ if (stack_name)
+ printk("%s <%s>\n", log_lvl, stack_name);
/*
* Scan the stack, printing any text addresses we find. At the
@@ -112,13 +105,22 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
for (; stack < stack_info.end; stack++) {
unsigned long real_addr;
int reliable = 0;
- unsigned long addr = *stack;
+ unsigned long addr = READ_ONCE_NOCHECK(*stack);
unsigned long *ret_addr_p =
unwind_get_return_address_ptr(&state);
if (!__kernel_text_address(addr))
continue;
+ /*
+ * Don't print regs->ip again if it was already printed
+ * by __show_regs() below.
+ */
+ if (regs && stack == &regs->ip) {
+ unwind_next_frame(&state);
+ continue;
+ }
+
if (stack == ret_addr_p)
reliable = 1;
@@ -146,10 +148,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* of the addresses will just be printed as unreliable.
*/
unwind_next_frame(&state);
+
+ /* if the frame has entry regs, print them */
+ regs = unwind_get_entry_regs(&state);
+ if (regs)
+ __show_regs(regs, 0);
}
- if (str_end)
- printk("%s <%s> ", log_lvl, str_end);
+ if (stack_name)
+ printk("%s </%s>\n", log_lvl, stack_name);
}
}
@@ -164,12 +171,12 @@ void show_stack(struct task_struct *task, unsigned long *sp)
if (!sp && task == current)
sp = get_stack_pointer(current, NULL);
- show_stack_log_lvl(task, NULL, sp, "");
+ show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT);
}
void show_stack_regs(struct pt_regs *regs)
{
- show_stack_log_lvl(current, regs, NULL, "");
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
}
static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -261,14 +268,11 @@ int __die(const char *str, struct pt_regs *regs, long err)
sp = kernel_stack_pointer(regs);
savesegment(ss, ss);
}
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
+ printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n",
+ (void *)regs->ip, ss, sp);
#else
/* Executive summary in case the oops scrolled away */
- printk(KERN_ALERT "RIP ");
- printk_address(regs->ip);
- printk(" RSP <%016lx>\n", regs->sp);
+ printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp);
#endif
return 0;
}
@@ -291,22 +295,6 @@ void die(const char *str, struct pt_regs *regs, long err)
oops_end(flags, regs, sig);
}
-static int __init kstack_setup(char *s)
-{
- ssize_t ret;
- unsigned long val;
-
- if (!s)
- return -EINVAL;
-
- ret = kstrtoul(s, 0, &val);
- if (ret)
- return ret;
- kstack_depth_to_print = val;
- return 0;
-}
-early_param("kstack", kstack_setup);
-
static int __init code_bytes_setup(char *s)
{
ssize_t ret;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 06eb322b5f9f..bb3b5b9a6899 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,18 +16,15 @@
#include <asm/stacktrace.h>
-void stack_type_str(enum stack_type type, const char **begin, const char **end)
+const char *stack_type_name(enum stack_type type)
{
- switch (type) {
- case STACK_TYPE_IRQ:
- case STACK_TYPE_SOFTIRQ:
- *begin = "IRQ";
- *end = "EOI";
- break;
- default:
- *begin = NULL;
- *end = NULL;
- }
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
+
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
+
+ return NULL;
}
static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
@@ -109,8 +106,10 @@ recursion_check:
* just break out and report an unknown stack type.
*/
if (visit_mask) {
- if (*visit_mask & (1UL << info->type))
+ if (*visit_mask & (1UL << info->type)) {
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
goto unknown;
+ }
*visit_mask |= 1UL << info->type;
}
@@ -121,36 +120,6 @@ unknown:
return -EINVAL;
}
-void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, char *log_lvl)
-{
- unsigned long *stack;
- int i;
-
- if (!try_get_task_stack(task))
- return;
-
- sp = sp ? : get_stack_pointer(task, regs);
-
- stack = sp;
- for (i = 0; i < kstack_depth_to_print; i++) {
- if (kstack_end(stack))
- break;
- if ((i % STACKSLOTS_PER_LINE) == 0) {
- if (i != 0)
- pr_cont("\n");
- printk("%s %08lx", log_lvl, *stack++);
- } else
- pr_cont(" %08lx", *stack++);
- touch_nmi_watchdog();
- }
- pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, log_lvl);
-
- put_task_stack(task);
-}
-
-
void show_regs(struct pt_regs *regs)
{
int i;
@@ -168,8 +137,7 @@ void show_regs(struct pt_regs *regs)
unsigned char c;
u8 *ip;
- pr_emerg("Stack:\n");
- show_stack_log_lvl(current, regs, NULL, KERN_EMERG);
+ show_trace_log_lvl(current, regs, NULL, KERN_EMERG);
pr_emerg("Code:");
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 36cf1a498227..fac189efcc34 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -28,23 +28,17 @@ static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = {
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-void stack_type_str(enum stack_type type, const char **begin, const char **end)
+const char *stack_type_name(enum stack_type type)
{
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
- switch (type) {
- case STACK_TYPE_IRQ:
- *begin = "IRQ";
- *end = "EOI";
- break;
- case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST:
- *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION];
- *end = "EOE";
- break;
- default:
- *begin = NULL;
- *end = NULL;
- }
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
+
+ if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
+ return exception_stack_names[type - STACK_TYPE_EXCEPTION];
+
+ return NULL;
}
static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
@@ -128,8 +122,10 @@ recursion_check:
* just break out and report an unknown stack type.
*/
if (visit_mask) {
- if (*visit_mask & (1UL << info->type))
+ if (*visit_mask & (1UL << info->type)) {
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
goto unknown;
+ }
*visit_mask |= 1UL << info->type;
}
@@ -140,56 +136,6 @@ unknown:
return -EINVAL;
}
-void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, char *log_lvl)
-{
- unsigned long *irq_stack_end;
- unsigned long *irq_stack;
- unsigned long *stack;
- int i;
-
- if (!try_get_task_stack(task))
- return;
-
- irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr);
- irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long));
-
- sp = sp ? : get_stack_pointer(task, regs);
-
- stack = sp;
- for (i = 0; i < kstack_depth_to_print; i++) {
- unsigned long word;
-
- if (stack >= irq_stack && stack <= irq_stack_end) {
- if (stack == irq_stack_end) {
- stack = (unsigned long *) (irq_stack_end[-1]);
- pr_cont(" <EOI> ");
- }
- } else {
- if (kstack_end(stack))
- break;
- }
-
- if (probe_kernel_address(stack, word))
- break;
-
- if ((i % STACKSLOTS_PER_LINE) == 0) {
- if (i != 0)
- pr_cont("\n");
- printk("%s %016lx", log_lvl, word);
- } else
- pr_cont(" %016lx", word);
-
- stack++;
- touch_nmi_watchdog();
- }
-
- pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, log_lvl);
-
- put_task_stack(task);
-}
-
void show_regs(struct pt_regs *regs)
{
int i;
@@ -207,8 +153,7 @@ void show_regs(struct pt_regs *regs)
unsigned char c;
u8 *ip;
- printk(KERN_DEFAULT "Stack:\n");
- show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT);
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
printk(KERN_DEFAULT "Code: ");
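
Both the 32-bit and 64-bit get_stack_info() now warn (once, via printk_deferred) when the same stack type is seen twice during one walk, instead of failing silently. The guard is just a one-bit-per-stack-type mask; the standalone model below shows how it turns a corrupted next-stack chain into a bounded walk. The enum values and the chain are invented for the example.

#include <stdio.h>

enum stack_type { STACK_TASK, STACK_IRQ, STACK_SOFTIRQ, STACK_NR };

/*
 * Model of the visit_mask check: each stack type may be entered at most
 * once per walk.  A corrupted next-stack chain that loops back to a stack
 * type we already dumped is reported instead of being unwound forever.
 */
static int enter_stack(enum stack_type type, unsigned long *visit_mask)
{
	if (*visit_mask & (1UL << type)) {
		fprintf(stderr, "WARNING: stack recursion on stack type %d\n", type);
		return -1;                     /* treat as unknown stack */
	}
	*visit_mask |= 1UL << type;
	return 0;
}

int main(void)
{
	/* A (corrupted) chain: task -> irq -> task again. */
	enum stack_type chain[] = { STACK_TASK, STACK_IRQ, STACK_TASK };
	unsigned long visit_mask = 0;

	for (unsigned i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
		if (enter_stack(chain[i], &visit_mask))
			break;                 /* stop the walk, as the oops code does */
		printf("dumping stack type %d\n", chain[i]);
	}
	return 0;
}
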
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
index aad34aafc0e0..d913047f832c 100644
--- a/arch/x86/kernel/fpu/bugs.c
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -23,17 +23,12 @@ static double __initdata y = 3145727.0;
*/
void __init fpu__init_check_bugs(void)
{
- u32 cr0_saved;
s32 fdiv_bug;
/* kernel_fpu_begin/end() relies on patched alternative instructions. */
if (!boot_cpu_has(X86_FEATURE_FPU))
return;
- /* We might have CR0::TS set already, clear it: */
- cr0_saved = read_cr0();
- write_cr0(cr0_saved & ~X86_CR0_TS);
-
kernel_fpu_begin();
/*
@@ -56,8 +51,6 @@ void __init fpu__init_check_bugs(void)
kernel_fpu_end();
- write_cr0(cr0_saved);
-
if (fdiv_bug) {
set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
pr_warn("Hmm, FPU with FDIV bug\n");
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 47004010ad5d..e4e97a5355ce 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -58,27 +58,9 @@ static bool kernel_fpu_disabled(void)
return this_cpu_read(in_kernel_fpu);
}
-/*
- * Were we in an interrupt that interrupted kernel mode?
- *
- * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that
- * pair does nothing at all: the thread must not have fpu (so
- * that we don't try to save the FPU state), and TS must
- * be set (so that the clts/stts pair does nothing that is
- * visible in the interrupted kernel thread).
- *
- * Except for the eagerfpu case when we return true; in the likely case
- * the thread has FPU but we are not going to set/clear TS.
- */
static bool interrupted_kernel_fpu_idle(void)
{
- if (kernel_fpu_disabled())
- return false;
-
- if (use_eager_fpu())
- return true;
-
- return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS);
+ return !kernel_fpu_disabled();
}
/*
@@ -125,8 +107,7 @@ void __kernel_fpu_begin(void)
*/
copy_fpregs_to_fpstate(fpu);
} else {
- this_cpu_write(fpu_fpregs_owner_ctx, NULL);
- __fpregs_activate_hw();
+ __cpu_invalidate_fpregs_state();
}
}
EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -137,8 +118,6 @@ void __kernel_fpu_end(void)
if (fpu->fpregs_active)
copy_kernel_to_fpregs(&fpu->state);
- else
- __fpregs_deactivate_hw();
kernel_fpu_enable();
}
@@ -159,35 +138,6 @@ void kernel_fpu_end(void)
EXPORT_SYMBOL_GPL(kernel_fpu_end);
/*
- * CR0::TS save/restore functions:
- */
-int irq_ts_save(void)
-{
- /*
- * If in process context and not atomic, we can take a spurious DNA fault.
- * Otherwise, doing clts() in process context requires disabling preemption
- * or some heavy lifting like kernel_fpu_begin()
- */
- if (!in_atomic())
- return 0;
-
- if (read_cr0() & X86_CR0_TS) {
- clts();
- return 1;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(irq_ts_save);
-
-void irq_ts_restore(int TS_state)
-{
- if (TS_state)
- stts();
-}
-EXPORT_SYMBOL_GPL(irq_ts_restore);
-
-/*
* Save the FPU state (mark it for reload if necessary):
*
* This only ever gets called for the current task.
@@ -200,10 +150,7 @@ void fpu__save(struct fpu *fpu)
trace_x86_fpu_before_save(fpu);
if (fpu->fpregs_active) {
if (!copy_fpregs_to_fpstate(fpu)) {
- if (use_eager_fpu())
- copy_kernel_to_fpregs(&fpu->state);
- else
- fpregs_deactivate(fpu);
+ copy_kernel_to_fpregs(&fpu->state);
}
}
trace_x86_fpu_after_save(fpu);
@@ -247,7 +194,6 @@ EXPORT_SYMBOL_GPL(fpstate_init);
int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
{
- dst_fpu->counter = 0;
dst_fpu->fpregs_active = 0;
dst_fpu->last_cpu = -1;
@@ -260,8 +206,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
* Don't let 'init optimized' areas of the XSAVE area
* leak into the child task:
*/
- if (use_eager_fpu())
- memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+ memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
/*
* Save current FPU registers directly into the child
@@ -283,10 +228,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
memcpy(&src_fpu->state, &dst_fpu->state,
fpu_kernel_xstate_size);
- if (use_eager_fpu())
- copy_kernel_to_fpregs(&src_fpu->state);
- else
- fpregs_deactivate(src_fpu);
+ copy_kernel_to_fpregs(&src_fpu->state);
}
preempt_enable();
@@ -366,7 +308,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
if (fpu->fpstate_active) {
/* Invalidate any lazy state: */
- fpu->last_cpu = -1;
+ __fpu_invalidate_fpregs_state(fpu);
} else {
fpstate_init(&fpu->state);
trace_x86_fpu_init_state(fpu);
@@ -409,7 +351,7 @@ void fpu__current_fpstate_write_begin(void)
* ensures we will not be lazy and skip a XRSTOR in the
* future.
*/
- fpu->last_cpu = -1;
+ __fpu_invalidate_fpregs_state(fpu);
}
/*
@@ -459,7 +401,6 @@ void fpu__restore(struct fpu *fpu)
trace_x86_fpu_before_restore(fpu);
fpregs_activate(fpu);
copy_kernel_to_fpregs(&fpu->state);
- fpu->counter++;
trace_x86_fpu_after_restore(fpu);
kernel_fpu_enable();
}
@@ -477,7 +418,6 @@ EXPORT_SYMBOL_GPL(fpu__restore);
void fpu__drop(struct fpu *fpu)
{
preempt_disable();
- fpu->counter = 0;
if (fpu->fpregs_active) {
/* Ignore delayed exceptions from user space */
@@ -521,14 +461,14 @@ void fpu__clear(struct fpu *fpu)
{
WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
- if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) {
- /* FPU state will be reallocated lazily at the first use. */
- fpu__drop(fpu);
- } else {
- if (!fpu->fpstate_active) {
- fpu__activate_curr(fpu);
- user_fpu_begin();
- }
+ fpu__drop(fpu);
+
+ /*
+ * Make sure fpstate is cleared and initialized.
+ */
+ if (static_cpu_has(X86_FEATURE_FPU)) {
+ fpu__activate_curr(fpu);
+ user_fpu_begin();
copy_init_fpstate_to_fpregs();
}
}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 2f2b8c7ccb85..60dece392b3a 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -10,18 +10,6 @@
#include <linux/init.h>
/*
- * Initialize the TS bit in CR0 according to the style of context-switches
- * we are using:
- */
-static void fpu__init_cpu_ctx_switch(void)
-{
- if (!boot_cpu_has(X86_FEATURE_EAGER_FPU))
- stts();
- else
- clts();
-}
-
-/*
* Initialize the registers found in all CPUs, CR0 and CR4:
*/
static void fpu__init_cpu_generic(void)
@@ -58,7 +46,6 @@ void fpu__init_cpu(void)
{
fpu__init_cpu_generic();
fpu__init_cpu_xstate();
- fpu__init_cpu_ctx_switch();
}
/*
@@ -233,82 +220,16 @@ static void __init fpu__init_system_xstate_size_legacy(void)
}
/*
- * FPU context switching strategies:
- *
- * Against popular belief, we don't do lazy FPU saves, due to the
- * task migration complications it brings on SMP - we only do
- * lazy FPU restores.
- *
- * 'lazy' is the traditional strategy, which is based on setting
- * CR0::TS to 1 during context-switch (instead of doing a full
- * restore of the FPU state), which causes the first FPU instruction
- * after the context switch (whenever it is executed) to fault - at
- * which point we lazily restore the FPU state into FPU registers.
- *
- * Tasks are of course under no obligation to execute FPU instructions,
- * so it can easily happen that another context-switch occurs without
- * a single FPU instruction being executed. If we eventually switch
- * back to the original task (that still owns the FPU) then we have
- * not only saved the restores along the way, but we also have the
- * FPU ready to be used for the original task.
- *
- * 'lazy' is deprecated because it's almost never a performance win
- * and it's much more complicated than 'eager'.
- *
- * 'eager' switching is by default on all CPUs, there we switch the FPU
- * state during every context switch, regardless of whether the task
- * has used FPU instructions in that time slice or not. This is done
- * because modern FPU context saving instructions are able to optimize
- * state saving and restoration in hardware: they can detect both
- * unused and untouched FPU state and optimize accordingly.
- *
- * [ Note that even in 'lazy' mode we might optimize context switches
- * to use 'eager' restores, if we detect that a task is using the FPU
- * frequently. See the fpu->counter logic in fpu/internal.h for that. ]
- */
-static enum { ENABLE, DISABLE } eagerfpu = ENABLE;
-
-/*
* Find supported xfeatures based on cpu features and command-line input.
* This must be called after fpu__init_parse_early_param() is called and
* xfeatures_mask is enumerated.
*/
u64 __init fpu__get_supported_xfeatures_mask(void)
{
- /* Support all xfeatures known to us */
- if (eagerfpu != DISABLE)
- return XCNTXT_MASK;
-
- /* Warning of xfeatures being disabled for no eagerfpu mode */
- if (xfeatures_mask & XFEATURE_MASK_EAGER) {
- pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
- xfeatures_mask & XFEATURE_MASK_EAGER);
- }
-
- /* Return a mask that masks out all features requiring eagerfpu mode */
- return ~XFEATURE_MASK_EAGER;
+ return XCNTXT_MASK;
}
-/*
- * Disable features dependent on eagerfpu.
- */
-static void __init fpu__clear_eager_fpu_features(void)
-{
- setup_clear_cpu_cap(X86_FEATURE_MPX);
-}
-
-/*
- * Pick the FPU context switching strategy:
- *
- * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
- * the following is true:
- *
- * (1) the cpu has xsaveopt, as it has the optimization and doing eager
- * FPU switching has a relatively low cost compared to a plain xsave;
- * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
- * switching. Should the kernel boot with noxsaveopt, we support MPX
- * with eager FPU switching at a higher cost.
- */
+/* Legacy code to initialize eager fpu mode. */
static void __init fpu__init_system_ctx_switch(void)
{
static bool on_boot_cpu __initdata = 1;
@@ -317,17 +238,6 @@ static void __init fpu__init_system_ctx_switch(void)
on_boot_cpu = 0;
WARN_ON_FPU(current->thread.fpu.fpstate_active);
-
- if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
- eagerfpu = ENABLE;
-
- if (xfeatures_mask & XFEATURE_MASK_EAGER)
- eagerfpu = ENABLE;
-
- if (eagerfpu == ENABLE)
- setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
-
- printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy");
}
/*
@@ -336,11 +246,6 @@ static void __init fpu__init_system_ctx_switch(void)
*/
static void __init fpu__init_parse_early_param(void)
{
- if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
- eagerfpu = DISABLE;
- fpu__clear_eager_fpu_features();
- }
-
if (cmdline_find_option_bool(boot_command_line, "no387"))
setup_clear_cpu_cap(X86_FEATURE_FPU);
@@ -375,14 +280,6 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
*/
fpu__init_cpu();
- /*
- * But don't leave CR0::TS set yet, as some of the FPU setup
- * methods depend on being able to execute FPU instructions
- * that will fault on a set TS, such as the FXSAVE in
- * fpu__init_system_mxcsr().
- */
- clts();
-
fpu__init_system_generic();
fpu__init_system_xstate_size_legacy();
fpu__init_system_xstate();
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a184c210efba..83c23c230b4c 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -340,11 +340,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
}
fpu->fpstate_active = 1;
- if (use_eager_fpu()) {
- preempt_disable();
- fpu__restore(fpu);
- preempt_enable();
- }
+ preempt_disable();
+ fpu__restore(fpu);
+ preempt_enable();
return err;
} else {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 095ef7ddd6ae..1d7770447b3e 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -65,6 +65,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
@@ -73,6 +74,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
setup_clear_cpu_cap(X86_FEATURE_PKU);
setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
@@ -890,15 +892,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
*/
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return -EINVAL;
- /*
- * For most XSAVE components, this would be an arduous task:
- * brining fpstate up to date with fpregs, updating fpstate,
- * then re-populating fpregs. But, for components that are
- * never lazily managed, we can just access the fpregs
- * directly. PKRU is never managed lazily, so we can just
- * manipulate it directly. Make sure it stays that way.
- */
- WARN_ON_ONCE(!use_eager_fpu());
/* Set the bits we need in PKRU: */
if (init_val & PKEY_DISABLE_ACCESS)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index b6b2f0264af3..4e8577d03372 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -63,6 +63,8 @@
#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
#endif
+#define SIZEOF_PTREGS 17*4
+
/*
* Number of possible pages in the lowmem region.
*
@@ -248,19 +250,19 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
#ifdef CONFIG_PARAVIRT
/* This can only trip for a broken bootloader... */
cmpw $0x207, pa(boot_params + BP_version)
- jb default_entry
+ jb .Ldefault_entry
/* Paravirt-compatible boot parameters. Look to see what architecture
we're booting under. */
movl pa(boot_params + BP_hardware_subarch), %eax
cmpl $num_subarch_entries, %eax
- jae bad_subarch
+ jae .Lbad_subarch
movl pa(subarch_entries)(,%eax,4), %eax
subl $__PAGE_OFFSET, %eax
jmp *%eax
-bad_subarch:
+.Lbad_subarch:
WEAK(lguest_entry)
WEAK(xen_entry)
/* Unknown implementation; there's really
@@ -270,14 +272,14 @@ WEAK(xen_entry)
__INITDATA
subarch_entries:
- .long default_entry /* normal x86/PC */
+ .long .Ldefault_entry /* normal x86/PC */
.long lguest_entry /* lguest hypervisor */
.long xen_entry /* Xen hypervisor */
- .long default_entry /* Moorestown MID */
+ .long .Ldefault_entry /* Moorestown MID */
num_subarch_entries = (. - subarch_entries) / 4
.previous
#else
- jmp default_entry
+ jmp .Ldefault_entry
#endif /* CONFIG_PARAVIRT */
#ifdef CONFIG_HOTPLUG_CPU
@@ -289,7 +291,8 @@ num_subarch_entries = (. - subarch_entries) / 4
ENTRY(start_cpu0)
movl initial_stack, %ecx
movl %ecx, %esp
- jmp *(initial_code)
+ call *(initial_code)
+1: jmp 1b
ENDPROC(start_cpu0)
#endif
@@ -317,7 +320,7 @@ ENTRY(startup_32_smp)
call load_ucode_ap
#endif
-default_entry:
+.Ldefault_entry:
#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
X86_CR0_PG)
@@ -347,7 +350,7 @@ default_entry:
pushfl
popl %eax # get EFLAGS
testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remain set?
- jz enable_paging # hw disallowed setting of ID bit
+ jz .Lenable_paging # hw disallowed setting of ID bit
# which means no CPUID and no CR4
xorl %eax,%eax
@@ -357,13 +360,13 @@ default_entry:
movl $1,%eax
cpuid
andl $~1,%edx # Ignore CPUID.FPU
- jz enable_paging # No flags or only CPUID.FPU = no CR4
+ jz .Lenable_paging # No flags or only CPUID.FPU = no CR4
movl pa(mmu_cr4_features),%eax
movl %eax,%cr4
testb $X86_CR4_PAE, %al # check if PAE is enabled
- jz enable_paging
+ jz .Lenable_paging
/* Check if extended functions are implemented */
movl $0x80000000, %eax
@@ -371,7 +374,7 @@ default_entry:
/* Value must be in the range 0x80000001 to 0x8000ffff */
subl $0x80000001, %eax
cmpl $(0x8000ffff-0x80000001), %eax
- ja enable_paging
+ ja .Lenable_paging
/* Clear bogus XD_DISABLE bits */
call verify_cpu
@@ -380,7 +383,7 @@ default_entry:
cpuid
/* Execute Disable bit supported? */
btl $(X86_FEATURE_NX & 31), %edx
- jnc enable_paging
+ jnc .Lenable_paging
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
@@ -390,7 +393,7 @@ default_entry:
/* Make changes effective */
wrmsr
-enable_paging:
+.Lenable_paging:
/*
* Enable paging
@@ -419,7 +422,7 @@ enable_paging:
*/
movb $4,X86 # at least 486
cmpl $-1,X86_CPUID
- je is486
+ je .Lis486
/* get vendor info */
xorl %eax,%eax # call CPUID with 0 -> return vendor ID
@@ -430,7 +433,7 @@ enable_paging:
movl %ecx,X86_VENDOR_ID+8 # last 4 chars
orl %eax,%eax # do we have processor info as well?
- je is486
+ je .Lis486
movl $1,%eax # Use the CPUID instruction to get CPU type
cpuid
@@ -444,7 +447,7 @@ enable_paging:
movb %cl,X86_MASK
movl %edx,X86_CAPABILITY
-is486:
+.Lis486:
movl $0x50022,%ecx # set AM, WP, NE and MP
movl %cr0,%eax
andl $0x80000011,%eax # Save PG,PE,ET
@@ -470,8 +473,9 @@ is486:
xorl %eax,%eax # Clear LDT
lldt %ax
- pushl $0 # fake return address for unwinder
- jmp *(initial_code)
+ call *(initial_code)
+1: jmp 1b
+ENDPROC(startup_32_smp)
#include "verify_cpu.S"
@@ -665,14 +669,17 @@ __PAGE_ALIGNED_BSS
initial_pg_pmd:
.fill 1024*KPMDS,4,0
#else
-ENTRY(initial_page_table)
+.globl initial_page_table
+initial_page_table:
.fill 1024,4,0
#endif
initial_pg_fixmap:
.fill 1024,4,0
-ENTRY(empty_zero_page)
+.globl empty_zero_page
+empty_zero_page:
.fill 4096,1,0
-ENTRY(swapper_pg_dir)
+.globl swapper_pg_dir
+swapper_pg_dir:
.fill 1024,4,0
EXPORT_SYMBOL(empty_zero_page)
@@ -706,7 +713,12 @@ ENTRY(initial_page_table)
.data
.balign 4
ENTRY(initial_stack)
- .long init_thread_union+THREAD_SIZE
+ /*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+ * unwinder reliably detect the end of the stack.
+ */
+ .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \
+ TOP_OF_KERNEL_STACK_PADDING;
__INITRODATA
int_msg:
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b4421cc191b0..90de28841242 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -66,13 +66,8 @@ startup_64:
* tables and then reload them.
*/
- /*
- * Setup stack for verify_cpu(). "-8" because initial_stack is defined
- * this way, see below. Our best guess is a NULL ptr for stack
- * termination heuristics and we don't want to break anything which
- * might depend on it (kgdb, ...).
- */
- leaq (__end_init_task - 8)(%rip), %rsp
+ /* Set up the stack for verify_cpu(), similar to initial_stack below */
+ leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
/* Sanitize CPU configuration */
call verify_cpu
@@ -117,20 +112,20 @@ startup_64:
movq %rdi, %rax
shrq $PGDIR_SHIFT, %rax
- leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx
+ leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
movq %rdx, 0(%rbx,%rax,8)
movq %rdx, 8(%rbx,%rax,8)
- addq $4096, %rdx
+ addq $PAGE_SIZE, %rdx
movq %rdi, %rax
shrq $PUD_SHIFT, %rax
andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, 4096(%rbx,%rax,8)
+ movq %rdx, PAGE_SIZE(%rbx,%rax,8)
incl %eax
andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, 4096(%rbx,%rax,8)
+ movq %rdx, PAGE_SIZE(%rbx,%rax,8)
- addq $8192, %rbx
+ addq $PAGE_SIZE * 2, %rbx
movq %rdi, %rax
shrq $PMD_SHIFT, %rdi
addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
@@ -147,6 +142,9 @@ startup_64:
decl %ecx
jnz 1b
+ test %rbp, %rbp
+ jz .Lskip_fixup
+
/*
* Fixup the kernel text+data virtual addresses. Note that
* we might write invalid pmds, when the kernel is relocated
@@ -154,9 +152,9 @@ startup_64:
* beyond _end.
*/
leaq level2_kernel_pgt(%rip), %rdi
- leaq 4096(%rdi), %r8
+ leaq PAGE_SIZE(%rdi), %r8
/* See if it is a valid page table entry */
-1: testb $1, 0(%rdi)
+1: testb $_PAGE_PRESENT, 0(%rdi)
jz 2f
addq %rbp, 0(%rdi)
/* Go to the next page */
@@ -167,6 +165,7 @@ startup_64:
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
+.Lskip_fixup:
movq $(early_level4_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
@@ -265,13 +264,17 @@ ENTRY(secondary_startup_64)
movl $MSR_GS_BASE,%ecx
movl initial_gs(%rip),%eax
movl initial_gs+4(%rip),%edx
- wrmsr
+ wrmsr
/* rsi is pointer to real mode structure with interesting info.
pass it to C */
movq %rsi, %rdi
-
- /* Finally jump to run C code and to be on real kernel address
+ jmp start_cpu
+ENDPROC(secondary_startup_64)
+
+ENTRY(start_cpu)
+ /*
+ * Jump to run C code and to be on a real kernel address.
* Since we are running on identity-mapped space we have to jump
* to the full 64bit address, this is only possible as indirect
* jump. In addition we need to ensure %cs is set so we make this
@@ -295,12 +298,13 @@ ENTRY(secondary_startup_64)
* REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
* address given in m16:64.
*/
- movq initial_code(%rip),%rax
- pushq $0 # fake return address to stop unwinder
+ call 1f # put return address on stack for unwinder
+1: xorq %rbp, %rbp # clear frame pointer
+ movq initial_code(%rip), %rax
pushq $__KERNEL_CS # set correct cs
pushq %rax # target address in negative space
lretq
-ENDPROC(secondary_startup_64)
+ENDPROC(start_cpu)
#include "verify_cpu.S"
@@ -308,15 +312,11 @@ ENDPROC(secondary_startup_64)
/*
* Boot CPU0 entry point. It's called from play_dead(). Everything has been set
* up already except stack. We just set up stack here. Then call
- * start_secondary().
+ * start_secondary() via start_cpu().
*/
ENTRY(start_cpu0)
- movq initial_stack(%rip),%rsp
- movq initial_code(%rip),%rax
- pushq $0 # fake return address to stop unwinder
- pushq $__KERNEL_CS # set correct cs
- pushq %rax # target address in negative space
- lretq
+ movq initial_stack(%rip), %rsp
+ jmp start_cpu
ENDPROC(start_cpu0)
#endif
@@ -328,7 +328,11 @@ ENDPROC(start_cpu0)
GLOBAL(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)
GLOBAL(initial_stack)
- .quad init_thread_union+THREAD_SIZE-8
+ /*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+ * unwinder reliably detect the end of the stack.
+ */
+ .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
__FINITDATA
bad_address:
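
Both head_32.S and head_64.S now leave a pt_regs-sized gap (plus TOP_OF_KERNEL_STACK_PADDING on 32-bit, where it is non-zero) at the top of the initial stack, so every task stack ends at the same well-known offset. The hedged C sketch below shows the kind of address check a frame-pointer unwinder can make thanks to that convention; end_of_task_stack() is a made-up name, and the sketch assumes SIZEOF_PTREGS equals sizeof(struct pt_regs), which is what the 17*4 definition in head_32.S encodes for 32-bit.

/*
 * Illustrative only: with the SIZEOF_PTREGS gap reserved above, the last
 * frame of every task stack sits at a fixed offset from the stack top, so
 * an unwinder can recognize "end of stack" by address alone.
 * task_stack_page(), THREAD_SIZE and TOP_OF_KERNEL_STACK_PADDING are the
 * real kernel symbols; end_of_task_stack() is a made-up helper name.
 */
#include <linux/sched.h>
#include <asm/ptrace.h>
#include <asm/thread_info.h>

static unsigned long end_of_task_stack(struct task_struct *task)
{
	unsigned long top = (unsigned long)task_stack_page(task) + THREAD_SIZE;

	/* Matches the initial_stack layout set up in head_32.S/head_64.S. */
	return top - sizeof(struct pt_regs) - TOP_OF_KERNEL_STACK_PADDING;
}
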
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9f669fdd2010..7c6e9ffe4424 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -14,7 +14,6 @@
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/irq.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
#include <asm/desc.h>
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 9ebd0b0e73d9..6b0678a541e2 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -16,7 +16,6 @@
#include <linux/uaccess.h>
#include <linux/smp.h>
#include <asm/io_apic.h>
-#include <asm/idle.h>
#include <asm/apic.h>
int sysctl_panic_on_stackoverflow;
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
new file mode 100644
index 000000000000..cb9c1ed1d391
--- /dev/null
+++ b/arch/x86/kernel/itmt.c
@@ -0,0 +1,215 @@
+/*
+ * itmt.c: Support Intel Turbo Boost Max Technology 3.0
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * On platforms supporting Intel Turbo Boost Max Technology 3.0 (ITMT),
+ * the maximum turbo frequencies of some cores in a CPU package may be
+ * higher than for the other cores in the same package. In that case,
+ * better performance can be achieved by making the scheduler prefer
+ * to run tasks on the CPUs with higher max turbo frequencies.
+ *
+ * This file provides functions and data structures for enabling the
+ * scheduler to favor scheduling on cores that can be boosted to a higher
+ * frequency under ITMT.
+ */
+
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/mutex.h>
+#include <linux/sysctl.h>
+#include <linux/nodemask.h>
+
+static DEFINE_MUTEX(itmt_update_mutex);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+
+/* Boolean to track if system has ITMT capabilities */
+static bool __read_mostly sched_itmt_capable;
+
+/*
+ * Boolean to control whether we want to move processes to CPUs capable
+ * of a higher turbo frequency on systems supporting Intel Turbo Boost Max
+ * Technology 3.0.
+ *
+ * It can be set via /proc/sys/kernel/sched_itmt_enabled
+ */
+unsigned int __read_mostly sysctl_sched_itmt_enabled;
+
+static int sched_itmt_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ unsigned int old_sysctl;
+ int ret;
+
+ mutex_lock(&itmt_update_mutex);
+
+ if (!sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return -EINVAL;
+ }
+
+ old_sysctl = sysctl_sched_itmt_enabled;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) {
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+
+ mutex_unlock(&itmt_update_mutex);
+
+ return ret;
+}
+
+static unsigned int zero;
+static unsigned int one = 1;
+static struct ctl_table itmt_kern_table[] = {
+ {
+ .procname = "sched_itmt_enabled",
+ .data = &sysctl_sched_itmt_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_itmt_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {}
+};
+
+static struct ctl_table itmt_root_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = itmt_kern_table,
+ },
+ {}
+};
+
+static struct ctl_table_header *itmt_sysctl_header;
+
+/**
+ * sched_set_itmt_support() - Indicate platform supports ITMT
+ *
+ * This function is used by the OS to indicate to the scheduler that the
+ * platform is capable of supporting the ITMT feature.
+ *
+ * The current scheme has the pstate driver detect whether the system
+ * is ITMT capable and then call sched_set_itmt_support().
+ *
+ * This must be done only after sched_set_itmt_core_prio() has been
+ * called to set the CPUs' priorities. It must not be called with the
+ * CPU hotplug lock held, as we need to acquire that lock later to
+ * rebuild the sched domains.
+ *
+ * Return: 0 on success
+ */
+int sched_set_itmt_support(void)
+{
+ mutex_lock(&itmt_update_mutex);
+
+ if (sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return 0;
+ }
+
+ itmt_sysctl_header = register_sysctl_table(itmt_root_table);
+ if (!itmt_sysctl_header) {
+ mutex_unlock(&itmt_update_mutex);
+ return -ENOMEM;
+ }
+
+ sched_itmt_capable = true;
+
+ sysctl_sched_itmt_enabled = 1;
+
+ if (sysctl_sched_itmt_enabled) {
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+
+ mutex_unlock(&itmt_update_mutex);
+
+ return 0;
+}
+
+/**
+ * sched_clear_itmt_support() - Revoke platform's support of ITMT
+ *
+ * This function is used by the OS to indicate that it has
+ * revoked the platform's support of the ITMT feature.
+ *
+ * It must not be called with the CPU hotplug lock held, as we need
+ * to acquire that lock later to rebuild the sched domains.
+ */
+void sched_clear_itmt_support(void)
+{
+ mutex_lock(&itmt_update_mutex);
+
+ if (!sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return;
+ }
+ sched_itmt_capable = false;
+
+ if (itmt_sysctl_header) {
+ unregister_sysctl_table(itmt_sysctl_header);
+ itmt_sysctl_header = NULL;
+ }
+
+ if (sysctl_sched_itmt_enabled) {
+ /* disable sched_itmt if we are no longer ITMT capable */
+ sysctl_sched_itmt_enabled = 0;
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+
+ mutex_unlock(&itmt_update_mutex);
+}
+
+int arch_asym_cpu_priority(int cpu)
+{
+ return per_cpu(sched_core_priority, cpu);
+}
+
+/**
+ * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
+ * @prio: Priority of cpu core
+ * @core_cpu: The cpu number associated with the core
+ *
+ * The pstate driver will find out the max boost frequency
+ * and call this function to set a priority proportional
+ * to the max boost frequency. CPUs with a higher boost
+ * frequency will receive a higher priority.
+ *
+ * No need to rebuild sched domain after updating
+ * the CPU priorities. The sched domains have no
+ * dependency on CPU priorities.
+ */
+void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+ int cpu, i = 1;
+
+ for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
+ int smt_prio;
+
+ /*
+ * Ensure that the siblings are moved to the end
+ * of the priority chain and only used when
+ * all other high priority cpus are out of capacity.
+ */
+ smt_prio = prio * smp_num_siblings / i;
+ per_cpu(sched_core_priority, cpu) = smt_prio;
+ i++;
+ }
+}
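
sched_set_itmt_core_prio() spreads a core's priority across its SMT siblings with smt_prio = prio * smp_num_siblings / i, so the first thread of every core keeps the full scaled priority while later siblings drop back toward the base value. The tiny standalone example below shows the resulting ordering for two 2-thread cores with made-up priorities: both primary threads (84, 78) rank above both secondary threads (42, 39), which is exactly the "siblings at the end of the priority chain" behaviour described in the comment above.

#include <stdio.h>

/* Same arithmetic as sched_set_itmt_core_prio(): sibling i (1-based) of a
 * core with priority "prio" gets prio * num_siblings / i, so the first
 * thread of a favored core outranks everything, while its second thread
 * drops back to the core's base priority. */
static void print_core(const char *name, int prio, int num_siblings)
{
	for (int i = 1; i <= num_siblings; i++)
		printf("%s, SMT thread %d: priority %d\n",
		       name, i, prio * num_siblings / i);
}

int main(void)
{
	/* Made-up priorities: the "favored" core can turbo higher. */
	print_core("core 0 (favored)", 42, 2);   /* 84, 42 */
	print_core("core 1",           39, 2);   /* 78, 39 */
	return 0;
}
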
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index edbbfc854e39..36bc66416021 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -42,7 +42,6 @@
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
-#include <asm/idle.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
@@ -267,13 +266,11 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
prev_state = exception_enter();
- exit_idle();
kvm_async_pf_task_wait((u32)read_cr2());
exception_exit(prev_state);
break;
case KVM_PV_REASON_PAGE_READY:
rcu_irq_enter();
- exit_idle();
kvm_async_pf_task_wake((u32)read_cr2());
rcu_irq_exit();
break;
@@ -308,7 +305,7 @@ static void kvm_register_steal_time(void)
static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
-static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
+static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
/**
* This relies on __test_and_clear_bit to modify the memory
@@ -319,7 +316,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
*/
if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
return;
- apic_write(APIC_EOI, APIC_EOI_ACK);
+ apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
}
static void kvm_guest_cpu_init(void)
@@ -592,6 +589,14 @@ out:
local_irq_restore(flags);
}
+__visible bool __kvm_vcpu_is_preempted(int cpu)
+{
+ struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
+
+ return !!src->preempted;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -608,6 +613,11 @@ void __init kvm_spinlock_init(void)
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = kvm_wait;
pv_lock_ops.kick = kvm_kick_cpu;
+
+ if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ pv_lock_ops.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
+ }
}
static __init int kvm_spinlock_init_jump(void)
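
The new pv_lock_ops.vcpu_is_preempted hook lets lock-spinning code ask whether the CPU that currently holds a lock is actually running, or whether its vCPU has been scheduled out by the host; KVM reports this through the steal_time.preempted flag read by __kvm_vcpu_is_preempted() above. The sketch below is an illustrative spin-wait loop, not the kernel's real osq/mutex code: demo_lock and demo_spin_on_owner() are invented, while need_resched(), cpu_relax() and the vcpu_is_preempted() wrapper around the new hook are assumed to be the existing kernel helpers.

#include <linux/atomic.h>
#include <linux/sched.h>
#include <asm/processor.h>

/* Made-up lock type for the sketch: owner_cpu is -1 when the lock is free. */
struct demo_lock {
	atomic_t owner_cpu;
};

static bool demo_spin_on_owner(struct demo_lock *lock)
{
	int owner;

	while ((owner = atomic_read(&lock->owner_cpu)) >= 0) {
		/*
		 * If the holder's vCPU was preempted by the host it cannot
		 * release the lock any time soon; spinning just burns CPU,
		 * so fall back to the slow path instead.
		 */
		if (need_resched() || vcpu_is_preempted(owner))
			return false;

		cpu_relax();
	}
	return true;
}
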
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 6707039b9032..f09df2ff1bcc 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -34,10 +34,10 @@ static void flush_ldt(void *current_mm)
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
-static struct ldt_struct *alloc_ldt_struct(int size)
+static struct ldt_struct *alloc_ldt_struct(unsigned int size)
{
struct ldt_struct *new_ldt;
- int alloc_size;
+ unsigned int alloc_size;
if (size > LDT_ENTRIES)
return NULL;
@@ -207,11 +207,11 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount)
static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
+ struct ldt_struct *new_ldt, *old_ldt;
+ unsigned int oldsize, newsize;
+ struct user_desc ldt_info;
struct desc_struct ldt;
int error;
- struct user_desc ldt_info;
- int oldsize, newsize;
- struct ldt_struct *new_ldt, *old_ldt;
error = -EINVAL;
if (bytecount != sizeof(ldt_info))
@@ -249,7 +249,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
old_ldt = mm->context.ldt;
oldsize = old_ldt ? old_ldt->size : 0;
- newsize = max((int)(ldt_info.entry_number + 1), oldsize);
+ newsize = max(ldt_info.entry_number + 1, oldsize);
error = -ENOMEM;
new_ldt = alloc_ldt_struct(newsize);
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
index efe73aacf966..7b0d3da52fb4 100644
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/mcount_64.S
@@ -18,8 +18,10 @@
#ifdef CC_USING_FENTRY
# define function_hook __fentry__
+EXPORT_SYMBOL(__fentry__)
#else
# define function_hook mcount
+EXPORT_SYMBOL(mcount)
#endif
/* All cases save the original rbp (8 bytes) */
@@ -295,7 +297,6 @@ trace:
jmp fgraph_trace
END(function_hook)
#endif /* CONFIG_DYNAMIC_FTRACE */
-EXPORT_SYMBOL(function_hook)
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7f3550acde1b..f5e3ff835cc8 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -44,6 +44,7 @@
#include <asm/msr.h>
static struct class *msr_class;
+static enum cpuhp_state cpuhp_msr_state;
static ssize_t msr_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -180,7 +181,7 @@ static const struct file_operations msr_fops = {
.compat_ioctl = msr_ioctl,
};
-static int msr_device_create(int cpu)
+static int msr_device_create(unsigned int cpu)
{
struct device *dev;
@@ -189,34 +190,12 @@ static int msr_device_create(int cpu)
return PTR_ERR_OR_ZERO(dev);
}
-static void msr_device_destroy(int cpu)
+static int msr_device_destroy(unsigned int cpu)
{
device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
+ return 0;
}
-static int msr_class_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- int err = 0;
-
- switch (action) {
- case CPU_UP_PREPARE:
- err = msr_device_create(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- msr_device_destroy(cpu);
- break;
- }
- return notifier_from_errno(err);
-}
-
-static struct notifier_block __refdata msr_class_cpu_notifier = {
- .notifier_call = msr_class_cpu_callback,
-};
-
static char *msr_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
@@ -224,13 +203,11 @@ static char *msr_devnode(struct device *dev, umode_t *mode)
static int __init msr_init(void)
{
- int i, err = 0;
- i = 0;
+ int err;
if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
pr_err("unable to get major %d for msr\n", MSR_MAJOR);
- err = -EBUSY;
- goto out;
+ return -EBUSY;
}
msr_class = class_create(THIS_MODULE, "msr");
if (IS_ERR(msr_class)) {
@@ -239,44 +216,28 @@ static int __init msr_init(void)
}
msr_class->devnode = msr_devnode;
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- err = msr_device_create(i);
- if (err != 0)
- goto out_class;
- }
- __register_hotcpu_notifier(&msr_class_cpu_notifier);
- cpu_notifier_register_done();
-
- err = 0;
- goto out;
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online",
+ msr_device_create, msr_device_destroy);
+ if (err < 0)
+ goto out_class;
+ cpuhp_msr_state = err;
+ return 0;
out_class:
- i = 0;
- for_each_online_cpu(i)
- msr_device_destroy(i);
- cpu_notifier_register_done();
+ cpuhp_remove_state(cpuhp_msr_state);
class_destroy(msr_class);
out_chrdev:
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
-out:
return err;
}
+module_init(msr_init);
static void __exit msr_exit(void)
{
- int cpu = 0;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- msr_device_destroy(cpu);
+ cpuhp_remove_state(cpuhp_msr_state);
class_destroy(msr_class);
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
- __unregister_hotcpu_notifier(&msr_class_cpu_notifier);
- cpu_notifier_register_done();
}
-
-module_init(msr_init);
module_exit(msr_exit)
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 2c55a003b793..6d4bf812af45 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -12,7 +12,6 @@ __visible void __native_queued_spin_unlock(struct qspinlock *lock)
{
native_queued_spin_unlock(lock);
}
-
PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
bool pv_is_native_spin_unlock(void)
@@ -21,12 +20,25 @@ bool pv_is_native_spin_unlock(void)
__raw_callee_save___native_queued_spin_unlock;
}
+__visible bool __native_vcpu_is_preempted(int cpu)
+{
+ return false;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
+
+bool pv_is_native_vcpu_is_preempted(void)
+{
+ return pv_lock_ops.vcpu_is_preempted.func ==
+ __raw_callee_save___native_vcpu_is_preempted;
+}
+
struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP
.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
.wait = paravirt_nop,
.kick = paravirt_nop,
+ .vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted),
#endif /* SMP */
};
EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bbf3d5933eaa..a1bfba0f7234 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -328,7 +328,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.cpuid = native_cpuid,
.get_debugreg = native_get_debugreg,
.set_debugreg = native_set_debugreg,
- .clts = native_clts,
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
.read_cr4 = native_read_cr4,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 920c6ae08592..d33ef165b1f8 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -8,10 +8,10 @@ DEF_NATIVE(pv_cpu_ops, iret, "iret");
DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
#endif
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
@@ -27,6 +27,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
}
extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
@@ -48,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_cpu_ops, clts);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
@@ -56,9 +56,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
end = end_pv_lock_ops_queued_spin_unlock;
goto patch_site;
}
+ goto patch_default;
+
+ case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+ if (pv_is_native_vcpu_is_preempted()) {
+ start = start_pv_lock_ops_vcpu_is_preempted;
+ end = end_pv_lock_ops_vcpu_is_preempted;
+ goto patch_site;
+ }
+ goto patch_default;
#endif
default:
+patch_default:
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index bb3840cedb4f..f4fcf26c9fce 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -21,6 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax");
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax");
#endif
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
@@ -36,6 +36,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
}
extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
@@ -58,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
@@ -68,9 +68,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
end = end_pv_lock_ops_queued_spin_unlock;
goto patch_site;
}
+ goto patch_default;
+
+ case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+ if (pv_is_native_vcpu_is_preempted()) {
+ start = start_pv_lock_ops_vcpu_is_preempted;
+ end = end_pv_lock_ops_vcpu_is_preempted;
+ goto patch_site;
+ }
+ goto patch_default;
#endif
default:
+patch_default:
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0888a879120f..43c36d8a6ae2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -23,7 +23,6 @@
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
-#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
@@ -65,23 +64,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);
-#ifdef CONFIG_X86_64
-static DEFINE_PER_CPU(unsigned char, is_idle);
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
- atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
- atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
-#endif
-
/*
* this gets called so that we can store lazy state into memory and copy the
* current task into the new thread.
@@ -251,39 +233,9 @@ static inline void play_dead(void)
}
#endif
-#ifdef CONFIG_X86_64
-void enter_idle(void)
-{
- this_cpu_write(is_idle, 1);
- atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
-}
-
-static void __exit_idle(void)
-{
- if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
- return;
- atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
-}
-
-/* Called from interrupts to signify idle end */
-void exit_idle(void)
-{
- /* idle loop has pid 0 */
- if (current->pid)
- return;
- __exit_idle();
-}
-#endif
-
void arch_cpu_idle_enter(void)
{
local_touch_nmi();
- enter_idle();
-}
-
-void arch_cpu_idle_exit(void)
-{
- __exit_idle();
}
void arch_cpu_idle_dead(void)
@@ -336,59 +288,33 @@ void stop_this_cpu(void *dummy)
halt();
}
-bool amd_e400_c1e_detected;
-EXPORT_SYMBOL(amd_e400_c1e_detected);
-
-static cpumask_var_t amd_e400_c1e_mask;
-
-void amd_e400_remove_cpu(int cpu)
-{
- if (amd_e400_c1e_mask != NULL)
- cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
-}
-
/*
- * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
- * pending message MSR. If we detect C1E, then we handle it the same
- * way as C3 power states (local apic timer and TSC stop)
+ * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
+ * states (local apic timer and TSC stop).
*/
static void amd_e400_idle(void)
{
- if (!amd_e400_c1e_detected) {
- u32 lo, hi;
-
- rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
-
- if (lo & K8_INTP_C1E_ACTIVE_MASK) {
- amd_e400_c1e_detected = true;
- if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
- mark_tsc_unstable("TSC halt in AMD C1E");
- pr_info("System has AMD C1E enabled\n");
- }
+ /*
+	 * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
+	 * only gets set after the static_cpu_has() call sites have already been
+	 * patched via alternatives.
+ */
+ if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+ default_idle();
+ return;
}
- if (amd_e400_c1e_detected) {
- int cpu = smp_processor_id();
+ tick_broadcast_enter();
- if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
- cpumask_set_cpu(cpu, amd_e400_c1e_mask);
- /* Force broadcast so ACPI can not interfere. */
- tick_broadcast_force();
- pr_info("Switch to broadcast mode on CPU%d\n", cpu);
- }
- tick_broadcast_enter();
-
- default_idle();
+ default_idle();
- /*
- * The switch back from broadcast mode needs to be
- * called with interrupts disabled.
- */
- local_irq_disable();
- tick_broadcast_exit();
- local_irq_enable();
- } else
- default_idle();
+ /*
+ * The switch back from broadcast mode needs to be called with
+ * interrupts disabled.
+ */
+ local_irq_disable();
+ tick_broadcast_exit();
+ local_irq_enable();
}
/*
@@ -448,8 +374,7 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
if (x86_idle || boot_option_idle_override == IDLE_POLL)
return;
- if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
- /* E400: APIC timer interrupt does not wake up CPU from C1e */
+ if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
pr_info("using AMD E400 aware idle routine\n");
x86_idle = amd_e400_idle;
} else if (prefer_mwait_c1_over_halt(c)) {
@@ -459,11 +384,37 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
x86_idle = default_idle;
}
-void __init init_amd_e400_c1e_mask(void)
+void amd_e400_c1e_apic_setup(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+ pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
+ local_irq_disable();
+ tick_broadcast_force();
+ local_irq_enable();
+ }
+}
+
+void __init arch_post_acpi_subsys_init(void)
{
- /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
- if (x86_idle == amd_e400_idle)
- zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
+ u32 lo, hi;
+
+ if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
+ return;
+
+ /*
+	 * AMD E400 detection needs to happen after ACPI has been enabled. If
+	 * the machine is affected, the K8_INTP_C1E_ACTIVE_MASK bits are set
+	 * in MSR_K8_INT_PENDING_MSG.
+ */
+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+ if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
+ return;
+
+ boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);
+
+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+ mark_tsc_unstable("TSC halt in AMD C1E");
+ pr_info("System has AMD C1E enabled\n");
}
static int __init idle_setup(char *str)
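
The process.c changes split the E400 handling into one-time detection in arch_post_acpi_subsys_init() and a much simpler idle routine. Here is a stand-alone sketch of that split, with a faked MSR value and simplified names; the mask mirrors K8_INTP_C1E_ACTIVE_MASK (bits 27 and 28) and is written out only for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define C1E_ACTIVE_MASK ((1u << 27) | (1u << 28))

static bool apic_c1e_bug;			/* analogue of X86_BUG_AMD_APIC_C1E */

/* analogue of arch_post_acpi_subsys_init(): detect once, after ACPI is up */
static void post_acpi_detect(uint32_t msr_lo)
{
	if (msr_lo & C1E_ACTIVE_MASK)
		apic_c1e_bug = true;
}

/* analogue of amd_e400_idle(): only pick between the two idle paths */
static void e400_idle(void)
{
	if (!apic_c1e_bug) {
		puts("default_idle()");
		return;
	}
	puts("tick_broadcast_enter(); default_idle(); tick_broadcast_exit();");
}

int main(void)
{
	post_acpi_detect(1u << 27);	/* pretend the firmware enabled C1E */
	e400_idle();
	return 0;
}
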
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bd7be8efdc4c..d0d744108594 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -49,7 +49,6 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
-#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
@@ -72,10 +71,9 @@ void __show_regs(struct pt_regs *regs, int all)
savesegment(gs, gs);
}
- printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
- (u16)regs->cs, regs->ip, regs->flags,
- smp_processor_id());
- print_symbol("EIP is at %s\n", regs->ip);
+ printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
+ printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
+ smp_processor_id());
printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
@@ -232,11 +230,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
- fpu_switch_t fpu_switch;
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
- fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
+ switch_fpu_prepare(prev_fpu, cpu);
/*
* Save away %gs. No need to save %fs, as it was saved on the
@@ -295,7 +292,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (prev->gs | next->gs)
lazy_load_gs(next->gs);
- switch_fpu_finish(next_fpu, fpu_switch);
+ switch_fpu_finish(next_fpu, cpu);
this_cpu_write(current_task, next_p);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3760b3c1ca0..a76b65e3e615 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -44,7 +44,6 @@
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
-#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
@@ -61,10 +60,15 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
- printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
- printk_address(regs->ip);
- printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
- regs->sp, regs->flags);
+ printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
+ (void *)regs->ip);
+ printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
+ regs->sp, regs->flags);
+ if (regs->orig_ax != -1)
+ pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
+ else
+ pr_cont("\n");
+
printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->ax, regs->bx, regs->cx);
printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
@@ -265,9 +269,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
unsigned prev_fsindex, prev_gsindex;
- fpu_switch_t fpu_switch;
- fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
+ switch_fpu_prepare(prev_fpu, cpu);
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
@@ -417,7 +420,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
prev->gsbase = 0;
prev->gsindex = prev_gsindex;
- switch_fpu_finish(next_fpu, fpu_switch);
+ switch_fpu_finish(next_fpu, cpu);
/*
* Switch the PDA and FPU contexts.
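
Both __switch_to() variants now call switch_fpu_prepare()/switch_fpu_finish() without the old fpu_switch_t token; whether a restore is needed is derived from per-CPU ownership state such as fpu_fpregs_owner_ctx (see also the smpboot.c hunk below, which clears it for a freshly booted CPU). The following is a deliberately simplified, single-CPU model of that idea, not the kernel's actual FPU switching logic.

#include <stdio.h>

struct fpu { const char *task; };

static struct fpu *fpregs_owner;	/* analogue of fpu_fpregs_owner_ctx */

static void switch_fpu_prepare(struct fpu *prev)
{
	if (prev)
		printf("%s: live registers saved to memory\n", prev->task);
}

static void switch_fpu_finish(struct fpu *next)
{
	if (!next) {			/* kernel thread: owns no FPU state */
		puts("kthread: nothing to load");
		return;
	}
	if (fpregs_owner == next) {	/* state still loaded from last time */
		printf("%s: restore skipped\n", next->task);
		return;
	}
	fpregs_owner = next;
	printf("%s: registers restored\n", next->task);
}

int main(void)
{
	struct fpu a = { "A" }, b = { "B" };

	fpregs_owner = &a;		/* A is running; its state is loaded */

	switch_fpu_prepare(&a); switch_fpu_finish(NULL);	/* A -> kthread */
	switch_fpu_prepare(NULL); switch_fpu_finish(&a);	/* kthread -> A: skipped */
	switch_fpu_prepare(&a); switch_fpu_finish(&b);		/* A -> B: reloaded */
	return 0;
}
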
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 51402a7e4ca6..0bee04d41bed 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -625,8 +625,6 @@ static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev)
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
amd_disable_seq_and_redirect_scrub);
-#endif
-
#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
#include <linux/jump_label.h>
#include <asm/string_64.h>
@@ -657,3 +655,4 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap);
#endif
+#endif
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bbfbca5fea0c..9c337b0e8ba7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1221,11 +1221,16 @@ void __init setup_arch(char **cmdline_p)
*/
get_smp_config();
+ /*
+	 * Systems w/o ACPI and mptables might not have the local APIC mapped
+	 * yet, but prefill_possible_map() might need to access it.
+ */
+ init_apic_mappings();
+
prefill_possible_map();
init_cpu_to_node();
- init_apic_mappings();
io_apic_init_mappings();
kvm_guest_init();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2bbd27f89802..9820d6d977c6 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,7 +1,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/percpu.h>
@@ -12,6 +12,7 @@
#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
+#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index c00cb64bc0a1..68f8cc222f25 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -261,10 +261,8 @@ static inline void __smp_reschedule_interrupt(void)
__visible void smp_reschedule_interrupt(struct pt_regs *regs)
{
- irq_enter();
ack_APIC_irq();
__smp_reschedule_interrupt();
- irq_exit();
/*
* KVM uses this interrupt to force a cpu out of guest mode
*/
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 42f5eb7b4f6c..0c37d4fd01b2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -58,7 +58,6 @@
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
-#include <asm/idle.h>
#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
@@ -109,6 +108,17 @@ static bool logical_packages_frozen __read_mostly;
/* Maximum number of SMT threads on any online core */
int __max_smt_threads __read_mostly;
+/* Flag to indicate whether a complete sched domain rebuild is required */
+bool x86_topology_update;
+
+int arch_update_cpu_topology(void)
+{
+ int retval = x86_topology_update;
+
+ x86_topology_update = false;
+ return retval;
+}
+
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
unsigned long flags;
@@ -471,22 +481,42 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+static inline int x86_sched_itmt_flags(void)
+{
+ return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
+}
+
+#ifdef CONFIG_SCHED_MC
+static int x86_core_flags(void)
+{
+ return cpu_core_flags() | x86_sched_itmt_flags();
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+static int x86_smt_flags(void)
+{
+ return cpu_smt_flags() | x86_sched_itmt_flags();
+}
+#endif
+#endif
+
static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
{ NULL, },
};
static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
@@ -821,14 +851,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-void smp_announce(void)
-{
- int num_nodes = num_online_nodes();
-
- printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n",
- num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus());
-}
-
/* reduce the number of lines printed when booting a large cpu count system */
static void announce_cpu(int cpu, int apicid)
{
@@ -964,9 +986,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
int cpu0_nmi_registered = 0;
unsigned long timeout;
- idle->thread.sp = (unsigned long) (((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(idle))) - 1);
-
+ idle->thread.sp = (unsigned long)task_pt_regs(idle);
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp;
@@ -1111,7 +1131,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
return err;
/* the FPU context is blank, nobody can own it */
- __cpu_disable_lazy_restore(cpu);
+ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
common_cpu_up(cpu, tidle);
@@ -1331,7 +1351,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
default_setup_apic_routing();
cpu0_logical_apicid = apic_bsp_setup(false);
- pr_info("CPU%d: ", 0);
+ pr_info("CPU0: ");
print_cpu_info(&cpu_data(0));
if (is_uv_system())
@@ -1575,7 +1595,6 @@ void play_dead_common(void)
{
idle_task_exit();
reset_lazy_tlbstate();
- amd_e400_remove_cpu(raw_smp_processor_id());
/* Ack it */
(void)cpu_report_death();
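
The smpboot.c changes let the SMT and MC topology levels advertise SD_ASYM_PACKING while ITMT is enabled, and let the scheduler poll for a pending topology rebuild via arch_update_cpu_topology(). A small stand-alone sketch of both pieces follows; the flag values and names are invented placeholders.

#include <stdbool.h>
#include <stdio.h>

#define SD_SHARE_CPUCAPACITY	0x1	/* placeholder flag values, not the kernel's */
#define SD_ASYM_PACKING		0x2

static int sched_itmt_enabled;		/* analogue of sysctl_sched_itmt_enabled */
static bool topology_update;		/* analogue of x86_topology_update */

static int itmt_flags(void)
{
	return sched_itmt_enabled ? SD_ASYM_PACKING : 0;
}

static int smt_flags(void)
{
	return SD_SHARE_CPUCAPACITY | itmt_flags();
}

/* analogue of arch_update_cpu_topology(): report a pending rebuild exactly once */
static int update_cpu_topology(void)
{
	int pending = topology_update;

	topology_update = false;
	return pending;
}

int main(void)
{
	sched_itmt_enabled = 1;
	topology_update = true;		/* e.g. set when ITMT core priorities change */

	printf("SMT level flags: %#x\n", smt_flags());

	int first = update_cpu_topology();
	int second = update_cpu_topology();

	printf("rebuild needed? %d; on the next check: %d\n", first, second);
	return 0;
}
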
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
index 764a29f84de7..85195d447a92 100644
--- a/arch/x86/kernel/sysfb_simplefb.c
+++ b/arch/x86/kernel/sysfb_simplefb.c
@@ -66,13 +66,36 @@ __init int create_simplefb(const struct screen_info *si,
{
struct platform_device *pd;
struct resource res;
- unsigned long len;
+ u64 base, size;
+ u32 length;
- /* don't use lfb_size as it may contain the whole VMEM instead of only
- * the part that is occupied by the framebuffer */
- len = mode->height * mode->stride;
- len = PAGE_ALIGN(len);
- if (len > (u64)si->lfb_size << 16) {
+ /*
+ * If the 64BIT_BASE capability is set, ext_lfb_base will contain the
+ * upper half of the base address. Assemble the address, then make sure
+ * it is valid and we can actually access it.
+ */
+ base = si->lfb_base;
+ if (si->capabilities & VIDEO_CAPABILITY_64BIT_BASE)
+ base |= (u64)si->ext_lfb_base << 32;
+ if (!base || (u64)(resource_size_t)base != base) {
+ printk(KERN_DEBUG "sysfb: inaccessible VRAM base\n");
+ return -EINVAL;
+ }
+
+ /*
+	 * Don't use lfb_size as the IORESOURCE size, since it may contain the
+ * entire VMEM, and thus require huge mappings. Use just the part we
+ * need, that is, the part where the framebuffer is located. But verify
+ * that it does not exceed the advertised VMEM.
+	 * Note that in the case of VBE, the lfb_size is shifted by 16 bits for
+ * historical reasons.
+ */
+ size = si->lfb_size;
+ if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
+ size <<= 16;
+ length = mode->height * mode->stride;
+ length = PAGE_ALIGN(length);
+ if (length > size) {
printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
return -EINVAL;
}
@@ -81,8 +104,8 @@ __init int create_simplefb(const struct screen_info *si,
memset(&res, 0, sizeof(res));
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
res.name = simplefb_resname;
- res.start = si->lfb_base;
- res.end = si->lfb_base + len - 1;
+ res.start = base;
+ res.end = res.start + length - 1;
if (res.end <= res.start)
return -EINVAL;
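
For the sysfb change, the logic is: assemble the 64-bit base from lfb_base and ext_lfb_base, reject a base that a narrower resource_size_t would truncate, and bound the mapping by the page-aligned height * stride span rather than the full advertised VMEM (with the VBE size left-shifted by 16). A user-space sketch with sample numbers; the field names and the typedef are simplified stand-ins for screen_info.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096u
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(uint64_t)(PAGE_SIZE - 1))

/* 64 bits wide here; a 32-bit resource_size_t is what the base check guards against */
typedef uint64_t resource_size_sim_t;

int main(void)
{
	uint32_t lfb_base = 0xe0000000, ext_lfb_base = 0x1;	/* sample values */
	int has_64bit_base = 1, is_vbe = 1;
	uint64_t lfb_size = 0x100;		/* VBE reports the size in 64K units */
	uint32_t height = 1080, stride = 1920 * 4;

	uint64_t base = lfb_base;
	if (has_64bit_base)
		base |= (uint64_t)ext_lfb_base << 32;
	if (!base || (uint64_t)(resource_size_sim_t)base != base) {
		puts("inaccessible VRAM base");
		return 1;
	}

	uint64_t size = is_vbe ? lfb_size << 16 : lfb_size;
	uint64_t length = PAGE_ALIGN((uint64_t)height * stride);

	if (length > size) {
		puts("VRAM smaller than advertised");
		return 1;
	}

	printf("simplefb resource: %#" PRIx64 " + %#" PRIx64 " bytes\n", base, length);
	return 0;
}
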
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bd4e3d4d3625..bf0c6d049080 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -853,6 +853,8 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
+ unsigned long cr0;
+
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
#ifdef CONFIG_MATH_EMULATION
@@ -866,10 +868,20 @@ do_device_not_available(struct pt_regs *regs, long error_code)
return;
}
#endif
- fpu__restore(&current->thread.fpu); /* interrupts still off */
-#ifdef CONFIG_X86_32
- cond_local_irq_enable(regs);
-#endif
+
+ /* This should not happen. */
+ cr0 = read_cr0();
+ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
+ /* Try to fix it up and carry on. */
+ write_cr0(cr0 & ~X86_CR0_TS);
+ } else {
+ /*
+ * Something terrible happened, and we're better off trying
+ * to kill the task than getting stuck in a never-ending
+ * loop of #NM faults.
+ */
+ die("unexpected #NM exception", regs, error_code);
+ }
}
NOKPROBE_SYMBOL(do_device_not_available);
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index a2456d4d286a..ea7b7f9a3b9e 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -14,13 +14,55 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
if (unwind_done(state))
return 0;
+ if (state->regs && user_mode(state->regs))
+ return 0;
+
addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p,
addr_p);
- return __kernel_text_address(addr) ? addr : 0;
+ if (!__kernel_text_address(addr)) {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: unrecognized kernel stack return address %p at %p in %s:%d\n",
+ (void *)addr, addr_p, state->task->comm,
+ state->task->pid);
+ return 0;
+ }
+
+ return addr;
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);
+static size_t regs_size(struct pt_regs *regs)
+{
+ /* x86_32 regs from kernel mode are two words shorter: */
+ if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs))
+ return sizeof(*regs) - 2*sizeof(long);
+
+ return sizeof(*regs);
+}
+
+static bool is_last_task_frame(struct unwind_state *state)
+{
+ unsigned long bp = (unsigned long)state->bp;
+ unsigned long regs = (unsigned long)task_pt_regs(state->task);
+
+ return bp == regs - FRAME_HEADER_SIZE;
+}
+
+/*
+ * This determines if the frame pointer actually contains an encoded pointer to
+ * pt_regs on the stack. See ENCODE_FRAME_POINTER.
+ */
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+ unsigned long regs = (unsigned long)bp;
+
+ if (!(regs & 0x1))
+ return NULL;
+
+ return (struct pt_regs *)(regs & ~0x1);
+}
+
static bool update_stack_state(struct unwind_state *state, void *addr,
size_t len)
{
@@ -43,26 +85,117 @@ static bool update_stack_state(struct unwind_state *state, void *addr,
bool unwind_next_frame(struct unwind_state *state)
{
- unsigned long *next_bp;
+ struct pt_regs *regs;
+ unsigned long *next_bp, *next_frame;
+ size_t next_len;
+ enum stack_type prev_type = state->stack_info.type;
if (unwind_done(state))
return false;
- next_bp = (unsigned long *)*state->bp;
+ /* have we reached the end? */
+ if (state->regs && user_mode(state->regs))
+ goto the_end;
+
+ if (is_last_task_frame(state)) {
+ regs = task_pt_regs(state->task);
+
+ /*
+ * kthreads (other than the boot CPU's idle thread) have some
+ * partial regs at the end of their stack which were placed
+ * there by copy_thread_tls(). But the regs don't have any
+ * useful information, so we can skip them.
+ *
+ * This user_mode() check is slightly broader than a PF_KTHREAD
+ * check because it also catches the awkward situation where a
+ * newly forked kthread transitions into a user task by calling
+ * do_execve(), which eventually clears PF_KTHREAD.
+ */
+ if (!user_mode(regs))
+ goto the_end;
+
+ /*
+ * We're almost at the end, but not quite: there's still the
+ * syscall regs frame. Entry code doesn't encode the regs
+ * pointer for syscalls, so we have to set it manually.
+ */
+ state->regs = regs;
+ state->bp = NULL;
+ return true;
+ }
+
+ /* get the next frame pointer */
+ if (state->regs)
+ next_bp = (unsigned long *)state->regs->bp;
+ else
+ next_bp = (unsigned long *)*state->bp;
+
+ /* is the next frame pointer an encoded pointer to pt_regs? */
+ regs = decode_frame_pointer(next_bp);
+ if (regs) {
+ next_frame = (unsigned long *)regs;
+ next_len = sizeof(*regs);
+ } else {
+ next_frame = next_bp;
+ next_len = FRAME_HEADER_SIZE;
+ }
/* make sure the next frame's data is accessible */
- if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE))
- return false;
+ if (!update_stack_state(state, next_frame, next_len)) {
+ /*
+ * Don't warn on bad regs->bp. An interrupt in entry code
+ * might cause a false positive warning.
+ */
+ if (state->regs)
+ goto the_end;
+
+ goto bad_address;
+ }
+
+ /* Make sure it only unwinds up and doesn't overlap the last frame: */
+ if (state->stack_info.type == prev_type) {
+ if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs))
+ goto bad_address;
+
+ if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE)
+ goto bad_address;
+ }
/* move to the next frame */
- state->bp = next_bp;
+ if (regs) {
+ state->regs = regs;
+ state->bp = NULL;
+ } else {
+ state->bp = next_bp;
+ state->regs = NULL;
+ }
+
return true;
+
+bad_address:
+ if (state->regs) {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
+ state->regs, state->task->comm,
+ state->task->pid, next_frame);
+ } else {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n",
+ state->bp, state->task->comm,
+ state->task->pid, next_frame);
+ }
+the_end:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
}
EXPORT_SYMBOL_GPL(unwind_next_frame);
void __unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long *first_frame)
{
+ unsigned long *bp, *frame;
+ size_t len;
+
memset(state, 0, sizeof(*state));
state->task = task;
@@ -73,12 +206,22 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
}
/* set up the starting stack frame */
- state->bp = get_frame_pointer(task, regs);
+ bp = get_frame_pointer(task, regs);
+ regs = decode_frame_pointer(bp);
+ if (regs) {
+ state->regs = regs;
+ frame = (unsigned long *)regs;
+ len = sizeof(*regs);
+ } else {
+ state->bp = bp;
+ frame = bp;
+ len = FRAME_HEADER_SIZE;
+ }
/* initialize stack info and make sure the frame data is accessible */
- get_stack_info(state->bp, state->task, &state->stack_info,
+ get_stack_info(frame, state->task, &state->stack_info,
&state->stack_mask);
- update_stack_state(state, state->bp, FRAME_HEADER_SIZE);
+ update_stack_state(state, frame, len);
/*
* The caller can provide the address of the first frame directly
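
The frame unwinder now understands frame pointers that encode a pt_regs pointer in their low bit (the ENCODE_FRAME_POINTER convention referenced in the comment above decode_frame_pointer()). Here is a stand-alone illustration of that encoding and of the decode check; the struct is a fake, not the kernel's pt_regs layout.

#include <stdint.h>
#include <stdio.h>

struct fake_pt_regs { unsigned long ip, sp, bp; };

/* entry code would store this value in place of a normal saved frame pointer */
static unsigned long *encode_frame_pointer(struct fake_pt_regs *regs)
{
	return (unsigned long *)((uintptr_t)regs | 0x1);
}

/* returns the regs pointer if the low bit was set, NULL for an ordinary frame */
static struct fake_pt_regs *decode_frame_pointer(unsigned long *bp)
{
	uintptr_t val = (uintptr_t)bp;

	if (!(val & 0x1))
		return NULL;
	return (struct fake_pt_regs *)(val & ~(uintptr_t)0x1);
}

int main(void)
{
	struct fake_pt_regs regs = { 0x1234, 0x5678, 0x9abc };
	unsigned long plain_bp = 0;	/* stands in for a normal saved frame pointer slot */

	printf("encoded value decodes back to regs? %s\n",
	       decode_frame_pointer(encode_frame_pointer(&regs)) == &regs ? "yes" : "no");
	printf("plain frame pointer decodes to regs? %s\n",
	       decode_frame_pointer(&plain_bp) ? "yes" : "no");
	return 0;
}
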
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
index 9298993dc8b7..22881ddcbb9f 100644
--- a/arch/x86/kernel/unwind_guess.c
+++ b/arch/x86/kernel/unwind_guess.c
@@ -7,11 +7,15 @@
unsigned long unwind_get_return_address(struct unwind_state *state)
{
+ unsigned long addr;
+
if (unwind_done(state))
return 0;
+ addr = READ_ONCE_NOCHECK(*state->sp);
+
return ftrace_graph_ret_addr(state->task, &state->graph_idx,
- *state->sp, state->sp);
+ addr, state->sp);
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);
@@ -23,9 +27,12 @@ bool unwind_next_frame(struct unwind_state *state)
return false;
do {
- for (state->sp++; state->sp < info->end; state->sp++)
- if (__kernel_text_address(*state->sp))
+ for (state->sp++; state->sp < info->end; state->sp++) {
+ unsigned long addr = READ_ONCE_NOCHECK(*state->sp);
+
+ if (__kernel_text_address(addr))
return true;
+ }
state->sp = info->next_sp;
@@ -47,7 +54,14 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
get_stack_info(first_frame, state->task, &state->stack_info,
&state->stack_mask);
- if (!__kernel_text_address(*first_frame))
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+ if (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ !__kernel_text_address(*first_frame)))
unwind_next_frame(state);
}
EXPORT_SYMBOL_GPL(__unwind_start);
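
The guess unwinder still just scans stack words for anything that looks like a kernel text address; the change is only to read each slot once via READ_ONCE_NOCHECK and reuse that value. A toy model of the scan follows; the text range and stack contents are made up, and a plain read stands in for READ_ONCE_NOCHECK, which has no user-space equivalent.

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TEXT_START	UINT64_C(0xffffffff81000000)	/* made-up kernel text range */
#define TEXT_END	UINT64_C(0xffffffff82000000)

static bool kernel_text_address(uint64_t addr)
{
	return addr >= TEXT_START && addr < TEXT_END;
}

int main(void)
{
	uint64_t stack[] = {			/* pretend stack contents */
		UINT64_C(0xdead4ead),
		UINT64_C(0xffffffff81012345),
		0,
		UINT64_C(0xffffffff8189abcd),
	};

	for (unsigned int i = 0; i < sizeof(stack) / sizeof(stack[0]); i++)
		if (kernel_text_address(stack[i]))
			printf("  [<%016" PRIx64 ">] ? possible return address\n", stack[i]);
	return 0;
}
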
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index dbf67f64d5ec..e79f15f108a8 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -91,10 +91,10 @@ SECTIONS
/* Text and read-only data */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
_text = .;
+ _stext = .;
/* bootstrapping code */
HEAD_TEXT
. = ALIGN(8);
- _stext = .;
TEXT_TEXT
SCHED_TEXT
CPUIDLE_TEXT