125 files changed, 5516 insertions, 3358 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 79076d75bdbf..84c00592d359 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -75,7 +75,7 @@ apm-y				:= apm_32.o
 obj-$(CONFIG_APM)		+= apm.o
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_SMP)		+= smpboot.o
-obj-$(CONFIG_SMP)		+= tsc_sync.o
+obj-$(CONFIG_X86_TSC)		+= tsc_sync.o
 obj-$(CONFIG_SMP)		+= setup_percpu.o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-y				+= apic/
@@ -100,8 +100,6 @@ obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_APB_TIMER)		+= apb_timer.o
 
 obj-$(CONFIG_AMD_NB)		+= amd_nb.o
-obj-$(CONFIG_DEBUG_RODATA_TEST)	+= test_rodata.o
-obj-$(CONFIG_DEBUG_NX_TEST)	+= test_nx.o
 obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
 
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvmclock.o
@@ -123,6 +121,7 @@ obj-$(CONFIG_EFI)			+= sysfb_efi.o
 
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
+obj-$(CONFIG_SCHED_MC_PRIO)		+= itmt.o
 
 ifdef CONFIG_FRAME_POINTER
 obj-y					+= unwind_frame.o
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
index c280df6b2aa2..ea3046e0b0cf 100644
--- a/arch/x86/kernel/acpi/apei.c
+++ b/arch/x86/kernel/acpi/apei.c
@@ -24,9 +24,6 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
 	struct acpi_hest_ia_corrected *cmc;
 	struct acpi_hest_ia_error_bank *mc_bank;
 
-	if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
-		return 0;
-
 	cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
 	if (!cmc->enabled)
 		return 0;
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 931ced8ca345..b2879cc23db4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -35,6 +35,7 @@
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
+#include <linux/efi-bgrt.h>
 
 #include <asm/irqdomain.h>
 #include <asm/pci_x86.h>
@@ -76,6 +77,7 @@ int acpi_fix_pin2_polarity __initdata;
 static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 #endif
 
+#ifdef CONFIG_X86_IO_APIC
 /*
  * Locks related to IOAPIC hotplug
  * Hotplug side:
@@ -88,6 +90,7 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
  *			->ioapic_lock
  */
 static DEFINE_MUTEX(acpi_ioapic_lock);
+#endif
 
 /* --------------------------------------------------------------------------
                               Boot-time Configuration
@@ -176,10 +179,15 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
 		return -EINVAL;
 	}
 
+	if (!enabled) {
+		++disabled_cpus;
+		return -EINVAL;
+	}
+
 	if (boot_cpu_physical_apicid != -1U)
 		ver = boot_cpu_apic_version;
 
-	cpu = __generic_processor_info(id, ver, enabled);
+	cpu = generic_processor_info(id, ver);
 	if (cpu >= 0)
 		early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid;
 
@@ -707,13 +715,13 @@ static void __init acpi_set_irq_model_ioapic(void)
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 #include <acpi/processor.h>
 
-int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 {
 #ifdef CONFIG_ACPI_NUMA
 	int nid;
 
 	nid = acpi_get_node(handle);
-	if (nid != -1) {
+	if (nid != NUMA_NO_NODE) {
 		set_apicid_to_node(physid, nid);
 		numa_set_node(cpu, nid);
 	}
@@ -721,11 +729,12 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 	return 0;
 }
 
-int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
+		 int *pcpu)
 {
 	int cpu;
 
-	cpu = acpi_register_lapic(physid, U32_MAX, ACPI_MADT_ENABLED);
+	cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
 	if (cpu < 0) {
 		pr_info(PREFIX "Unable to map lapic to logical cpu number\n");
 		return cpu;
@@ -928,6 +937,13 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
 		x86_platform.legacy.devices.pnpbios = 0;
 	}
 
+	if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+	    !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) &&
+	    x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) {
+		pr_debug("ACPI: i8042 controller is absent\n");
+		x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT;
+	}
+
 	if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
 		pr_debug("ACPI: not registering RTC platform device\n");
 		x86_platform.legacy.rtc = 0;
@@ -1548,6 +1564,12 @@ int __init early_acpi_boot_init(void)
 	return 0;
 }
 
+static int __init acpi_parse_bgrt(struct acpi_table_header *table)
+{
+	efi_bgrt_init(table);
+	return 0;
+}
+
 int __init acpi_boot_init(void)
 {
 	/* those are executed after early-quirks are executed */
@@ -1572,6 +1594,8 @@ int __init acpi_boot_init(void)
 	acpi_process_madt();
 
 	acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
+	if (IS_ENABLED(CONFIG_ACPI_BGRT))
+		acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
 
 	if (!acpi_noirq)
 		x86_init.pci.init = pci_acpi_init;
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index af15f4444330..8233a630280f 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -12,7 +12,6 @@
 #include <linux/sched.h>
 
 #include <acpi/processor.h>
-#include <asm/acpi.h>
 #include <asm/mwait.h>
 #include <asm/special_insns.h>
 
@@ -89,7 +88,8 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
 	retval = 0;
 	/* If the HW does not support any sub-states in this C-state */
 	if (num_cstate_subtype == 0) {
-		pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part);
+		pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n",
+				cx->address, edx_part);
 		retval = -1;
 		goto out;
 	}
@@ -104,8 +104,8 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
 	if (!mwait_supported[cstate_type]) {
 		mwait_supported[cstate_type] = 1;
 		printk(KERN_DEBUG
-			"Monitor-Mwait will be used to enter C-%d "
-			"state\n", cx->type);
+			"Monitor-Mwait will be used to enter C-%d state\n",
+			cx->type);
 	}
 	snprintf(cx->desc,
 			ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x",
@@ -166,6 +166,7 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter);
 static int __init ffh_cstate_init(void)
 {
 	struct cpuinfo_x86 *c = &boot_cpu_data;
+
 	if (c->x86_vendor != X86_VENDOR_INTEL)
 		return -1;
 
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 169963f471bb..50b8ed0317a3 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel)
 	movq	pt_regs_r14(%rax), %r14
 	movq	pt_regs_r15(%rax), %r15
 
+#ifdef CONFIG_KASAN
+	/*
+	 * The suspend path may have poisoned some areas deeper in the stack,
+	 * which we now need to unpoison.
+	 */
+	movq	%rsp, %rdi
+	call	kasan_unpoison_task_stack_below
+#endif
+
 	xorl	%eax, %eax
 	addq	$8, %rsp
 	FRAME_END
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 5cb272a7a5a3..c5b8f760473c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -337,7 +337,11 @@ done:
 		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
 }
 
-static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
+/*
+ * "noinline" to cause control flow change and thus invalidate I$ and
+ * cause refetch after modification.
+ */
+static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
 {
 	unsigned long flags;
 
@@ -346,7 +350,6 @@ static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
 
 	local_irq_save(flags);
 	add_nops(instr + (a->instrlen - a->padlen), a->padlen);
-	sync_core();
 	local_irq_restore(flags);
 
 	DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
@@ -359,9 +362,12 @@ static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
  * This implies that asymmetric systems where APs have less capabilities than
  * the boot processor are not handled. Tough. Make sure you disable such
  * features by hand.
+ *
+ * Marked "noinline" to cause control flow change and thus insn cache
+ * to refetch changed I$ lines.
  */
-void __init_or_module apply_alternatives(struct alt_instr *start,
-					 struct alt_instr *end)
+void __init_or_module noinline apply_alternatives(struct alt_instr *start,
+						  struct alt_instr *end)
 {
 	struct alt_instr *a;
 	u8 *instr, *replacement;
@@ -667,7 +673,6 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
 	unsigned long flags;
 	local_irq_save(flags);
 	memcpy(addr, opcode, len);
-	sync_core();
 	local_irq_restore(flags);
 	/* Could also do a CLFLUSH here to speed up CPU recovery; but
 	   that causes hangs on some VIA CPUs. */
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 63ff468a7986..df083efe6ee0 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -17,6 +17,7 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
 #include <linux/pci.h>
@@ -695,7 +696,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
 	return -1;
 }
 
-static struct dma_map_ops gart_dma_ops = {
+static const struct dma_map_ops gart_dma_ops = {
 	.map_sg				= gart_map_sg,
 	.unmap_sg			= gart_unmap_sg,
 	.map_page			= gart_map_page,
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4fdf6230d93c..458da8509b75 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -13,8 +13,20 @@
 #include <linux/spinlock.h>
 #include <asm/amd_nb.h>
 
+#define PCI_DEVICE_ID_AMD_17H_ROOT	0x1450
+#define PCI_DEVICE_ID_AMD_17H_DF_F3	0x1463
+#define PCI_DEVICE_ID_AMD_17H_DF_F4	0x1464
+
+/* Protect the PCI config register pairs used for SMN and DF indirect access. */
+static DEFINE_MUTEX(smn_mutex);
+
 static u32 *flush_words;
 
+static const struct pci_device_id amd_root_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
+	{}
+};
+
 const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
@@ -24,9 +36,10 @@ const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
 	{}
 };
-EXPORT_SYMBOL(amd_nb_misc_ids);
+EXPORT_SYMBOL_GPL(amd_nb_misc_ids);
 
 static const struct pci_device_id amd_nb_link_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
@@ -34,6 +47,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
 	{}
 };
 
@@ -44,8 +58,25 @@ const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
 	{ }
 };
 
-struct amd_northbridge_info amd_northbridges;
-EXPORT_SYMBOL(amd_northbridges);
+static struct amd_northbridge_info amd_northbridges;
+
+u16 amd_nb_num(void)
+{
+	return amd_northbridges.num;
+}
+EXPORT_SYMBOL_GPL(amd_nb_num);
+
+bool amd_nb_has_feature(unsigned int feature)
+{
+	return ((amd_northbridges.flags & feature) == feature);
+}
+EXPORT_SYMBOL_GPL(amd_nb_has_feature);
+
+struct amd_northbridge *node_to_amd_nb(int node)
+{
+	return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
+}
+EXPORT_SYMBOL_GPL(node_to_amd_nb);
 
 static struct pci_dev *next_northbridge(struct pci_dev *dev,
 					const struct pci_device_id *ids)
@@ -58,13 +89,106 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev,
 	return dev;
 }
 
+static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write)
+{
+	struct pci_dev *root;
+	int err = -ENODEV;
+
+	if (node >= amd_northbridges.num)
+		goto out;
+
+	root = node_to_amd_nb(node)->root;
+	if (!root)
+		goto out;
+
+	mutex_lock(&smn_mutex);
+
+	err = pci_write_config_dword(root, 0x60, address);
+	if (err) {
+		pr_warn("Error programming SMN address 0x%x.\n", address);
+		goto out_unlock;
+	}
+
+	err = (write ? pci_write_config_dword(root, 0x64, *value)
+		     : pci_read_config_dword(root, 0x64, value));
+	if (err)
+		pr_warn("Error %s SMN address 0x%x.\n",
+			(write ? "writing to" : "reading from"), address);
+
+out_unlock:
+	mutex_unlock(&smn_mutex);
+
+out:
+	return err;
+}
+
+int amd_smn_read(u16 node, u32 address, u32 *value)
+{
+	return __amd_smn_rw(node, address, value, false);
+}
+EXPORT_SYMBOL_GPL(amd_smn_read);
+
+int amd_smn_write(u16 node, u32 address, u32 value)
+{
+	return __amd_smn_rw(node, address, &value, true);
+}
+EXPORT_SYMBOL_GPL(amd_smn_write);
+
+/*
+ * Data Fabric Indirect Access uses FICAA/FICAD.
+ *
+ * Fabric Indirect Configuration Access Address (FICAA): Constructed based
+ * on the device's Instance Id and the PCI function and register offset of
+ * the desired register.
+ *
+ * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
+ * and FICAD HI registers but so far we only need the LO register.
+ */
+int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
+{
+	struct pci_dev *F4;
+	u32 ficaa;
+	int err = -ENODEV;
+
+	if (node >= amd_northbridges.num)
+		goto out;
+
+	F4 = node_to_amd_nb(node)->link;
+	if (!F4)
+		goto out;
+
+	ficaa  = 1;
+	ficaa |= reg & 0x3FC;
+	ficaa |= (func & 0x7) << 11;
+	ficaa |= instance_id << 16;
+
+	mutex_lock(&smn_mutex);
+
+	err = pci_write_config_dword(F4, 0x5C, ficaa);
+	if (err) {
+		pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
+		goto out_unlock;
+	}
+
+	err = pci_read_config_dword(F4, 0x98, lo);
+	if (err)
+		pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
+
+out_unlock:
+	mutex_unlock(&smn_mutex);
+
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(amd_df_indirect_read);
+
 int amd_cache_northbridges(void)
 {
 	u16 i = 0;
 	struct amd_northbridge *nb;
-	struct pci_dev *misc, *link;
+	struct pci_dev *root, *misc, *link;
 
-	if (amd_nb_num())
+	if (amd_northbridges.num)
 		return 0;
 
 	misc = NULL;
@@ -74,15 +198,17 @@ int amd_cache_northbridges(void)
 	if (!i)
 		return -ENODEV;
 
-	nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
+	nb = kcalloc(i, sizeof(struct amd_northbridge), GFP_KERNEL);
 	if (!nb)
 		return -ENOMEM;
 
 	amd_northbridges.nb = nb;
 	amd_northbridges.num = i;
 
-	link = misc = NULL;
-	for (i = 0; i != amd_nb_num(); i++) {
+	link = misc = root = NULL;
+	for (i = 0; i != amd_northbridges.num; i++) {
+		node_to_amd_nb(i)->root = root =
+			next_northbridge(root, amd_root_ids);
 		node_to_amd_nb(i)->misc = misc =
 			next_northbridge(misc, amd_nb_misc_ids);
 		node_to_amd_nb(i)->link = link =
@@ -139,13 +265,13 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
 {
 	u32 address;
 	u64 base, msr;
-	unsigned segn_busn_bits;
+	unsigned int segn_busn_bits;
 
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
 		return NULL;
 
 	/* assume all cpus from fam10h have mmconfig */
-        if (boot_cpu_data.x86 < 0x10)
+	if (boot_cpu_data.x86 < 0x10)
 		return NULL;
 
 	address = MSR_FAM10H_MMIO_CONF_BASE;
@@ -226,14 +352,14 @@ static void amd_cache_gart(void)
 	if (!amd_nb_has_feature(AMD_NB_GART))
 		return;
 
-	flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
+	flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL);
 	if (!flush_words) {
 		amd_northbridges.flags &= ~AMD_NB_GART;
 		pr_notice("Cannot initialize GART flush words, GART support disabled\n");
 		return;
 	}
 
-	for (i = 0; i != amd_nb_num(); i++)
+	for (i = 0; i != amd_northbridges.num; i++)
 		pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]);
 }
 
@@ -246,18 +372,20 @@ void amd_flush_garts(void)
 	if (!amd_nb_has_feature(AMD_NB_GART))
 		return;
 
-	/* Avoid races between AGP and IOMMU. In theory it's not needed
-	   but I'm not sure if the hardware won't lose flush requests
-	   when another is pending. This whole thing is so expensive anyways
-	   that it doesn't matter to serialize more. -AK */
+	/*
+	 * Avoid races between AGP and IOMMU. In theory it's not needed
+	 * but I'm not sure if the hardware won't lose flush requests
+	 * when another is pending. This whole thing is so expensive anyways
+	 * that it doesn't matter to serialize more. -AK
+	 */
 	spin_lock_irqsave(&gart_lock, flags);
 	flushed = 0;
-	for (i = 0; i < amd_nb_num(); i++) {
+	for (i = 0; i < amd_northbridges.num; i++) {
 		pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
 				       flush_words[i] | 1);
 		flushed++;
 	}
-	for (i = 0; i < amd_nb_num(); i++) {
+	for (i = 0; i < amd_northbridges.num; i++) {
 		u32 w;
 		/* Make sure the hardware actually executed the flush*/
 		for (;;) {
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 456316f6c868..65721dc73bd8 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -234,7 +234,7 @@ static __init int apbt_late_init(void)
 	if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||
 		!apb_timer_block_enabled)
 		return 0;
-	return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "X86_APB_DEAD", NULL,
+	return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "x86/apb:dead", NULL,
 				 apbt_cpu_dead);
 }
 fs_initcall(apbt_late_init);
@@ -247,7 +247,7 @@ void apbt_setup_secondary_clock(void) {}
 static int apbt_clocksource_register(void)
 {
 	u64 start, now;
-	cycle_t t1;
+	u64 t1;
 
 	/* Start the counter, use timer 2 as source, timer 0/1 for event */
 	dw_apb_clocksource_start(clocksource_apbt);
@@ -355,7 +355,7 @@ unsigned long apbt_quick_calibrate(void)
 {
 	int i, scale;
 	u64 old, new;
-	cycle_t t1, t2;
+	u64 t1, t2;
 	unsigned long khz = 0;
 	u32 loop, shift;
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 88c657b057e2..8ccb7ef512e0 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -48,7 +48,6 @@
 #include <asm/io_apic.h>
 #include <asm/desc.h>
 #include <asm/hpet.h>
-#include <asm/idle.h>
 #include <asm/mtrr.h>
 #include <asm/time.h>
 #include <asm/smp.h>
@@ -530,18 +529,19 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
  * The local apic timer can be used for any function which is CPU local.
  */
 static struct clock_event_device lapic_clockevent = {
-	.name			= "lapic",
-	.features		= CLOCK_EVT_FEAT_PERIODIC |
-				  CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
-				  | CLOCK_EVT_FEAT_DUMMY,
-	.shift			= 32,
-	.set_state_shutdown	= lapic_timer_shutdown,
-	.set_state_periodic	= lapic_timer_set_periodic,
-	.set_state_oneshot	= lapic_timer_set_oneshot,
-	.set_next_event		= lapic_next_event,
-	.broadcast		= lapic_timer_broadcast,
-	.rating			= 100,
-	.irq			= -1,
+	.name				= "lapic",
+	.features			= CLOCK_EVT_FEAT_PERIODIC |
+					  CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
+					  | CLOCK_EVT_FEAT_DUMMY,
+	.shift				= 32,
+	.set_state_shutdown		= lapic_timer_shutdown,
+	.set_state_periodic		= lapic_timer_set_periodic,
+	.set_state_oneshot		= lapic_timer_set_oneshot,
+	.set_state_oneshot_stopped	= lapic_timer_shutdown,
+	.set_next_event			= lapic_next_event,
+	.broadcast			= lapic_timer_broadcast,
+	.rating				= 100,
+	.irq				= -1,
 };
 static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
 
@@ -894,11 +894,13 @@ void __init setup_boot_APIC_clock(void)
 
 	/* Setup the lapic or request the broadcast */
 	setup_APIC_timer();
+	amd_e400_c1e_apic_setup();
 }
 
 void setup_secondary_APIC_clock(void)
 {
 	setup_APIC_timer();
+	amd_e400_c1e_apic_setup();
 }
 
 /*
@@ -1244,7 +1246,7 @@ static void lapic_setup_esr(void)
 /**
  * setup_local_APIC - setup the local APIC
  *
- * Used to setup local APIC while initializing BSP or bringin up APs.
+ * Used to setup local APIC while initializing BSP or bringing up APs.
  * Always called with preemption disabled.
  */
 void setup_local_APIC(void)
@@ -1608,24 +1610,15 @@ static inline void try_to_enable_x2apic(int remap_mode) { }
 static inline void __x2apic_enable(void) { }
 #endif /* !CONFIG_X86_X2APIC */
 
-static int __init try_to_enable_IR(void)
-{
-#ifdef CONFIG_X86_IO_APIC
-	if (!x2apic_enabled() && skip_ioapic_setup) {
-		pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
-		return -1;
-	}
-#endif
-	return irq_remapping_enable();
-}
-
 void __init enable_IR_x2apic(void)
 {
 	unsigned long flags;
 	int ret, ir_stat;
 
-	if (skip_ioapic_setup)
+	if (skip_ioapic_setup) {
+		pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
 		return;
+	}
 
 	ir_stat = irq_remapping_prepare();
 	if (ir_stat < 0 && !x2apic_supported())
@@ -1643,7 +1636,7 @@ void __init enable_IR_x2apic(void)
 
 	/* If irq_remapping_prepare() succeeded, try to enable it */
 	if (ir_stat >= 0)
-		ir_stat = try_to_enable_IR();
+		ir_stat = irq_remapping_enable();
 	/* ir_stat contains the remap mode or an error code */
 	try_to_enable_x2apic(ir_stat);
 
@@ -1863,14 +1856,14 @@ static void __smp_spurious_interrupt(u8 vector)
 		"should never happen.\n", vector, smp_processor_id());
 }
 
-__visible void smp_spurious_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
 {
 	entering_irq();
 	__smp_spurious_interrupt(~regs->orig_ax);
 	exiting_irq();
 }
 
-__visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs)
 {
 	u8 vector = ~regs->orig_ax;
 
@@ -1921,14 +1914,14 @@ static void __smp_error_interrupt(struct pt_regs *regs)
 
 }
 
-__visible void smp_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
 {
 	entering_irq();
 	__smp_error_interrupt(regs);
 	exiting_irq();
 }
 
-__visible void smp_trace_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs)
 {
 	entering_irq();
 	trace_error_apic_entry(ERROR_APIC_VECTOR);
@@ -2027,8 +2020,8 @@ void disconnect_bsp_APIC(int virt_wire_setup)
 /*
  * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated
  * contiguously, it equals to current allocated max logical CPU ID plus 1.
- * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of
- * nr_logical_cpuids is nr_cpu_ids.
+ * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range,
+ * so the maximum of nr_logical_cpuids is nr_cpu_ids.
  *
  * NOTE: Reserve 0 for BSP.
  */
@@ -2060,17 +2053,17 @@ static int allocate_logical_cpuid(int apicid)
 
 	/* Allocate a new cpuid. */
 	if (nr_logical_cpuids >= nr_cpu_ids) {
-		WARN_ONCE(1, "Only %d processors supported."
+		WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %i reached. "
 			     "Processor %d/0x%x and the rest are ignored.\n",
-			     nr_cpu_ids - 1, nr_logical_cpuids, apicid);
-		return -1;
+			     nr_cpu_ids, nr_logical_cpuids, apicid);
+		return -EINVAL;
 	}
 
 	cpuid_to_apicid[nr_logical_cpuids] = apicid;
 	return nr_logical_cpuids++;
 }
 
-int __generic_processor_info(int apicid, int version, bool enabled)
+int generic_processor_info(int apicid, int version)
 {
 	int cpu, max = nr_cpu_ids;
 	bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
@@ -2093,7 +2086,7 @@ int __generic_processor_info(int apicid, int version, bool enabled)
 	 * Since fixing handling of boot_cpu_physical_apicid requires
 	 * another discussion and tests on each platform, we leave it
 	 * for now and here we use read_apic_id() directly in this
-	 * function, generic_processor_info().
+	 * function, __generic_processor_info().
 	 */
 	if (disabled_cpu_apicid != BAD_APICID &&
 	    disabled_cpu_apicid != read_apic_id() &&
@@ -2128,11 +2121,9 @@ int __generic_processor_info(int apicid, int version, bool enabled)
 	if (num_processors >= nr_cpu_ids) {
 		int thiscpu = max + disabled_cpus;
 
-		if (enabled) {
-			pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
-				   "reached. Processor %d/0x%x ignored.\n",
-				   max, thiscpu, apicid);
-		}
+		pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
+			   "reached. Processor %d/0x%x ignored.\n",
+			   max, thiscpu, apicid);
 
 		disabled_cpus++;
 		return -EINVAL;
@@ -2159,21 +2150,6 @@ int __generic_processor_info(int apicid, int version, bool enabled)
 	}
 
 	/*
-	 * This can happen on physical hotplug. The sanity check at boot time
-	 * is done from native_smp_prepare_cpus() after num_possible_cpus() is
-	 * established.
-	 */
-	if (topology_update_package_map(apicid, cpu) < 0) {
-		int thiscpu = max + disabled_cpus;
-
-		pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n",
-			   thiscpu, apicid);
-
-		disabled_cpus++;
-		return -ENOSPC;
-	}
-
-	/*
 	 * Validate version
 	 */
 	if (version == 0x0) {
@@ -2199,23 +2175,13 @@ int __generic_processor_info(int apicid, int version, bool enabled)
 		apic->x86_32_early_logical_apicid(cpu);
 #endif
 	set_cpu_possible(cpu, true);
-
-	if (enabled) {
-		num_processors++;
-		physid_set(apicid, phys_cpu_present_map);
-		set_cpu_present(cpu, true);
-	} else {
-		disabled_cpus++;
-	}
+	physid_set(apicid, phys_cpu_present_map);
+	set_cpu_present(cpu, true);
+	num_processors++;
 
 	return cpu;
 }
 
-int generic_processor_info(int apicid, int version)
-{
-	return __generic_processor_info(apicid, version, true);
-}
-
 int hard_smp_processor_id(void)
 {
 	return read_apic_id();
@@ -2263,6 +2229,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
 	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
 		/* Should happen once for each apic */
 		WARN_ON((*drv)->eoi_write == eoi_write);
+		(*drv)->native_eoi_write = (*drv)->eoi_write;
 		(*drv)->eoi_write = eoi_write;
 	}
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 48e6d84f173e..347bb9f65737 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -48,7 +48,6 @@
 #include <linux/bootmem.h>
 
 #include <asm/irqdomain.h>
-#include <asm/idle.h>
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/cpu.h>
@@ -1108,12 +1107,12 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info)
 
 	ioapic = mp_find_ioapic(gsi);
 	if (ioapic < 0)
-		return -1;
+		return -ENODEV;
 
 	pin = mp_find_ioapic_pin(ioapic, gsi);
 	idx = find_irq_entry(ioapic, pin, mp_INT);
 	if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
-		return -1;
+		return -ENODEV;
 
 	return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info);
 }
@@ -1876,6 +1875,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
 	.irq_ack		= irq_chip_ack_parent,
 	.irq_eoi		= ioapic_ack_level,
 	.irq_set_affinity	= ioapic_set_affinity,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
@@ -1887,6 +1887,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = {
 	.irq_ack		= irq_chip_ack_parent,
 	.irq_eoi		= ioapic_ir_ack_level,
 	.irq_set_affinity	= ioapic_set_affinity,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
@@ -2116,6 +2117,7 @@ static inline void __init check_timer(void)
 			if (idx != -1 && irq_trigger(idx))
 				unmask_ioapic_irq(irq_get_chip_data(0));
 		}
+		irq_domain_deactivate_irq(irq_data);
 		irq_domain_activate_irq(irq_data);
 		if (timer_irq_works()) {
 			if (disable_timer_pin_1 > 0)
@@ -2137,6 +2139,7 @@ static inline void __init check_timer(void)
 		 * legacy devices should be connected to IO APIC #0
 		 */
 		replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
+		irq_domain_deactivate_irq(irq_data);
 		irq_domain_activate_irq(irq_data);
 		legacy_pic->unmask(0);
 		if (timer_irq_works()) {
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 015bbf30e3e3..c61aec7e65f4 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -82,7 +82,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	if (domain == NULL)
 		return -ENOSYS;
 
-	return pci_msi_domain_alloc_irqs(domain, dev, nvec, type);
+	return msi_domain_alloc_irqs(domain, &dev->dev, nvec);
 }
 
 void native_teardown_msi_irq(unsigned int irq)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 5d30c5e42bb1..f3557a1eb562 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -559,7 +559,7 @@ void send_cleanup_vector(struct irq_cfg *cfg)
 		__send_cleanup_vector(data);
 }
 
-asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
+asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
 {
 	unsigned vector, me;
 
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 200af5ae9662..5a35f208ed95 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -191,7 +191,7 @@ static int x2apic_cluster_probe(void)
 	if (!x2apic_mode)
 		return 0;
 
-	ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE",
+	ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
 				x2apic_prepare_cpu, x2apic_dead_cpu);
 	if (ret < 0) {
 		pr_err("Failed to register X2APIC_PREPARE\n");
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 35690a168cf7..86f20cc0a65e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -41,40 +41,44 @@
 
 DEFINE_PER_CPU(int, x2apic_extra_bits);
 
-#define PR_DEVEL(fmt, args...)	pr_devel("%s: " fmt, __func__, args)
-
-static enum uv_system_type uv_system_type;
-static u64 gru_start_paddr, gru_end_paddr;
-static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr;
-static u64 gru_dist_lmask, gru_dist_umask;
-static union uvh_apicid uvh_apicid;
-
-/* info derived from CPUID */
+static enum uv_system_type	uv_system_type;
+static bool			uv_hubless_system;
+static u64			gru_start_paddr, gru_end_paddr;
+static u64			gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr;
+static u64			gru_dist_lmask, gru_dist_umask;
+static union uvh_apicid		uvh_apicid;
+
+/* Information derived from CPUID: */
 static struct {
 	unsigned int apicid_shift;
 	unsigned int apicid_mask;
 	unsigned int socketid_shift;	/* aka pnode_shift for UV1/2/3 */
 	unsigned int pnode_mask;
 	unsigned int gpa_shift;
+	unsigned int gnode_shift;
 } uv_cpuid;
 
 int uv_min_hub_revision_id;
 EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
+
 unsigned int uv_apicid_hibits;
 EXPORT_SYMBOL_GPL(uv_apicid_hibits);
 
 static struct apic apic_x2apic_uv_x;
 static struct uv_hub_info_s uv_hub_info_node0;
 
-/* Set this to use hardware error handler instead of kernel panic */
+/* Set this to use hardware error handler instead of kernel panic: */
 static int disable_uv_undefined_panic = 1;
+
 unsigned long uv_undefined(char *str)
 {
 	if (likely(!disable_uv_undefined_panic))
 		panic("UV: error: undefined MMR: %s\n", str);
 	else
 		pr_crit("UV: error: undefined MMR: %s\n", str);
-	return ~0ul;	/* cause a machine fault  */
+
+	/* Cause a machine fault: */
+	return ~0ul;
 }
 EXPORT_SYMBOL(uv_undefined);
 
@@ -85,18 +89,19 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr)
 	mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
 	val = *mmr;
 	early_iounmap(mmr, sizeof(*mmr));
+
 	return val;
 }
 
 static inline bool is_GRU_range(u64 start, u64 end)
 {
 	if (gru_dist_base) {
-		u64 su = start & gru_dist_umask; /* upper (incl pnode) bits */
-		u64 sl = start & gru_dist_lmask; /* base offset bits */
+		u64 su = start & gru_dist_umask; /* Upper (incl pnode) bits */
+		u64 sl = start & gru_dist_lmask; /* Base offset bits */
 		u64 eu = end & gru_dist_umask;
 		u64 el = end & gru_dist_lmask;
 
-		/* Must reside completely within a single GRU range */
+		/* Must reside completely within a single GRU range: */
 		return (sl == gru_dist_base && el == gru_dist_base &&
 			su >= gru_first_node_paddr &&
 			su <= gru_last_node_paddr &&
@@ -133,13 +138,14 @@ static int __init early_get_pnodeid(void)
 		break;
 	case UV4_HUB_PART_NUMBER:
 		uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1;
+		uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */
 		break;
 	}
 
 	uv_hub_info->hub_revision = uv_min_hub_revision_id;
 	uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1;
 	pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask;
-	uv_cpuid.gpa_shift = 46;	/* default unless changed */
+	uv_cpuid.gpa_shift = 46;	/* Default unless changed */
 
 	pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n",
 		node_id.s.revision, node_id.s.part_number, node_id.s.node_id,
@@ -147,11 +153,12 @@ static int __init early_get_pnodeid(void)
 	return pnode;
 }
 
-/* [copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */
-#define SMT_LEVEL	0	/* leaf 0xb SMT level */
-#define INVALID_TYPE	0	/* leaf 0xb sub-leaf types */
-#define SMT_TYPE	1
-#define CORE_TYPE	2
+/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */
+
+#define SMT_LEVEL			0	/* Leaf 0xb SMT level */
+#define INVALID_TYPE			0	/* Leaf 0xb sub-leaf types */
+#define SMT_TYPE			1
+#define CORE_TYPE			2
 #define LEAFB_SUBTYPE(ecx)		(((ecx) >> 8) & 0xff)
 #define BITS_SHIFT_NEXT_LEVEL(eax)	((eax) & 0x1f)
 
@@ -165,11 +172,13 @@ static void set_x2apic_bits(void)
 		pr_info("UV: CPU does not have CPUID.11\n");
 		return;
 	}
+
 	cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
 	if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) {
 		pr_info("UV: CPUID.11 not implemented\n");
 		return;
 	}
+
 	sid_shift = BITS_SHIFT_NEXT_LEVEL(eax);
 	sub_index = 1;
 	do {
@@ -180,8 +189,9 @@ static void set_x2apic_bits(void)
 		}
 		sub_index++;
 	} while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
-	uv_cpuid.apicid_shift = 0;
-	uv_cpuid.apicid_mask = (~(-1 << sid_shift));
+
+	uv_cpuid.apicid_shift	= 0;
+	uv_cpuid.apicid_mask	= (~(-1 << sid_shift));
 	uv_cpuid.socketid_shift = sid_shift;
 }
 
@@ -192,10 +202,8 @@ static void __init early_get_apic_socketid_shift(void)
 
 	set_x2apic_bits();
 
-	pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n",
-		uv_cpuid.apicid_shift, uv_cpuid.apicid_mask);
-	pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n",
-		uv_cpuid.socketid_shift, uv_cpuid.pnode_mask);
+	pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask);
+	pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask);
 }
 
 /*
@@ -208,10 +216,8 @@ static void __init uv_set_apicid_hibit(void)
 	union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
 
 	if (is_uv1_hub()) {
-		apicid_mask.v =
-			uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
-		uv_apicid_hibits =
-			apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
+		apicid_mask.v = uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
+		uv_apicid_hibits = apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
 	}
 }
 
@@ -220,20 +226,26 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 	int pnodeid;
 	int uv_apic;
 
-	if (strncmp(oem_id, "SGI", 3) != 0)
+	if (strncmp(oem_id, "SGI", 3) != 0) {
+		if (strncmp(oem_id, "NSGI", 4) == 0) {
+			uv_hubless_system = true;
+			pr_info("UV: OEM IDs %s/%s, HUBLESS\n",
+				oem_id, oem_table_id);
+		}
 		return 0;
+	}
 
 	if (numa_off) {
 		pr_err("UV: NUMA is off, disabling UV support\n");
 		return 0;
 	}
 
-	/* Setup early hub type field in uv_hub_info for Node 0 */
+	/* Set up early hub type field in uv_hub_info for Node 0 */
 	uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
 
 	/*
 	 * Determine UV arch type.
-	 *   SGI: UV100/1000
+	 *   SGI:  UV100/1000
 	 *   SGI2: UV2000/3000
 	 *   SGI3: UV300 (truncated to 4 chars because of different varieties)
 	 *   SGI4: UV400 (truncated to 4 chars because of different varieties)
@@ -249,31 +261,32 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 
 	pnodeid = early_get_pnodeid();
 	early_get_apic_socketid_shift();
-	x86_platform.is_untracked_pat_range =  uv_is_untracked_pat_range;
+
+	x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
 	x86_platform.nmi_init = uv_nmi_init;
 
-	if (!strcmp(oem_table_id, "UVX")) {		/* most common */
+	if (!strcmp(oem_table_id, "UVX")) {
+		/* This is the most common hardware variant: */
 		uv_system_type = UV_X2APIC;
 		uv_apic = 0;
 
-	} else if (!strcmp(oem_table_id, "UVH")) {	/* only UV1 systems */
+	} else if (!strcmp(oem_table_id, "UVH")) {
+		/* Only UV1 systems: */
 		uv_system_type = UV_NON_UNIQUE_APIC;
-		__this_cpu_write(x2apic_extra_bits,
-			pnodeid << uvh_apicid.s.pnode_shift);
+		__this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift);
 		uv_set_apicid_hibit();
 		uv_apic = 1;
 
-	} else	if (!strcmp(oem_table_id, "UVL")) {	/* only used for */
-		uv_system_type = UV_LEGACY_APIC;	/* very small systems */
+	} else if (!strcmp(oem_table_id, "UVL")) {
+		/* Only used for very small systems:  */
+		uv_system_type = UV_LEGACY_APIC;
 		uv_apic = 0;
 
 	} else {
 		goto badbios;
 	}
 
-	pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n",
-		oem_id, oem_table_id, uv_system_type,
-		uv_min_hub_revision_id, uv_apic);
+	pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic);
 
 	return uv_apic;
 
@@ -294,6 +307,12 @@ int is_uv_system(void)
 }
 EXPORT_SYMBOL_GPL(is_uv_system);
 
+int is_uv_hubless(void)
+{
+	return uv_hubless_system;
+}
+EXPORT_SYMBOL_GPL(is_uv_hubless);
+
 void **__uv_hub_info_list;
 EXPORT_SYMBOL_GPL(__uv_hub_info_list);
 
@@ -306,16 +325,18 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
 unsigned long sn_rtc_cycles_per_second;
 EXPORT_SYMBOL(sn_rtc_cycles_per_second);
 
-/* the following values are used for the per node hub info struct */
-static __initdata unsigned short *_node_to_pnode;
-static __initdata unsigned short _min_socket, _max_socket;
-static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len;
-static __initdata struct uv_gam_range_entry *uv_gre_table;
-static __initdata struct uv_gam_parameters *uv_gp_table;
-static __initdata unsigned short *_socket_to_node;
-static __initdata unsigned short *_socket_to_pnode;
-static __initdata unsigned short *_pnode_to_socket;
-static __initdata struct uv_gam_range_s *_gr_table;
+/* The following values are used for the per node hub info struct */
+static __initdata unsigned short		*_node_to_pnode;
+static __initdata unsigned short		_min_socket, _max_socket;
+static __initdata unsigned short		_min_pnode, _max_pnode, _gr_table_len;
+static __initdata struct uv_gam_range_entry	*uv_gre_table;
+static __initdata struct uv_gam_parameters	*uv_gp_table;
+static __initdata unsigned short		*_socket_to_node;
+static __initdata unsigned short		*_socket_to_pnode;
+static __initdata unsigned short		*_pnode_to_socket;
+
+static __initdata struct uv_gam_range_s		*_gr_table;
+
 #define	SOCK_EMPTY	((unsigned short)~0)
 
 extern int uv_hub_info_version(void)
@@ -324,7 +345,7 @@ extern int uv_hub_info_version(void)
 }
 EXPORT_SYMBOL(uv_hub_info_version);
 
-/* Build GAM range lookup table */
+/* Build GAM range lookup table: */
 static __init void build_uv_gr_table(void)
 {
 	struct uv_gam_range_entry *gre = uv_gre_table;
@@ -342,25 +363,24 @@ static __init void build_uv_gr_table(void)
 
 	for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
 		if (gre->type == UV_GAM_RANGE_TYPE_HOLE) {
-			if (!ram_limit) {   /* mark hole between ram/non-ram */
+			if (!ram_limit) {
+				/* Mark hole between RAM/non-RAM: */
 				ram_limit = last_limit;
 				last_limit = gre->limit;
 				lsid++;
 				continue;
 			}
 			last_limit = gre->limit;
-			pr_info("UV: extra hole in GAM RE table @%d\n",
-				(int)(gre - uv_gre_table));
+			pr_info("UV: extra hole in GAM RE table @%d\n", (int)(gre - uv_gre_table));
 			continue;
 		}
 		if (_max_socket < gre->sockid) {
-			pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n",
-				gre->sockid, _max_socket,
-				(int)(gre - uv_gre_table));
+			pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", gre->sockid, _max_socket, (int)(gre - uv_gre_table));
 			continue;
 		}
 		sid = gre->sockid - _min_socket;
-		if (lsid < sid) {		/* new range */
+		if (lsid < sid) {
+			/* New range: */
 			grt = &_gr_table[indx];
 			grt->base = lindx;
 			grt->nasid = gre->nasid;
@@ -369,27 +389,32 @@ static __init void build_uv_gr_table(void)
 			lindx = indx++;
 			continue;
 		}
-		if (lsid == sid && !ram_limit) {	/* update range */
-			if (grt->limit == last_limit) {	/* .. if contiguous */
+		/* Update range: */
+		if (lsid == sid && !ram_limit) {
+			/* .. if contiguous: */
+			if (grt->limit == last_limit) {
 				grt->limit = last_limit = gre->limit;
 				continue;
 			}
 		}
-		if (!ram_limit) {		/* non-contiguous ram range */
+		/* Non-contiguous RAM range: */
+		if (!ram_limit) {
 			grt++;
 			grt->base = lindx;
 			grt->nasid = gre->nasid;
 			grt->limit = last_limit = gre->limit;
 			continue;
 		}
-		grt++;				/* non-contiguous/non-ram */
-		grt->base = grt - _gr_table;	/* base is this entry */
+		/* Non-contiguous/non-RAM: */
+		grt++;
+		/* base is this entry */
+		grt->base = grt - _gr_table;
 		grt->nasid = gre->nasid;
 		grt->limit = last_limit = gre->limit;
 		lsid++;
 	}
 
-	/* shorten table if possible */
+	/* Shorten table if possible */
 	grt++;
 	i = grt - _gr_table;
 	if (i < _gr_table_len) {
@@ -403,16 +428,15 @@ static __init void build_uv_gr_table(void)
 		}
 	}
 
-	/* display resultant gam range table */
+	/* Display resultant GAM range table: */
 	for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) {
+		unsigned long start, end;
 		int gb = grt->base;
-		unsigned long start = gb < 0 ?  0 :
-			(unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT;
-		unsigned long end =
-			(unsigned long)grt->limit << UV_GAM_RANGE_SHFT;
 
-		pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n",
-			i, grt->nasid, start, end, gb);
+		start = gb < 0 ?  0 : (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT;
+		end = (unsigned long)grt->limit << UV_GAM_RANGE_SHFT;
+
+		pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", i, grt->nasid, start, end, gb);
 	}
 }
 
@@ -423,16 +447,19 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 
 	pnode = uv_apicid_to_pnode(phys_apicid);
 	phys_apicid |= uv_apicid_hibits;
+
 	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
 	    (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
 	    ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
 	    APIC_DM_INIT;
+
 	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 
 	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
 	    (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
 	    ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
 	    APIC_DM_STARTUP;
+
 	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 
 	return 0;
@@ -566,7 +593,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
 	.apic_id_registered		= uv_apic_id_registered,
 
 	.irq_delivery_mode		= dest_Fixed,
-	.irq_dest_mode			= 0, /* physical */
+	.irq_dest_mode			= 0, /* Physical */
 
 	.target_cpus			= online_target_cpus,
 	.disable_esr			= 0,
@@ -627,23 +654,22 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
 		switch (i) {
 		case 0:
 			m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR;
-			m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR;
+			m_overlay  = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR;
 			break;
 		case 1:
 			m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR;
-			m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR;
+			m_overlay  = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR;
 			break;
 		case 2:
 			m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR;
-			m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR;
+			m_overlay  = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR;
 			break;
 		}
 		alias.v = uv_read_local_mmr(m_overlay);
 		if (alias.s.enable && alias.s.base == 0) {
 			*size = (1UL << alias.s.m_alias);
 			redirect.v = uv_read_local_mmr(m_redirect);
-			*base = (unsigned long)redirect.s.dest_base
-							<< DEST_SHIFT;
+			*base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
 			return;
 		}
 	}
@@ -652,8 +678,7 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
 
 enum map_type {map_wb, map_uc};
 
-static __init void map_high(char *id, unsigned long base, int pshift,
-			int bshift, int max_pnode, enum map_type map_type)
+static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type)
 {
 	unsigned long bytes, paddr;
 
@@ -678,16 +703,19 @@ static __init void map_gru_distributed(unsigned long c)
 	int nid;
 
 	gru.v = c;
-	/* only base bits 42:28 relevant in dist mode */
+
+	/* Only base bits 42:28 relevant in dist mode */
 	gru_dist_base = gru.v & 0x000007fff0000000UL;
 	if (!gru_dist_base) {
 		pr_info("UV: Map GRU_DIST base address NULL\n");
 		return;
 	}
+
 	bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
 	gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1);
 	gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1);
 	gru_dist_base &= gru_dist_lmask; /* Clear bits above M */
+
 	for_each_online_node(nid) {
 		paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) |
 				gru_dist_base;
@@ -695,11 +723,12 @@ static __init void map_gru_distributed(unsigned long c)
 		gru_first_node_paddr = min(paddr, gru_first_node_paddr);
 		gru_last_node_paddr = max(paddr, gru_last_node_paddr);
 	}
+
 	/* Save upper (63:M) bits of address only for is_GRU_range */
 	gru_first_node_paddr &= gru_dist_umask;
 	gru_last_node_paddr &= gru_dist_umask;
-	pr_debug("UV: Map GRU_DIST base 0x%016llx  0x%016llx - 0x%016llx\n",
-		gru_dist_base, gru_first_node_paddr, gru_last_node_paddr);
+
+	pr_debug("UV: Map GRU_DIST base 0x%016llx  0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr);
 }
 
 static __init void map_gru_high(int max_pnode)
@@ -719,6 +748,7 @@ static __init void map_gru_high(int max_pnode)
 		map_gru_distributed(gru.v);
 		return;
 	}
+
 	base = (gru.v & mask) >> shift;
 	map_high("GRU", base, shift, shift, max_pnode, map_wb);
 	gru_start_paddr = ((u64)base << shift);
@@ -772,8 +802,8 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
 
 	id = mmiohs[index].id;
 	overlay.v = uv_read_local_mmr(mmiohs[index].overlay);
-	pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n",
-		id, overlay.v, overlay.s3.base, overlay.s3.m_io);
+
+	pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", id, overlay.v, overlay.s3.base, overlay.s3.m_io);
 	if (!overlay.s3.enable) {
 		pr_info("UV: %s disabled\n", id);
 		return;
@@ -784,7 +814,8 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
 	m_io = overlay.s3.m_io;
 	mmr = mmiohs[index].redirect;
 	n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;
-	min_pnode *= 2;				/* convert to NASID */
+	/* Convert to NASID: */
+	min_pnode *= 2;
 	max_pnode *= 2;
 	max_io = lnasid = fi = li = -1;
 
@@ -793,16 +824,18 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
 
 		redirect.v = uv_read_local_mmr(mmr + i * 8);
 		nasid = redirect.s3.nasid;
+		/* Invalid NASID: */
 		if (nasid < min_pnode || max_pnode < nasid)
-			nasid = -1;		/* invalid NASID */
+			nasid = -1;
 
 		if (nasid == lnasid) {
 			li = i;
-			if (i != n-1)		/* last entry check */
+			/* Last entry check: */
+			if (i != n-1)
 				continue;
 		}
 
-		/* check if we have a cached (or last) redirect to print */
+		/* Check if we have a cached (or last) redirect to print: */
 		if (lnasid != -1 || (i == n-1 && nasid != -1))  {
 			unsigned long addr1, addr2;
 			int f, l;
@@ -814,12 +847,9 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
 				f = fi;
 				l = li;
 			}
-			addr1 = (base << shift) +
-				f * (1ULL << m_io);
-			addr2 = (base << shift) +
-				(l + 1) * (1ULL << m_io);
-			pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n",
-				id, fi, li, lnasid, addr1, addr2);
+			addr1 = (base << shift) + f * (1ULL << m_io);
+			addr2 = (base << shift) + (l + 1) * (1ULL << m_io);
+			pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2);
 			if (max_io < l)
 				max_io = l;
 		}
@@ -827,8 +857,7 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
 		lnasid = nasid;
 	}
 
-	pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n",
-		id, base, shift, m_io, max_io);
+	pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", id, base, shift, m_io, max_io);
 
 	if (max_io >= 0)
 		map_high(id, base, shift, m_io, max_io, map_uc);
@@ -841,36 +870,35 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode)
 	int shift, enable, m_io, n_io;
 
 	if (is_uv3_hub() || is_uv4_hub()) {
-		/* Map both MMIOH Regions */
+		/* Map both MMIOH regions: */
 		map_mmioh_high_uv3(0, min_pnode, max_pnode);
 		map_mmioh_high_uv3(1, min_pnode, max_pnode);
 		return;
 	}
 
 	if (is_uv1_hub()) {
-		mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
-		shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
-		mmioh.v = uv_read_local_mmr(mmr);
-		enable = !!mmioh.s1.enable;
-		base = mmioh.s1.base;
-		m_io = mmioh.s1.m_io;
-		n_io = mmioh.s1.n_io;
+		mmr	= UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
+		shift	= UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+		mmioh.v	= uv_read_local_mmr(mmr);
+		enable	= !!mmioh.s1.enable;
+		base	= mmioh.s1.base;
+		m_io	= mmioh.s1.m_io;
+		n_io	= mmioh.s1.n_io;
 	} else if (is_uv2_hub()) {
-		mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
-		shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
-		mmioh.v = uv_read_local_mmr(mmr);
-		enable = !!mmioh.s2.enable;
-		base = mmioh.s2.base;
-		m_io = mmioh.s2.m_io;
-		n_io = mmioh.s2.n_io;
-	} else
+		mmr	= UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
+		shift	= UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+		mmioh.v	= uv_read_local_mmr(mmr);
+		enable	= !!mmioh.s2.enable;
+		base	= mmioh.s2.base;
+		m_io	= mmioh.s2.m_io;
+		n_io	= mmioh.s2.n_io;
+	} else {
 		return;
+	}
 
 	if (enable) {
 		max_pnode &= (1 << n_io) - 1;
-		pr_info(
-		    "UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n",
-			base, shift, m_io, n_io, max_pnode);
+		pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", base, shift, m_io, n_io, max_pnode);
 		map_high("MMIOH", base, shift, m_io, max_pnode, map_uc);
 	} else {
 		pr_info("UV: MMIOH disabled\n");
@@ -888,16 +916,16 @@ static __init void uv_rtc_init(void)
 	long status;
 	u64 ticks_per_sec;
 
-	status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
-					&ticks_per_sec);
+	status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec);
+
 	if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
-		printk(KERN_WARNING
-			"unable to determine platform RTC clock frequency, "
-			"guessing.\n");
-		/* BIOS gives wrong value for clock freq. so guess */
+		pr_warn("UV: unable to determine platform RTC clock frequency, guessing.\n");
+
+		/* BIOS gives wrong value for clock frequency, so guess: */
 		sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
-	} else
+	} else {
 		sn_rtc_cycles_per_second = ticks_per_sec;
+	}
 }
 
 /*
@@ -908,19 +936,19 @@ static void uv_heartbeat(unsigned long ignored)
 	struct timer_list *timer = &uv_scir_info->timer;
 	unsigned char bits = uv_scir_info->state;
 
-	/* flip heartbeat bit */
+	/* Flip heartbeat bit: */
 	bits ^= SCIR_CPU_HEARTBEAT;
 
-	/* is this cpu idle? */
+	/* Is this CPU idle? */
 	if (idle_cpu(raw_smp_processor_id()))
 		bits &= ~SCIR_CPU_ACTIVITY;
 	else
 		bits |= SCIR_CPU_ACTIVITY;
 
-	/* update system controller interface reg */
+	/* Update system controller interface reg: */
 	uv_set_scir_bits(bits);
 
-	/* enable next timer period */
+	/* Enable next timer period: */
 	mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
 }
 
@@ -935,7 +963,7 @@ static int uv_heartbeat_enable(unsigned int cpu)
 		add_timer_on(timer, cpu);
 		uv_cpu_scir_info(cpu)->enabled = 1;
 
-		/* also ensure that boot cpu is enabled */
+		/* Also ensure that boot CPU is enabled: */
 		cpu = 0;
 	}
 	return 0;
@@ -968,9 +996,11 @@ static __init int uv_init_heartbeat(void)
 {
 	int cpu;
 
-	if (is_uv_system())
+	if (is_uv_system()) {
 		for_each_online_cpu(cpu)
 			uv_heartbeat_enable(cpu);
+	}
+
 	return 0;
 }
 
@@ -979,14 +1009,10 @@ late_initcall(uv_init_heartbeat);
 #endif /* !CONFIG_HOTPLUG_CPU */
 
 /* Direct Legacy VGA I/O traffic to designated IOH */
-int uv_set_vga_state(struct pci_dev *pdev, bool decode,
-		      unsigned int command_bits, u32 flags)
+int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags)
 {
 	int domain, bus, rc;
 
-	PR_DEVEL("devfn %x decode %d cmd %x flags %d\n",
-			pdev->devfn, decode, command_bits, flags);
-
 	if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
 		return 0;
 
@@ -997,13 +1023,12 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode,
 	bus = pdev->bus->number;
 
 	rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
-	PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc);
 
 	return rc;
 }
 
 /*
- * Called on each cpu to initialize the per_cpu UV data area.
+ * Called on each CPU to initialize the per_cpu UV data area.
  * FIXME: hotplug not supported yet
  */
 void uv_cpu_init(void)
@@ -1030,90 +1055,80 @@ static void get_mn(struct mn *mnp)
 	union uvh_rh_gam_config_mmr_u m_n_config;
 	union uv3h_gr0_gam_gr_config_u m_gr_config;
 
-	m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR);
-	mnp->n_val = m_n_config.s.n_skt;
+	/* Make sure the whole structure is well initialized: */
+	memset(mnp, 0, sizeof(*mnp));
+
+	m_n_config.v	= uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR);
+	mnp->n_val	= m_n_config.s.n_skt;
+
 	if (is_uv4_hub()) {
-		mnp->m_val = 0;
-		mnp->n_lshift = 0;
+		mnp->m_val	= 0;
+		mnp->n_lshift	= 0;
 	} else if (is_uv3_hub()) {
-		mnp->m_val = m_n_config.s3.m_skt;
-		m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG);
-		mnp->n_lshift = m_gr_config.s3.m_skt;
+		mnp->m_val	= m_n_config.s3.m_skt;
+		m_gr_config.v	= uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG);
+		mnp->n_lshift	= m_gr_config.s3.m_skt;
 	} else if (is_uv2_hub()) {
-		mnp->m_val = m_n_config.s2.m_skt;
-		mnp->n_lshift = mnp->m_val == 40 ? 40 : 39;
+		mnp->m_val	= m_n_config.s2.m_skt;
+		mnp->n_lshift	= mnp->m_val == 40 ? 40 : 39;
 	} else if (is_uv1_hub()) {
-		mnp->m_val = m_n_config.s1.m_skt;
-		mnp->n_lshift = mnp->m_val;
+		mnp->m_val	= m_n_config.s1.m_skt;
+		mnp->n_lshift	= mnp->m_val;
 	}
 	mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0;
 }
 
-void __init uv_init_hub_info(struct uv_hub_info_s *hub_info)
+void __init uv_init_hub_info(struct uv_hub_info_s *hi)
 {
-	struct mn mn = {0};	/* avoid unitialized warnings */
 	union uvh_node_id_u node_id;
+	struct mn mn;
 
 	get_mn(&mn);
-	hub_info->m_val = mn.m_val;
-	hub_info->n_val = mn.n_val;
-	hub_info->m_shift = mn.m_shift;
-	hub_info->n_lshift = mn.n_lshift ? mn.n_lshift : 0;
-
-	hub_info->hub_revision = uv_hub_info->hub_revision;
-	hub_info->pnode_mask = uv_cpuid.pnode_mask;
-	hub_info->min_pnode = _min_pnode;
-	hub_info->min_socket = _min_socket;
-	hub_info->pnode_to_socket = _pnode_to_socket;
-	hub_info->socket_to_node = _socket_to_node;
-	hub_info->socket_to_pnode = _socket_to_pnode;
-	hub_info->gr_table_len = _gr_table_len;
-	hub_info->gr_table = _gr_table;
-	hub_info->gpa_mask = mn.m_val ?
+	hi->gpa_mask = mn.m_val ?
 		(1UL << (mn.m_val + mn.n_val)) - 1 :
 		(1UL << uv_cpuid.gpa_shift) - 1;
 
-	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
-	hub_info->gnode_extra =
-		(node_id.s.node_id & ~((1 << mn.n_val) - 1)) >> 1;
-
-	hub_info->gnode_upper =
-		((unsigned long)hub_info->gnode_extra << mn.m_val);
+	hi->m_val		= mn.m_val;
+	hi->n_val		= mn.n_val;
+	hi->m_shift		= mn.m_shift;
+	hi->n_lshift		= mn.n_lshift ? mn.n_lshift : 0;
+	hi->hub_revision	= uv_hub_info->hub_revision;
+	hi->pnode_mask		= uv_cpuid.pnode_mask;
+	hi->min_pnode		= _min_pnode;
+	hi->min_socket		= _min_socket;
+	hi->pnode_to_socket	= _pnode_to_socket;
+	hi->socket_to_node	= _socket_to_node;
+	hi->socket_to_pnode	= _socket_to_pnode;
+	hi->gr_table_len	= _gr_table_len;
+	hi->gr_table		= _gr_table;
+
+	node_id.v		= uv_read_local_mmr(UVH_NODE_ID);
+	uv_cpuid.gnode_shift	= max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val);
+	hi->gnode_extra		= (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1;
+	if (mn.m_val)
+		hi->gnode_upper	= (u64)hi->gnode_extra << mn.m_val;
 
 	if (uv_gp_table) {
-		hub_info->global_mmr_base = uv_gp_table->mmr_base;
-		hub_info->global_mmr_shift = uv_gp_table->mmr_shift;
-		hub_info->global_gru_base = uv_gp_table->gru_base;
-		hub_info->global_gru_shift = uv_gp_table->gru_shift;
-		hub_info->gpa_shift = uv_gp_table->gpa_shift;
-		hub_info->gpa_mask = (1UL << hub_info->gpa_shift) - 1;
+		hi->global_mmr_base	= uv_gp_table->mmr_base;
+		hi->global_mmr_shift	= uv_gp_table->mmr_shift;
+		hi->global_gru_base	= uv_gp_table->gru_base;
+		hi->global_gru_shift	= uv_gp_table->gru_shift;
+		hi->gpa_shift		= uv_gp_table->gpa_shift;
+		hi->gpa_mask		= (1UL << hi->gpa_shift) - 1;
 	} else {
-		hub_info->global_mmr_base =
-			uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
-					~UV_MMR_ENABLE;
-		hub_info->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT;
+		hi->global_mmr_base	= uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE;
+		hi->global_mmr_shift	= _UV_GLOBAL_MMR64_PNODE_SHIFT;
 	}
 
-	get_lowmem_redirect(
-		&hub_info->lowmem_remap_base, &hub_info->lowmem_remap_top);
-
-	hub_info->apic_pnode_shift = uv_cpuid.socketid_shift;
-
-	/* show system specific info */
-	pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n",
-		hub_info->n_val, hub_info->m_val,
-		hub_info->m_shift, hub_info->n_lshift);
-
-	pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n",
-		hub_info->gpa_mask, hub_info->gpa_shift,
-		hub_info->pnode_mask, hub_info->apic_pnode_shift);
+	get_lowmem_redirect(&hi->lowmem_remap_base, &hi->lowmem_remap_top);
 
-	pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n",
-		hub_info->global_mmr_base, hub_info->global_mmr_shift,
-		hub_info->global_gru_base, hub_info->global_gru_shift);
+	hi->apic_pnode_shift = uv_cpuid.socketid_shift;
 
-	pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n",
-		hub_info->gnode_upper, hub_info->gnode_extra);
+	/* Show system specific info: */
+	pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift);
+	pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift);
+	pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift, hi->global_gru_base, hi->global_gru_shift);
+	pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra);
 }
 
 static void __init decode_gam_params(unsigned long ptr)
@@ -1139,12 +1154,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
 	for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
 		if (!index) {
 			pr_info("UV: GAM Range Table...\n");
-			pr_info("UV:  # %20s %14s %5s %4s %5s %3s %2s\n",
-				"Range", "", "Size", "Type", "NASID",
-				"SID", "PN");
+			pr_info("UV:  # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN");
 		}
-		pr_info(
-		"UV: %2d: 0x%014lx-0x%014lx %5luG %3d   %04x  %02x %02x\n",
+		pr_info("UV: %2d: 0x%014lx-0x%014lx %5luG %3d   %04x  %02x %02x\n",
 			index++,
 			(unsigned long)lgre << UV_GAM_RANGE_SHFT,
 			(unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
@@ -1162,29 +1174,32 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
 		if (pnode_max < gre->pnode)
 			pnode_max = gre->pnode;
 	}
-	_min_socket = sock_min;
-	_max_socket = sock_max;
-	_min_pnode = pnode_min;
-	_max_pnode = pnode_max;
-	_gr_table_len = index;
-	pr_info(
-	"UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n",
-		index, _min_socket, _max_socket, _min_pnode, _max_pnode);
+	_min_socket	= sock_min;
+	_max_socket	= sock_max;
+	_min_pnode	= pnode_min;
+	_max_pnode	= pnode_max;
+	_gr_table_len	= index;
+
+	pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode);
 }
 
-static void __init decode_uv_systab(void)
+static int __init decode_uv_systab(void)
 {
 	struct uv_systab *st;
 	int i;
 
+	if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE)
+		return 0;	/* No extended UVsystab required */
+
 	st = uv_systab;
-	if ((!st || st->revision < UV_SYSTAB_VERSION_UV4) && !is_uv4_hub())
-		return;
-	if (st->revision != UV_SYSTAB_VERSION_UV4_LATEST) {
-		pr_crit(
-		"UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n",
-			st->revision, UV_SYSTAB_VERSION_UV4_LATEST);
-		BUG();
+	if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) {
+		int rev = st ? st->revision : 0;
+
+		pr_err("UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", rev, UV_SYSTAB_VERSION_UV4_LATEST);
+		pr_err("UV: Cannot support UV operations, switching to generic PC\n");
+		uv_system_type = UV_NONE;
+
+		return -EINVAL;
 	}
 
 	for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) {
@@ -1205,10 +1220,11 @@ static void __init decode_uv_systab(void)
 			break;
 		}
 	}
+	return 0;
 }
 
 /*
- * Setup physical blade translations from UVH_NODE_PRESENT_TABLE
+ * Set up physical blade translations from UVH_NODE_PRESENT_TABLE
  * .. NB: UVH_NODE_PRESENT_TABLE is going away,
  * .. being replaced by GAM Range Table
  */
@@ -1244,14 +1260,13 @@ static void __init build_socket_tables(void)
 	if (!gre) {
 		if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) {
 			pr_info("UV: No UVsystab socket table, ignoring\n");
-			return;		/* not required */
+			return;
 		}
-		pr_crit(
-		"UV: Error: UVsystab address translations not available!\n");
+		pr_crit("UV: Error: UVsystab address translations not available!\n");
 		BUG();
 	}
 
-	/* build socket id -> node id, pnode */
+	/* Build socket id -> node id, pnode */
 	num = maxsock - minsock + 1;
 	bytes = num * sizeof(_socket_to_node[0]);
 	_socket_to_node = kmalloc(bytes, GFP_KERNEL);
@@ -1268,27 +1283,27 @@ static void __init build_socket_tables(void)
 	for (i = 0; i < nump; i++)
 		_pnode_to_socket[i] = SOCK_EMPTY;
 
-	/* fill in pnode/node/addr conversion list values */
+	/* Fill in pnode/node/addr conversion list values: */
 	pr_info("UV: GAM Building socket/pnode conversion tables\n");
 	for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
 		if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
 			continue;
 		i = gre->sockid - minsock;
+		/* Duplicate: */
 		if (_socket_to_pnode[i] != SOCK_EMPTY)
-			continue;	/* duplicate */
+			continue;
 		_socket_to_pnode[i] = gre->pnode;
 
 		i = gre->pnode - minpnode;
 		_pnode_to_socket[i] = gre->sockid;
 
-		pr_info(
-		"UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
+		pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
 			gre->sockid, gre->type, gre->nasid,
 			_socket_to_pnode[gre->sockid - minsock],
 			_pnode_to_socket[gre->pnode - minpnode]);
 	}
 
-	/* Set socket -> node values */
+	/* Set socket -> node values: */
 	lnid = -1;
 	for_each_present_cpu(cpu) {
 		int nid = cpu_to_node(cpu);
@@ -1304,7 +1319,7 @@ static void __init build_socket_tables(void)
 			sockid, apicid, nid);
 	}
 
-	/* Setup physical blade to pnode translation from GAM Range Table */
+	/* Set up physical blade to pnode translation from GAM Range Table: */
 	bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]);
 	_node_to_pnode = kmalloc(bytes, GFP_KERNEL);
 	BUG_ON(!_node_to_pnode);
@@ -1314,8 +1329,7 @@ static void __init build_socket_tables(void)
 
 		for (sockid = minsock; sockid <= maxsock; sockid++) {
 			if (lnid == _socket_to_node[sockid - minsock]) {
-				_node_to_pnode[lnid] =
-					_socket_to_pnode[sockid - minsock];
+				_node_to_pnode[lnid] = _socket_to_pnode[sockid - minsock];
 				break;
 			}
 		}
@@ -1332,8 +1346,7 @@ static void __init build_socket_tables(void)
 	pr_info("UV: Checking socket->node/pnode for identity maps\n");
 	if (minsock == 0) {
 		for (i = 0; i < num; i++)
-			if (_socket_to_node[i] == SOCK_EMPTY ||
-				i != _socket_to_node[i])
+			if (_socket_to_node[i] == SOCK_EMPTY || i != _socket_to_node[i])
 				break;
 		if (i >= num) {
 			kfree(_socket_to_node);
@@ -1354,7 +1367,7 @@ static void __init build_socket_tables(void)
 	}
 }
 
-void __init uv_system_init(void)
+static void __init uv_system_init_hub(void)
 {
 	struct uv_hub_info_s hub_info = {0};
 	int bytes, cpu, nodeid;
@@ -1372,8 +1385,13 @@ void __init uv_system_init(void)
 
 	map_low_mmrs();
 
-	uv_bios_init();			/* get uv_systab for decoding */
-	decode_uv_systab();
+	/* Get uv_systab for decoding: */
+	uv_bios_init();
+
+	/* If there's an UVsystab problem then abort UV init: */
+	if (decode_uv_systab() < 0)
+		return;
+
 	build_socket_tables();
 	build_uv_gr_table();
 	uv_init_hub_info(&hub_info);
@@ -1381,14 +1399,10 @@ void __init uv_system_init(void)
 	if (!_node_to_pnode)
 		boot_init_possible_blades(&hub_info);
 
-	/* uv_num_possible_blades() is really the hub count */
-	pr_info("UV: Found %d hubs, %d nodes, %d cpus\n",
-			uv_num_possible_blades(),
-			num_possible_nodes(),
-			num_possible_cpus());
+	/* uv_num_possible_blades() is really the hub count: */
+	pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus());
 
-	uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id,
-			    &sn_region_size, &system_serial_number);
+	uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number);
 	hub_info.coherency_domain_number = sn_coherency_id;
 	uv_rtc_init();
 
@@ -1401,33 +1415,31 @@ void __init uv_system_init(void)
 		struct uv_hub_info_s *new_hub;
 
 		if (__uv_hub_info_list[nodeid]) {
-			pr_err("UV: Node %d UV HUB already initialized!?\n",
-				nodeid);
+			pr_err("UV: Node %d UV HUB already initialized!?\n", nodeid);
 			BUG();
 		}
 
 		/* Allocate new per hub info list */
-		new_hub = (nodeid == 0) ?
-			&uv_hub_info_node0 :
-			kzalloc_node(bytes, GFP_KERNEL, nodeid);
+		new_hub = (nodeid == 0) ?  &uv_hub_info_node0 : kzalloc_node(bytes, GFP_KERNEL, nodeid);
 		BUG_ON(!new_hub);
 		__uv_hub_info_list[nodeid] = new_hub;
 		new_hub = uv_hub_info_list(nodeid);
 		BUG_ON(!new_hub);
 		*new_hub = hub_info;
 
-		/* Use information from GAM table if available */
+		/* Use information from GAM table if available: */
 		if (_node_to_pnode)
 			new_hub->pnode = _node_to_pnode[nodeid];
-		else	/* Fill in during cpu loop */
+		else /* Or fill in during CPU loop: */
 			new_hub->pnode = 0xffff;
+
 		new_hub->numa_blade_id = uv_node_to_blade_id(nodeid);
 		new_hub->memory_nid = -1;
 		new_hub->nr_possible_cpus = 0;
 		new_hub->nr_online_cpus = 0;
 	}
 
-	/* Initialize per cpu info */
+	/* Initialize per CPU info: */
 	for_each_possible_cpu(cpu) {
 		int apicid = per_cpu(x86_cpu_to_apicid, cpu);
 		int numa_node_id;
@@ -1438,22 +1450,24 @@ void __init uv_system_init(void)
 		pnode = uv_apicid_to_pnode(apicid);
 
 		uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid);
-		uv_cpu_info_per(cpu)->blade_cpu_id =
-			uv_cpu_hub_info(cpu)->nr_possible_cpus++;
+		uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++;
 		if (uv_cpu_hub_info(cpu)->memory_nid == -1)
 			uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
-		if (nodeid != numa_node_id &&	/* init memoryless node */
+
+		/* Init memoryless node: */
+		if (nodeid != numa_node_id &&
 		    uv_hub_info_list(numa_node_id)->pnode == 0xffff)
 			uv_hub_info_list(numa_node_id)->pnode = pnode;
 		else if (uv_cpu_hub_info(cpu)->pnode == 0xffff)
 			uv_cpu_hub_info(cpu)->pnode = pnode;
+
 		uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid);
 	}
 
 	for_each_node(nodeid) {
 		unsigned short pnode = uv_hub_info_list(nodeid)->pnode;
 
-		/* Add pnode info for pre-GAM list nodes without cpus */
+		/* Add pnode info for pre-GAM list nodes without CPUs: */
 		if (pnode == 0xffff) {
 			unsigned long paddr;
 
@@ -1479,15 +1493,30 @@ void __init uv_system_init(void)
 	uv_scir_register_cpu_notifier();
 	proc_mkdir("sgi_uv", NULL);
 
-	/* register Legacy VGA I/O redirection handler */
+	/* Register Legacy VGA I/O redirection handler: */
 	pci_register_set_vga_state(uv_set_vga_state);
 
 	/*
 	 * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
-	 * EFI is not enabled in the kdump kernel.
+	 * EFI is not enabled in the kdump kernel:
 	 */
 	if (is_kdump_kernel())
 		reboot_type = BOOT_ACPI;
 }
 
+/*
+ * There is a small amount of UV specific code needed to initialize a
+ * UV system that does not have a "UV HUB" (referred to as "hubless").
+ */
+void __init uv_system_init(void)
+{
+	if (likely(!is_uv_system() && !is_uv_hubless()))
+		return;
+
+	if (is_uv_system())
+		uv_system_init_hub();
+	else
+		uv_nmi_setup_hubless();
+}
+
 apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 51287cd90bf6..5a414545e8a3 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -218,7 +218,8 @@
 #include <linux/apm_bios.h>
 #include <linux/init.h>
 #include <linux/time.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
 #include <linux/pm.h>
 #include <linux/capability.h>
 #include <linux/device.h>
@@ -234,7 +235,7 @@
 #include <linux/i8253.h>
 #include <linux/cpuidle.h>
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/desc.h>
 #include <asm/olpc.h>
 #include <asm/paravirt.h>
@@ -905,21 +906,21 @@ static int apm_cpu_idle(struct cpuidle_device *dev,
 {
 	static int use_apm_idle; /* = 0 */
 	static unsigned int last_jiffies; /* = 0 */
-	static unsigned int last_stime; /* = 0 */
-	cputime_t stime;
+	static u64 last_stime; /* = 0 */
+	u64 stime, utime;
 
 	int apm_idle_done = 0;
 	unsigned int jiffies_since_last_check = jiffies - last_jiffies;
 	unsigned int bucket;
 
 recalc:
-	task_cputime(current, NULL, &stime);
+	task_cputime(current, &utime, &stime);
 	if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
 		use_apm_idle = 0;
 	} else if (jiffies_since_last_check > idle_period) {
 		unsigned int idle_percentage;
 
-		idle_percentage = cputime_to_jiffies(stime - last_stime);
+		idle_percentage = nsecs_to_jiffies(stime - last_stime);
 		idle_percentage *= 100;
 		idle_percentage /= jiffies_since_last_check;
 		use_apm_idle = (idle_percentage > idle_threshold);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index c62e015b126c..de827d6ac8c2 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -81,6 +81,7 @@ void common(void) {
 
 	BLANK();
 	OFFSET(BP_scratch, boot_params, scratch);
+	OFFSET(BP_secure_boot, boot_params, secure_boot);
 	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_version, boot_params, hdr.version);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 210927ee2e74..99332f550c48 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -13,6 +13,10 @@ static char syscalls_ia32[] = {
 #include <asm/syscalls_32.h>
 };
 
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#include <asm/kvm_para.h>
+#endif
+
 int main(void)
 {
 #ifdef CONFIG_PARAVIRT
@@ -22,6 +26,11 @@ int main(void)
 	BLANK();
 #endif
 
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+	OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
+	BLANK();
+#endif
+
 #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
 	ENTRY(bx);
 	ENTRY(cx);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4a8697f7d4ef..52000010c62e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -20,13 +20,11 @@ obj-y			:= intel_cacheinfo.o scattered.o topology.o
 obj-y			+= common.o
 obj-y			+= rdrand.o
 obj-y			+= match.o
+obj-y			+= bugs.o
 
 obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
 
-obj-$(CONFIG_X86_32)	+= bugs.o
-obj-$(CONFIG_X86_64)	+= bugs_64.o
-
 obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o
 obj-$(CONFIG_CPU_SUP_AMD)		+= amd.o
 obj-$(CONFIG_CPU_SUP_CYRIX_32)		+= cyrix.o
@@ -34,6 +32,8 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
+obj-$(CONFIG_INTEL_RDT_A)	+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
+
 obj-$(CONFIG_X86_MCE)			+= mcheck/
 obj-$(CONFIG_MTRR)			+= mtrr/
 obj-$(CONFIG_MICROCODE)			+= microcode/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1e81a37c034e..c36140d788fe 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 
 #include <linux/io.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/random.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
@@ -20,6 +21,10 @@
 
 #include "cpu.h"
 
+static const int amd_erratum_383[];
+static const int amd_erratum_400[];
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+
 /*
  * nodes_per_socket: Stores the number of nodes per socket.
  * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
@@ -308,17 +313,43 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		u32 eax, ebx, ecx, edx;
 
 		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-		node_id = ecx & 7;
 
-		/* get compute unit information */
-		smp_num_siblings = ((ebx >> 8) & 3) + 1;
-		c->x86_max_cores /= smp_num_siblings;
-		c->cpu_core_id = ebx & 0xff;
+		node_id  = ecx & 0xff;
+		smp_num_siblings = ((ebx >> 8) & 0xff) + 1;
+
+		if (c->x86 == 0x15)
+			c->cu_id = ebx & 0xff;
+
+		if (c->x86 >= 0x17) {
+			c->cpu_core_id = ebx & 0xff;
+
+			if (smp_num_siblings > 1)
+				c->x86_max_cores /= smp_num_siblings;
+		}
+
+		/*
+		 * We may have multiple LLCs if L3 caches exist, so check if we
+		 * have an L3 cache by looking at the L3 cache CPUID leaf.
+		 */
+		if (cpuid_edx(0x80000006)) {
+			if (c->x86 == 0x17) {
+				/*
+				 * LLC is at the core complex level.
+				 * Core complex id is ApicId[3].
+				 */
+				per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+			} else {
+				/* LLC is at the node level. */
+				per_cpu(cpu_llc_id, cpu) = node_id;
+			}
+		}
 	} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
 		u64 value;
 
 		rdmsrl(MSR_FAM10H_NODE_ID, value);
 		node_id = value & 7;
+
+		per_cpu(cpu_llc_id, cpu) = node_id;
 	} else
 		return;
 
@@ -329,9 +360,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_AMD_DCM);
 		cus_per_node = c->x86_max_cores / nodes_per_socket;
 
-		/* store NodeID, use llc_shared_map to store sibling info */
-		per_cpu(cpu_llc_id, cpu) = node_id;
-
 		/* core id has to be in the [0 .. cores_per_node - 1] range */
 		c->cpu_core_id %= cus_per_node;
 	}
@@ -356,15 +384,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
 	/* use socket ID also for last level cache */
 	per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
 	amd_get_topology(c);
-
-	/*
-	 * Fix percpu cpu_llc_id here as LLC topology is different
-	 * for Fam17h systems.
-	 */
-	 if (c->x86 != 0x17 || !cpuid_edx(0x80000006))
-		return;
-
-	per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
 #endif
 }
 
@@ -537,8 +556,6 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 	if (c->x86_power & (1 << 8)) {
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
-		if (!check_tsc_unstable())
-			set_sched_clock_stable();
 	}
 
 	/* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
@@ -585,11 +602,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 	/* F16h erratum 793, CVE-2013-6885 */
 	if (c->x86 == 0x16 && c->x86_model <= 0xf)
 		msr_set_bit(MSR_AMD64_LS_CFG, 15);
-}
 
-static const int amd_erratum_383[];
-static const int amd_erratum_400[];
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+	/*
+	 * Check whether the machine is affected by erratum 400. This is
+	 * used to select the proper idle routine and to enable the check
+	 * whether the machine is affected in arch_post_acpi_init(), which
+	 * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+	 */
+	if (cpu_has_amd_erratum(c, amd_erratum_400))
+		set_cpu_bug(c, X86_BUG_AMD_E400);
+}
 
 static void init_amd_k8(struct cpuinfo_x86 *c)
 {
@@ -770,9 +792,6 @@ static void init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 > 0x11)
 		set_cpu_cap(c, X86_FEATURE_ARAT);
 
-	if (cpu_has_amd_erratum(c, amd_erratum_400))
-		set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
-
 	rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
 
 	/* 3DNow or LM implies PREFETCHW */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bd17db15a2c1..a44ef52184df 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -16,15 +16,19 @@
 #include <asm/msr.h>
 #include <asm/paravirt.h>
 #include <asm/alternative.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
 
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
-#ifndef CONFIG_SMP
-	pr_info("CPU: ");
-	print_cpu_info(&boot_cpu_data);
-#endif
 
+	if (!IS_ENABLED(CONFIG_SMP)) {
+		pr_info("CPU: ");
+		print_cpu_info(&boot_cpu_data);
+	}
+
+#ifdef CONFIG_X86_32
 	/*
 	 * Check whether we are able to run this kernel safely on SMP.
 	 *
@@ -40,4 +44,18 @@ void __init check_bugs(void)
 	alternative_instructions();
 
 	fpu__init_check_bugs();
+#else /* CONFIG_X86_64 */
+	alternative_instructions();
+
+	/*
+	 * Make sure the first 2MB area is not mapped by huge pages
+	 * There are typically fixed size MTRRs in there and overlapping
+	 * MTRRs into large pages causes slow downs.
+	 *
+	 * Right now we don't do that with gbpages because there seems
+	 * very little benefit for that case.
+	 */
+	if (!direct_gbpages)
+		set_memory_4k((unsigned long)__va(0), 1);
+#endif
 }
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
deleted file mode 100644
index a972ac4c7e7d..000000000000
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *  Copyright (C) 1994  Linus Torvalds
- *  Copyright (C) 2000  SuSE
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/alternative.h>
-#include <asm/bugs.h>
-#include <asm/processor.h>
-#include <asm/mtrr.h>
-#include <asm/cacheflush.h>
-
-void __init check_bugs(void)
-{
-	identify_boot_cpu();
-#if !defined(CONFIG_SMP)
-	pr_info("CPU: ");
-	print_cpu_info(&boot_cpu_data);
-#endif
-	alternative_instructions();
-
-	/*
-	 * Make sure the first 2MB area is not mapped by huge pages
-	 * There are typically fixed size MTRRs in there and overlapping
-	 * MTRRs into large pages causes slow downs.
-	 *
-	 * Right now we don't do that with gbpages because there seems
-	 * very little benefit for that case.
-	 */
-	if (!direct_gbpages)
-		set_memory_4k((unsigned long)__va(0), 1);
-}
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 1661d8ec9280..43955ee6715b 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,5 +1,6 @@
-#include <linux/bitops.h>
-#include <linux/kernel.h>
+
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
 
 #include <asm/cpufeature.h>
 #include <asm/e820.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc9e980c68ec..58094a1f9e9d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -7,7 +7,9 @@
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/task.h>
 #include <linux/init.h>
 #include <linux/kprobes.h>
 #include <linux/kgdb.h>
@@ -35,6 +37,7 @@
 #include <asm/desc.h>
 #include <asm/fpu/internal.h>
 #include <asm/mtrr.h>
+#include <asm/hwcap2.h>
 #include <linux/numa.h>
 #include <asm/asm.h>
 #include <asm/bugs.h>
@@ -51,6 +54,8 @@
 
 #include "cpu.h"
 
+u32 elf_hwcap2 __read_mostly;
+
 /* all of these masks are initialized in setup_cpu_local_masks() */
 cpumask_var_t cpu_initialized_mask;
 cpumask_var_t cpu_callout_mask;
@@ -655,6 +660,16 @@ void cpu_detect(struct cpuinfo_x86 *c)
 	}
 }
 
+static void apply_forced_caps(struct cpuinfo_x86 *c)
+{
+	int i;
+
+	for (i = 0; i < NCAPINTS; i++) {
+		c->x86_capability[i] &= ~cpu_caps_cleared[i];
+		c->x86_capability[i] |= cpu_caps_set[i];
+	}
+}
+
 void get_cpu_cap(struct cpuinfo_x86 *c)
 {
 	u32 eax, ebx, ecx, edx;
@@ -667,13 +682,14 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_capability[CPUID_1_EDX] = edx;
 	}
 
+	/* Thermal and Power Management Leaf: level 0x00000006 (eax) */
+	if (c->cpuid_level >= 0x00000006)
+		c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+
 	/* Additional Intel-defined flags: level 0x00000007 */
 	if (c->cpuid_level >= 0x00000007) {
 		cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
-
 		c->x86_capability[CPUID_7_0_EBX] = ebx;
-
-		c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
 		c->x86_capability[CPUID_7_ECX] = ecx;
 	}
 
@@ -747,6 +763,13 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
 
 	init_scattered_cpuid_features(c);
+
+	/*
+	 * Clear/Set all flags overridden by options, after probe.
+	 * This needs to happen each time we re-probe, which may happen
+	 * several times during CPU initialization.
+	 */
+	apply_forced_caps(c);
 }
 
 static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -800,14 +823,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	memset(&c->x86_capability, 0, sizeof c->x86_capability);
 	c->extended_cpuid_level = 0;
 
-	if (!have_cpuid_p())
-		identify_cpu_without_cpuid(c);
-
 	/* cyrix could have cpuid enabled via c_identify()*/
 	if (have_cpuid_p()) {
 		cpu_detect(c);
 		get_cpu_vendor(c);
 		get_cpu_cap(c);
+		setup_force_cpu_cap(X86_FEATURE_CPUID);
 
 		if (this_cpu->c_early_init)
 			this_cpu->c_early_init(c);
@@ -817,6 +838,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 
 		if (this_cpu->c_bsp_init)
 			this_cpu->c_bsp_init(c);
+	} else {
+		identify_cpu_without_cpuid(c);
+		setup_clear_cpu_cap(X86_FEATURE_CPUID);
 	}
 
 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
@@ -979,29 +1003,21 @@ static void x86_init_cache_qos(struct cpuinfo_x86 *c)
 }
 
 /*
- * The physical to logical package id mapping is initialized from the
- * acpi/mptables information. Make sure that CPUID actually agrees with
- * that.
+ * Validate that ACPI/mptables have the same information about the
+ * effective APIC id and update the package map.
  */
-static void sanitize_package_id(struct cpuinfo_x86 *c)
+static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
-	unsigned int pkg, apicid, cpu = smp_processor_id();
+	unsigned int apicid, cpu = smp_processor_id();
 
 	apicid = apic->cpu_present_to_apicid(cpu);
-	pkg = apicid >> boot_cpu_data.x86_coreid_bits;
 
-	if (apicid != c->initial_apicid) {
-		pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x CPUID: %x\n",
+	if (apicid != c->apicid) {
+		pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
 		       cpu, apicid, c->initial_apicid);
-		c->initial_apicid = apicid;
-	}
-	if (pkg != c->phys_proc_id) {
-		pr_err(FW_BUG "CPU%u: Using firmware package id %u instead of %u\n",
-		       cpu, pkg, c->phys_proc_id);
-		c->phys_proc_id = pkg;
 	}
-	c->logical_proc_id = topology_phys_to_logical_pkg(pkg);
+	BUG_ON(topology_update_package_map(c->phys_proc_id, cpu));
 #else
 	c->logical_proc_id = 0;
 #endif
@@ -1022,6 +1038,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	c->x86_model_id[0] = '\0';  /* Unset */
 	c->x86_max_cores = 1;
 	c->x86_coreid_bits = 0;
+	c->cu_id = 0xff;
 #ifdef CONFIG_X86_64
 	c->x86_clflush_size = 64;
 	c->x86_phys_bits = 36;
@@ -1041,10 +1058,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 		this_cpu->c_identify(c);
 
 	/* Clear/Set all flags overridden by options, after probe */
-	for (i = 0; i < NCAPINTS; i++) {
-		c->x86_capability[i] &= ~cpu_caps_cleared[i];
-		c->x86_capability[i] |= cpu_caps_set[i];
-	}
+	apply_forced_caps(c);
 
 #ifdef CONFIG_X86_64
 	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
@@ -1103,10 +1117,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	 * Clear/Set all flags overridden by options, need do it
 	 * before following smp all cpus cap AND.
 	 */
-	for (i = 0; i < NCAPINTS; i++) {
-		c->x86_capability[i] &= ~cpu_caps_cleared[i];
-		c->x86_capability[i] |= cpu_caps_set[i];
-	}
+	apply_forced_caps(c);
 
 	/*
 	 * On SMP, boot_cpu_data holds the common feature set between
@@ -1132,7 +1143,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 #ifdef CONFIG_NUMA
 	numa_add_cpu(smp_processor_id());
 #endif
-	sanitize_package_id(c);
 }
 
 /*
@@ -1172,7 +1182,6 @@ void enable_sep_cpu(void)
 void __init identify_boot_cpu(void)
 {
 	identify_cpu(&boot_cpu_data);
-	init_amd_e400_c1e_mask();
 #ifdef CONFIG_X86_32
 	sysenter_setup();
 	enable_sep_cpu();
@@ -1188,53 +1197,9 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
 	enable_sep_cpu();
 #endif
 	mtrr_ap_init();
+	validate_apic_and_package_id(c);
 }
 
-struct msr_range {
-	unsigned	min;
-	unsigned	max;
-};
-
-static const struct msr_range msr_range_array[] = {
-	{ 0x00000000, 0x00000418},
-	{ 0xc0000000, 0xc000040b},
-	{ 0xc0010000, 0xc0010142},
-	{ 0xc0011000, 0xc001103b},
-};
-
-static void __print_cpu_msr(void)
-{
-	unsigned index_min, index_max;
-	unsigned index;
-	u64 val;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
-		index_min = msr_range_array[i].min;
-		index_max = msr_range_array[i].max;
-
-		for (index = index_min; index < index_max; index++) {
-			if (rdmsrl_safe(index, &val))
-				continue;
-			pr_info(" MSR%08x: %016llx\n", index, val);
-		}
-	}
-}
-
-static int show_msr;
-
-static __init int setup_show_msr(char *arg)
-{
-	int num;
-
-	get_option(&arg, &num);
-
-	if (num > 0)
-		show_msr = num;
-	return 1;
-}
-__setup("show_msr=", setup_show_msr);
-
 static __init int setup_noclflush(char *arg)
 {
 	setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
@@ -1268,21 +1233,13 @@ void print_cpu_info(struct cpuinfo_x86 *c)
 		pr_cont(", stepping: 0x%x)\n", c->x86_mask);
 	else
 		pr_cont(")\n");
-
-	print_cpu_msr(c);
-}
-
-void print_cpu_msr(struct cpuinfo_x86 *c)
-{
-	if (c->cpu_index < show_msr)
-		__print_cpu_msr();
 }
 
 static __init int setup_disablecpuid(char *arg)
 {
 	int bit;
 
-	if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+	if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32)
 		setup_clear_cpu_cap(bit);
 	else
 		return 0;
@@ -1490,11 +1447,8 @@ void cpu_init(void)
 	 */
 	cr4_init_shadow();
 
-	/*
-	 * Load microcode on this cpu if a valid microcode is available.
-	 * This is early microcode loading procedure.
-	 */
-	load_ucode_ap();
+	if (cpu)
+		load_ucode_ap();
 
 	t = &per_cpu(cpu_tss, cpu);
 	oist = &per_cpu(orig_ist, cpu);
@@ -1555,7 +1509,7 @@ void cpu_init(void)
 	for (i = 0; i <= IO_BITMAP_LONGS; i++)
 		t->io_bitmap[i] = ~0UL;
 
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	me->active_mm = &init_mm;
 	BUG_ON(me->mm);
 	enter_lazy_tlb(&init_mm, me);
@@ -1606,7 +1560,7 @@ void cpu_init(void)
 	/*
 	 * Set up and load the per-CPU TSS and LDT
 	 */
-	atomic_inc(&init_mm.mm_count);
+	mmgrab(&init_mm);
 	curr->active_mm = &init_mm;
 	BUG_ON(curr->mm);
 	enter_lazy_tlb(&init_mm, curr);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index bd9dcd6b712d..a70fd61095f8 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -9,6 +9,8 @@
 #include <asm/pci-direct.h>
 #include <asm/tsc.h>
 #include <asm/cpufeature.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
 
 #include "cpu.h"
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fcd484d2bb03..063197771b8d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -4,6 +4,7 @@
 #include <linux/bitops.h>
 #include <linux/smp.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/thread_info.h>
 #include <linux/init.h>
 #include <linux/uaccess.h>
@@ -14,6 +15,9 @@
 #include <asm/bugs.h>
 #include <asm/cpu.h>
 #include <asm/intel-family.h>
+#include <asm/microcode_intel.h>
+#include <asm/hwcap2.h>
+#include <asm/elf.h>
 
 #ifdef CONFIG_X86_64
 #include <linux/topology.h>
@@ -61,6 +65,46 @@ void check_mpx_erratum(struct cpuinfo_x86 *c)
 	}
 }
 
+static bool ring3mwait_disabled __read_mostly;
+
+static int __init ring3mwait_disable(char *__unused)
+{
+	ring3mwait_disabled = true;
+	return 0;
+}
+__setup("ring3mwait=disable", ring3mwait_disable);
+
+static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
+{
+	/*
+	 * Ring 3 MONITOR/MWAIT feature cannot be detected without
+	 * cpu model and family comparison.
+	 */
+	if (c->x86 != 6)
+		return;
+	switch (c->x86_model) {
+	case INTEL_FAM6_XEON_PHI_KNL:
+	case INTEL_FAM6_XEON_PHI_KNM:
+		break;
+	default:
+		return;
+	}
+
+	if (ring3mwait_disabled) {
+		msr_clear_bit(MSR_MISC_FEATURE_ENABLES,
+			      MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT);
+		return;
+	}
+
+	msr_set_bit(MSR_MISC_FEATURE_ENABLES,
+		    MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT);
+
+	set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
+
+	if (c == &boot_cpu_data)
+		ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
+}
+
 static void early_init_intel(struct cpuinfo_x86 *c)
 {
 	u64 misc_enable;
@@ -78,14 +122,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 		(c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 
-	if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
-		unsigned lower_word;
-
-		wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-		/* Required by the SDM */
-		sync_core();
-		rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
-	}
+	if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
+		c->microcode = intel_get_microcode_revision();
 
 	/*
 	 * Atom erratum AAE44/AAF40/AAG38/AAH41:
@@ -124,8 +162,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 	if (c->x86_power & (1 << 8)) {
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
-		if (!check_tsc_unstable())
-			set_sched_clock_stable();
 	}
 
 	/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
@@ -565,6 +601,8 @@ static void init_intel(struct cpuinfo_x86 *c)
 		detect_vmx_virtcap(c);
 
 	init_intel_energy_perf(c);
+
+	probe_xeon_phi_r3mwait(c);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index de6626c18e42..c55fb2cb2acc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -11,6 +11,7 @@
 #include <linux/cacheinfo.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/capability.h>
 #include <linux/sysfs.h>
 #include <linux/pci.h>
 
@@ -153,6 +154,7 @@ struct _cpuid4_info_regs {
 	union _cpuid4_leaf_eax eax;
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
+	unsigned int id;
 	unsigned long size;
 	struct amd_northbridge *nb;
 };
@@ -894,6 +896,8 @@ static void __cache_cpumap_setup(unsigned int cpu, int index,
 static void ci_leaf_init(struct cacheinfo *this_leaf,
 			 struct _cpuid4_info_regs *base)
 {
+	this_leaf->id = base->id;
+	this_leaf->attributes = CACHE_ID;
 	this_leaf->level = base->eax.split.level;
 	this_leaf->type = cache_type_map[base->eax.split.type];
 	this_leaf->coherency_line_size =
@@ -920,6 +924,22 @@ static int __init_cache_level(unsigned int cpu)
 	return 0;
 }
 
+/*
+ * The max shared threads number comes from CPUID.4:EAX[25-14] with input
+ * ECX as cache index. Then right shift apicid by the number's order to get
+ * cache id for this cache node.
+ */
+static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	unsigned long num_threads_sharing;
+	int index_msb;
+
+	num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing;
+	index_msb = get_count_order(num_threads_sharing);
+	id4_regs->id = c->apicid >> index_msb;
+}
+
 static int __populate_cache_leaves(unsigned int cpu)
 {
 	unsigned int idx, ret;
@@ -931,9 +951,12 @@ static int __populate_cache_leaves(unsigned int cpu)
 		ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
 		if (ret)
 			return ret;
+		get_cache_id(cpu, &id4_regs);
 		ci_leaf_init(this_leaf++, &id4_regs);
 		__cache_cpumap_setup(cpu, idx, &id4_regs);
 	}
+	this_cpu_ci->cpu_map_populated = true;
+
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
new file mode 100644
index 000000000000..5a533fefefa0
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -0,0 +1,403 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ *    Tony Luck <tony.luck@intel.com>
+ *    Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/intel-family.h>
+#include <asm/intel_rdt.h>
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
+DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+
+#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
+
+struct rdt_resource rdt_resources_all[] = {
+	{
+		.name		= "L3",
+		.domains	= domain_init(RDT_RESOURCE_L3),
+		.msr_base	= IA32_L3_CBM_BASE,
+		.min_cbm_bits	= 1,
+		.cache_level	= 3,
+		.cbm_idx_multi	= 1,
+		.cbm_idx_offset	= 0
+	},
+	{
+		.name		= "L3DATA",
+		.domains	= domain_init(RDT_RESOURCE_L3DATA),
+		.msr_base	= IA32_L3_CBM_BASE,
+		.min_cbm_bits	= 1,
+		.cache_level	= 3,
+		.cbm_idx_multi	= 2,
+		.cbm_idx_offset	= 0
+	},
+	{
+		.name		= "L3CODE",
+		.domains	= domain_init(RDT_RESOURCE_L3CODE),
+		.msr_base	= IA32_L3_CBM_BASE,
+		.min_cbm_bits	= 1,
+		.cache_level	= 3,
+		.cbm_idx_multi	= 2,
+		.cbm_idx_offset	= 1
+	},
+	{
+		.name		= "L2",
+		.domains	= domain_init(RDT_RESOURCE_L2),
+		.msr_base	= IA32_L2_CBM_BASE,
+		.min_cbm_bits	= 1,
+		.cache_level	= 2,
+		.cbm_idx_multi	= 1,
+		.cbm_idx_offset	= 0
+	},
+};
+
+static int cbm_idx(struct rdt_resource *r, int closid)
+{
+	return closid * r->cbm_idx_multi + r->cbm_idx_offset;
+}
+
+/*
+ * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
+ * as they do not have CPUID enumeration support for Cache allocation.
+ * The check for Vendor/Family/Model is not enough to guarantee that
+ * the MSRs won't #GP fault because only the following SKUs support
+ * CAT:
+ *	Intel(R) Xeon(R)  CPU E5-2658  v3  @  2.20GHz
+ *	Intel(R) Xeon(R)  CPU E5-2648L v3  @  1.80GHz
+ *	Intel(R) Xeon(R)  CPU E5-2628L v3  @  2.00GHz
+ *	Intel(R) Xeon(R)  CPU E5-2618L v3  @  2.30GHz
+ *	Intel(R) Xeon(R)  CPU E5-2608L v3  @  2.00GHz
+ *	Intel(R) Xeon(R)  CPU E5-2658A v3  @  2.20GHz
+ *
+ * Probe by trying to write the first of the L3 cach mask registers
+ * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
+ * is always 20 on hsw server parts. The minimum cache bitmask length
+ * allowed for HSW server is always 2 bits. Hardcode all of them.
+ */
+static inline bool cache_alloc_hsw_probe(void)
+{
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+	    boot_cpu_data.x86 == 6 &&
+	    boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) {
+		struct rdt_resource *r  = &rdt_resources_all[RDT_RESOURCE_L3];
+		u32 l, h, max_cbm = BIT_MASK(20) - 1;
+
+		if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
+			return false;
+		rdmsr(IA32_L3_CBM_BASE, l, h);
+
+		/* If all the bits were set in MSR, return success */
+		if (l != max_cbm)
+			return false;
+
+		r->num_closid = 4;
+		r->cbm_len = 20;
+		r->max_cbm = max_cbm;
+		r->min_cbm_bits = 2;
+		r->capable = true;
+		r->enabled = true;
+
+		return true;
+	}
+
+	return false;
+}
+
+static void rdt_get_config(int idx, struct rdt_resource *r)
+{
+	union cpuid_0x10_1_eax eax;
+	union cpuid_0x10_1_edx edx;
+	u32 ebx, ecx;
+
+	cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
+	r->num_closid = edx.split.cos_max + 1;
+	r->cbm_len = eax.split.cbm_len + 1;
+	r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1;
+	r->capable = true;
+	r->enabled = true;
+}
+
+static void rdt_get_cdp_l3_config(int type)
+{
+	struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+	struct rdt_resource *r = &rdt_resources_all[type];
+
+	r->num_closid = r_l3->num_closid / 2;
+	r->cbm_len = r_l3->cbm_len;
+	r->max_cbm = r_l3->max_cbm;
+	r->capable = true;
+	/*
+	 * By default, CDP is disabled. CDP can be enabled by mount parameter
+	 * "cdp" during resctrl file system mount time.
+	 */
+	r->enabled = false;
+}
+
+static inline bool get_rdt_resources(void)
+{
+	bool ret = false;
+
+	if (cache_alloc_hsw_probe())
+		return true;
+
+	if (!boot_cpu_has(X86_FEATURE_RDT_A))
+		return false;
+
+	if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
+		rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+		if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
+			rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
+			rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
+		}
+		ret = true;
+	}
+	if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
+		/* CPUID 0x10.2 fields are same format at 0x10.1 */
+		rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+		ret = true;
+	}
+
+	return ret;
+}
+
+static int get_cache_id(int cpu, int level)
+{
+	struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
+	int i;
+
+	for (i = 0; i < ci->num_leaves; i++) {
+		if (ci->info_list[i].level == level)
+			return ci->info_list[i].id;
+	}
+
+	return -1;
+}
+
+void rdt_cbm_update(void *arg)
+{
+	struct msr_param *m = (struct msr_param *)arg;
+	struct rdt_resource *r = m->res;
+	int i, cpu = smp_processor_id();
+	struct rdt_domain *d;
+
+	list_for_each_entry(d, &r->domains, list) {
+		/* Find the domain that contains this CPU */
+		if (cpumask_test_cpu(cpu, &d->cpu_mask))
+			goto found;
+	}
+	pr_info_once("cpu %d not found in any domain for resource %s\n",
+		     cpu, r->name);
+
+	return;
+
+found:
+	for (i = m->low; i < m->high; i++) {
+		int idx = cbm_idx(r, i);
+
+		wrmsrl(r->msr_base + idx, d->cbm[i]);
+	}
+}
+
+/*
+ * rdt_find_domain - Find a domain in a resource that matches input resource id
+ *
+ * Search resource r's domain list to find the resource id. If the resource
+ * id is found in a domain, return the domain. Otherwise, if requested by
+ * caller, return the first domain whose id is bigger than the input id.
+ * The domain list is sorted by id in ascending order.
+ */
+static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+					  struct list_head **pos)
+{
+	struct rdt_domain *d;
+	struct list_head *l;
+
+	if (id < 0)
+		return ERR_PTR(id);
+
+	list_for_each(l, &r->domains) {
+		d = list_entry(l, struct rdt_domain, list);
+		/* When id is found, return its domain. */
+		if (id == d->id)
+			return d;
+		/* Stop searching when finding id's position in sorted list. */
+		if (id < d->id)
+			break;
+	}
+
+	if (pos)
+		*pos = l;
+
+	return NULL;
+}
+
+/*
+ * domain_add_cpu - Add a cpu to a resource's domain list.
+ *
+ * If an existing domain in the resource r's domain list matches the cpu's
+ * resource id, add the cpu in the domain.
+ *
+ * Otherwise, a new domain is allocated and inserted into the right position
+ * in the domain list sorted by id in ascending order.
+ *
+ * The order in the domain list is visible to users when we print entries
+ * in the schemata file and schemata input is validated to have the same order
+ * as this list.
+ */
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
+{
+	int i, id = get_cache_id(cpu, r->cache_level);
+	struct list_head *add_pos = NULL;
+	struct rdt_domain *d;
+
+	d = rdt_find_domain(r, id, &add_pos);
+	if (IS_ERR(d)) {
+		pr_warn("Could't find cache id for cpu %d\n", cpu);
+		return;
+	}
+
+	if (d) {
+		cpumask_set_cpu(cpu, &d->cpu_mask);
+		return;
+	}
+
+	d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu));
+	if (!d)
+		return;
+
+	d->id = id;
+
+	d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL);
+	if (!d->cbm) {
+		kfree(d);
+		return;
+	}
+
+	for (i = 0; i < r->num_closid; i++) {
+		int idx = cbm_idx(r, i);
+
+		d->cbm[i] = r->max_cbm;
+		wrmsrl(r->msr_base + idx, d->cbm[i]);
+	}
+
+	cpumask_set_cpu(cpu, &d->cpu_mask);
+	list_add_tail(&d->list, add_pos);
+	r->num_domains++;
+}
+
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+	int id = get_cache_id(cpu, r->cache_level);
+	struct rdt_domain *d;
+
+	d = rdt_find_domain(r, id, NULL);
+	if (IS_ERR_OR_NULL(d)) {
+		pr_warn("Could't find cache id for cpu %d\n", cpu);
+		return;
+	}
+
+	cpumask_clear_cpu(cpu, &d->cpu_mask);
+	if (cpumask_empty(&d->cpu_mask)) {
+		r->num_domains--;
+		kfree(d->cbm);
+		list_del(&d->list);
+		kfree(d);
+	}
+}
+
+static void clear_closid(int cpu)
+{
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+	per_cpu(cpu_closid, cpu) = 0;
+	state->closid = 0;
+	wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
+}
+
+static int intel_rdt_online_cpu(unsigned int cpu)
+{
+	struct rdt_resource *r;
+
+	mutex_lock(&rdtgroup_mutex);
+	for_each_capable_rdt_resource(r)
+		domain_add_cpu(cpu, r);
+	/* The cpu is set in default rdtgroup after online. */
+	cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+	clear_closid(cpu);
+	mutex_unlock(&rdtgroup_mutex);
+
+	return 0;
+}
+
+static int intel_rdt_offline_cpu(unsigned int cpu)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+
+	mutex_lock(&rdtgroup_mutex);
+	for_each_capable_rdt_resource(r)
+		domain_remove_cpu(cpu, r);
+	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask))
+			break;
+	}
+	clear_closid(cpu);
+	mutex_unlock(&rdtgroup_mutex);
+
+	return 0;
+}
+
+static int __init intel_rdt_late_init(void)
+{
+	struct rdt_resource *r;
+	int state, ret;
+
+	if (!get_rdt_resources())
+		return -ENODEV;
+
+	state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+				  "x86/rdt/cat:online:",
+				  intel_rdt_online_cpu, intel_rdt_offline_cpu);
+	if (state < 0)
+		return state;
+
+	ret = rdtgroup_init();
+	if (ret) {
+		cpuhp_remove_state(state);
+		return ret;
+	}
+
+	for_each_capable_rdt_resource(r)
+		pr_info("Intel RDT %s allocation detected\n", r->name);
+
+	return 0;
+}
+
+late_initcall(intel_rdt_late_init);
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
new file mode 100644
index 000000000000..9ac2a5cdd9c2
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -0,0 +1,1115 @@
+/*
+ * User interface for Resource Alloction in Resource Director Technology(RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/task_work.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_common.h>
+
+DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
+struct kernfs_root *rdt_root;
+struct rdtgroup rdtgroup_default;
+LIST_HEAD(rdt_all_groups);
+
+/* Kernel fs node for "info" directory under root */
+static struct kernfs_node *kn_info;
+
+/*
+ * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
+ * we can keep a bitmap of free CLOSIDs in a single integer.
+ *
+ * Using a global CLOSID across all resources has some advantages and
+ * some drawbacks:
+ * + We can simply set "current->closid" to assign a task to a resource
+ *   group.
+ * + Context switch code can avoid extra memory references deciding which
+ *   CLOSID to load into the PQR_ASSOC MSR
+ * - We give up some options in configuring resource groups across multi-socket
+ *   systems.
+ * - Our choices on how to configure each resource become progressively more
+ *   limited as the number of resources grows.
+ */
+static int closid_free_map;
+
+static void closid_init(void)
+{
+	struct rdt_resource *r;
+	int rdt_min_closid = 32;
+
+	/* Compute rdt_min_closid across all resources */
+	for_each_enabled_rdt_resource(r)
+		rdt_min_closid = min(rdt_min_closid, r->num_closid);
+
+	closid_free_map = BIT_MASK(rdt_min_closid) - 1;
+
+	/* CLOSID 0 is always reserved for the default group */
+	closid_free_map &= ~1;
+}
+
+int closid_alloc(void)
+{
+	int closid = ffs(closid_free_map);
+
+	if (closid == 0)
+		return -ENOSPC;
+	closid--;
+	closid_free_map &= ~(1 << closid);
+
+	return closid;
+}
+
+static void closid_free(int closid)
+{
+	closid_free_map |= 1 << closid;
+}
+
+/* set uid and gid of rdtgroup dirs and files to that of the creator */
+static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+				.ia_uid = current_fsuid(),
+				.ia_gid = current_fsgid(), };
+
+	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+		return 0;
+
+	return kernfs_setattr(kn, &iattr);
+}
+
+static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
+{
+	struct kernfs_node *kn;
+	int ret;
+
+	kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
+				  0, rft->kf_ops, rft, NULL, NULL);
+	if (IS_ERR(kn))
+		return PTR_ERR(kn);
+
+	ret = rdtgroup_kn_set_ugid(kn);
+	if (ret) {
+		kernfs_remove(kn);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts,
+			      int len)
+{
+	struct rftype *rft;
+	int ret;
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	for (rft = rfts; rft < rfts + len; rft++) {
+		ret = rdtgroup_add_file(kn, rft);
+		if (ret)
+			goto error;
+	}
+
+	return 0;
+error:
+	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
+	while (--rft >= rfts)
+		kernfs_remove_by_name(kn, rft->name);
+	return ret;
+}
+
+static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+	struct kernfs_open_file *of = m->private;
+	struct rftype *rft = of->kn->priv;
+
+	if (rft->seq_show)
+		return rft->seq_show(of, m, arg);
+	return 0;
+}
+
+static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
+				   size_t nbytes, loff_t off)
+{
+	struct rftype *rft = of->kn->priv;
+
+	if (rft->write)
+		return rft->write(of, buf, nbytes, off);
+
+	return -EINVAL;
+}
+
+static struct kernfs_ops rdtgroup_kf_single_ops = {
+	.atomic_write_len	= PAGE_SIZE,
+	.write			= rdtgroup_file_write,
+	.seq_show		= rdtgroup_seqfile_show,
+};
+
+static int rdtgroup_cpus_show(struct kernfs_open_file *of,
+			      struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	int ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+	if (rdtgrp)
+		seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask));
+	else
+		ret = -ENOENT;
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret;
+}
+
+/*
+ * This is safe against intel_rdt_sched_in() called from __switch_to()
+ * because __switch_to() is executed with interrupts disabled. A local call
+ * from rdt_update_closid() is proteced against __switch_to() because
+ * preemption is disabled.
+ */
+static void rdt_update_cpu_closid(void *closid)
+{
+	if (closid)
+		this_cpu_write(cpu_closid, *(int *)closid);
+	/*
+	 * We cannot unconditionally write the MSR because the current
+	 * executing task might have its own closid selected. Just reuse
+	 * the context switch code.
+	 */
+	intel_rdt_sched_in();
+}
+
+/*
+ * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
+ *
+ * Per task closids must have been set up before calling this function.
+ *
+ * The per cpu closids are updated with the smp function call, when @closid
+ * is not NULL. If @closid is NULL then all affected percpu closids must
+ * have been set up before calling this function.
+ */
+static void
+rdt_update_closid(const struct cpumask *cpu_mask, int *closid)
+{
+	int cpu = get_cpu();
+
+	if (cpumask_test_cpu(cpu, cpu_mask))
+		rdt_update_cpu_closid(closid);
+	smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1);
+	put_cpu();
+}
+
+static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
+				   char *buf, size_t nbytes, loff_t off)
+{
+	cpumask_var_t tmpmask, newmask;
+	struct rdtgroup *rdtgrp, *r;
+	int ret;
+
+	if (!buf)
+		return -EINVAL;
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+	if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
+		free_cpumask_var(tmpmask);
+		return -ENOMEM;
+	}
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		ret = -ENOENT;
+		goto unlock;
+	}
+
+	ret = cpumask_parse(buf, newmask);
+	if (ret)
+		goto unlock;
+
+	/* check that user didn't specify any offline cpus */
+	cpumask_andnot(tmpmask, newmask, cpu_online_mask);
+	if (cpumask_weight(tmpmask)) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	/* Check whether cpus are dropped from this group */
+	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+	if (cpumask_weight(tmpmask)) {
+		/* Can't drop from default group */
+		if (rdtgrp == &rdtgroup_default) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+		/* Give any dropped cpus to rdtgroup_default */
+		cpumask_or(&rdtgroup_default.cpu_mask,
+			   &rdtgroup_default.cpu_mask, tmpmask);
+		rdt_update_closid(tmpmask, &rdtgroup_default.closid);
+	}
+
+	/*
+	 * If we added cpus, remove them from previous group that owned them
+	 * and update per-cpu closid
+	 */
+	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+	if (cpumask_weight(tmpmask)) {
+		list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
+			if (r == rdtgrp)
+				continue;
+			cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask);
+		}
+		rdt_update_closid(tmpmask, &rdtgrp->closid);
+	}
+
+	/* Done pushing/pulling - update this group with new mask */
+	cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+unlock:
+	rdtgroup_kn_unlock(of->kn);
+	free_cpumask_var(tmpmask);
+	free_cpumask_var(newmask);
+
+	return ret ?: nbytes;
+}
+
+struct task_move_callback {
+	struct callback_head	work;
+	struct rdtgroup		*rdtgrp;
+};
+
+static void move_myself(struct callback_head *head)
+{
+	struct task_move_callback *callback;
+	struct rdtgroup *rdtgrp;
+
+	callback = container_of(head, struct task_move_callback, work);
+	rdtgrp = callback->rdtgrp;
+
+	/*
+	 * If resource group was deleted before this task work callback
+	 * was invoked, then assign the task to root group and free the
+	 * resource group.
+	 */
+	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+	    (rdtgrp->flags & RDT_DELETED)) {
+		current->closid = 0;
+		kfree(rdtgrp);
+	}
+
+	preempt_disable();
+	/* update PQR_ASSOC MSR to make resource group go into effect */
+	intel_rdt_sched_in();
+	preempt_enable();
+
+	kfree(callback);
+}
+
+static int __rdtgroup_move_task(struct task_struct *tsk,
+				struct rdtgroup *rdtgrp)
+{
+	struct task_move_callback *callback;
+	int ret;
+
+	callback = kzalloc(sizeof(*callback), GFP_KERNEL);
+	if (!callback)
+		return -ENOMEM;
+	callback->work.func = move_myself;
+	callback->rdtgrp = rdtgrp;
+
+	/*
+	 * Take a refcount, so rdtgrp cannot be freed before the
+	 * callback has been invoked.
+	 */
+	atomic_inc(&rdtgrp->waitcount);
+	ret = task_work_add(tsk, &callback->work, true);
+	if (ret) {
+		/*
+		 * Task is exiting. Drop the refcount and free the callback.
+		 * No need to check the refcount as the group cannot be
+		 * deleted before the write function unlocks rdtgroup_mutex.
+		 */
+		atomic_dec(&rdtgrp->waitcount);
+		kfree(callback);
+	} else {
+		tsk->closid = rdtgrp->closid;
+	}
+	return ret;
+}
+
+static int rdtgroup_task_write_permission(struct task_struct *task,
+					  struct kernfs_open_file *of)
+{
+	const struct cred *tcred = get_task_cred(task);
+	const struct cred *cred = current_cred();
+	int ret = 0;
+
+	/*
+	 * Even if we're attaching all tasks in the thread group, we only
+	 * need to check permissions on one of them.
+	 */
+	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+	    !uid_eq(cred->euid, tcred->uid) &&
+	    !uid_eq(cred->euid, tcred->suid))
+		ret = -EPERM;
+
+	put_cred(tcred);
+	return ret;
+}
+
+static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
+			      struct kernfs_open_file *of)
+{
+	struct task_struct *tsk;
+	int ret;
+
+	rcu_read_lock();
+	if (pid) {
+		tsk = find_task_by_vpid(pid);
+		if (!tsk) {
+			rcu_read_unlock();
+			return -ESRCH;
+		}
+	} else {
+		tsk = current;
+	}
+
+	get_task_struct(tsk);
+	rcu_read_unlock();
+
+	ret = rdtgroup_task_write_permission(tsk, of);
+	if (!ret)
+		ret = __rdtgroup_move_task(tsk, rdtgrp);
+
+	put_task_struct(tsk);
+	return ret;
+}
+
+static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
+				    char *buf, size_t nbytes, loff_t off)
+{
+	struct rdtgroup *rdtgrp;
+	int ret = 0;
+	pid_t pid;
+
+	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+		return -EINVAL;
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+	if (rdtgrp)
+		ret = rdtgroup_move_task(pid, rdtgrp, of);
+	else
+		ret = -ENOENT;
+
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret ?: nbytes;
+}
+
+static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
+{
+	struct task_struct *p, *t;
+
+	rcu_read_lock();
+	for_each_process_thread(p, t) {
+		if (t->closid == r->closid)
+			seq_printf(s, "%d\n", t->pid);
+	}
+	rcu_read_unlock();
+}
+
+static int rdtgroup_tasks_show(struct kernfs_open_file *of,
+			       struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	int ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (rdtgrp)
+		show_rdt_tasks(rdtgrp, s);
+	else
+		ret = -ENOENT;
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret;
+}
+
+/* Files in each rdtgroup */
+static struct rftype rdtgroup_base_files[] = {
+	{
+		.name		= "cpus",
+		.mode		= 0644,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.write		= rdtgroup_cpus_write,
+		.seq_show	= rdtgroup_cpus_show,
+	},
+	{
+		.name		= "tasks",
+		.mode		= 0644,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.write		= rdtgroup_tasks_write,
+		.seq_show	= rdtgroup_tasks_show,
+	},
+	{
+		.name		= "schemata",
+		.mode		= 0644,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.write		= rdtgroup_schemata_write,
+		.seq_show	= rdtgroup_schemata_show,
+	},
+};
+
+static int rdt_num_closids_show(struct kernfs_open_file *of,
+				struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+
+	seq_printf(seq, "%d\n", r->num_closid);
+
+	return 0;
+}
+
+static int rdt_cbm_mask_show(struct kernfs_open_file *of,
+			     struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+
+	seq_printf(seq, "%x\n", r->max_cbm);
+
+	return 0;
+}
+
+static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
+			     struct seq_file *seq, void *v)
+{
+	struct rdt_resource *r = of->kn->parent->priv;
+
+	seq_printf(seq, "%d\n", r->min_cbm_bits);
+
+	return 0;
+}
+
+/* rdtgroup information files for one cache resource. */
+static struct rftype res_info_files[] = {
+	{
+		.name		= "num_closids",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_num_closids_show,
+	},
+	{
+		.name		= "cbm_mask",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_cbm_mask_show,
+	},
+	{
+		.name		= "min_cbm_bits",
+		.mode		= 0444,
+		.kf_ops		= &rdtgroup_kf_single_ops,
+		.seq_show	= rdt_min_cbm_bits_show,
+	},
+};
+
+static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
+{
+	struct kernfs_node *kn_subdir;
+	struct rdt_resource *r;
+	int ret;
+
+	/* create the directory */
+	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
+	if (IS_ERR(kn_info))
+		return PTR_ERR(kn_info);
+	kernfs_get(kn_info);
+
+	for_each_enabled_rdt_resource(r) {
+		kn_subdir = kernfs_create_dir(kn_info, r->name,
+					      kn_info->mode, r);
+		if (IS_ERR(kn_subdir)) {
+			ret = PTR_ERR(kn_subdir);
+			goto out_destroy;
+		}
+		kernfs_get(kn_subdir);
+		ret = rdtgroup_kn_set_ugid(kn_subdir);
+		if (ret)
+			goto out_destroy;
+		ret = rdtgroup_add_files(kn_subdir, res_info_files,
+					 ARRAY_SIZE(res_info_files));
+		if (ret)
+			goto out_destroy;
+		kernfs_activate(kn_subdir);
+	}
+
+	/*
+	 * This extra ref will be put in kernfs_remove() and guarantees
+	 * that @rdtgrp->kn is always accessible.
+	 */
+	kernfs_get(kn_info);
+
+	ret = rdtgroup_kn_set_ugid(kn_info);
+	if (ret)
+		goto out_destroy;
+
+	kernfs_activate(kn_info);
+
+	return 0;
+
+out_destroy:
+	kernfs_remove(kn_info);
+	return ret;
+}
+
+static void l3_qos_cfg_update(void *arg)
+{
+	bool *enable = arg;
+
+	wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
+}
+
+static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
+{
+	cpumask_var_t cpu_mask;
+	struct rdt_domain *d;
+	int cpu;
+
+	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	list_for_each_entry(d, &r->domains, list) {
+		/* Pick one CPU from each domain instance to update MSR */
+		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+	}
+	cpu = get_cpu();
+	/* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
+	if (cpumask_test_cpu(cpu, cpu_mask))
+		l3_qos_cfg_update(&enable);
+	/* Update QOS_CFG MSR on all other cpus in cpu_mask. */
+	smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1);
+	put_cpu();
+
+	free_cpumask_var(cpu_mask);
+
+	return 0;
+}
+
+static int cdp_enable(void)
+{
+	struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA];
+	struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE];
+	struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+	int ret;
+
+	if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable)
+		return -EINVAL;
+
+	ret = set_l3_qos_cfg(r_l3, true);
+	if (!ret) {
+		r_l3->enabled = false;
+		r_l3data->enabled = true;
+		r_l3code->enabled = true;
+	}
+	return ret;
+}
+
+static void cdp_disable(void)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+	r->enabled = r->capable;
+
+	if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) {
+		rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false;
+		rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false;
+		set_l3_qos_cfg(r, false);
+	}
+}
+
+static int parse_rdtgroupfs_options(char *data)
+{
+	char *token, *o = data;
+	int ret = 0;
+
+	while ((token = strsep(&o, ",")) != NULL) {
+		if (!*token)
+			return -EINVAL;
+
+		if (!strcmp(token, "cdp"))
+			ret = cdp_enable();
+	}
+
+	return ret;
+}
+
+/*
+ * We don't allow rdtgroup directories to be created anywhere
+ * except the root directory. Thus when looking for the rdtgroup
+ * structure for a kernfs node we are either looking at a directory,
+ * in which case the rdtgroup structure is pointed at by the "priv"
+ * field, otherwise we have a file, and need only look to the parent
+ * to find the rdtgroup.
+ */
+static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
+{
+	if (kernfs_type(kn) == KERNFS_DIR) {
+		/*
+		 * All the resource directories use "kn->priv"
+		 * to point to the "struct rdtgroup" for the
+		 * resource. "info" and its subdirectories don't
+		 * have rdtgroup structures, so return NULL here.
+		 */
+		if (kn == kn_info || kn->parent == kn_info)
+			return NULL;
+		else
+			return kn->priv;
+	} else {
+		return kn->parent->priv;
+	}
+}
+
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
+{
+	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+	if (!rdtgrp)
+		return NULL;
+
+	atomic_inc(&rdtgrp->waitcount);
+	kernfs_break_active_protection(kn);
+
+	mutex_lock(&rdtgroup_mutex);
+
+	/* Was this group deleted while we waited? */
+	if (rdtgrp->flags & RDT_DELETED)
+		return NULL;
+
+	return rdtgrp;
+}
+
+void rdtgroup_kn_unlock(struct kernfs_node *kn)
+{
+	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+	if (!rdtgrp)
+		return;
+
+	mutex_unlock(&rdtgroup_mutex);
+
+	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+	    (rdtgrp->flags & RDT_DELETED)) {
+		kernfs_unbreak_active_protection(kn);
+		kernfs_put(rdtgrp->kn);
+		kfree(rdtgrp);
+	} else {
+		kernfs_unbreak_active_protection(kn);
+	}
+}
+
+static struct dentry *rdt_mount(struct file_system_type *fs_type,
+				int flags, const char *unused_dev_name,
+				void *data)
+{
+	struct dentry *dentry;
+	int ret;
+
+	mutex_lock(&rdtgroup_mutex);
+	/*
+	 * resctrl file system can only be mounted once.
+	 */
+	if (static_branch_unlikely(&rdt_enable_key)) {
+		dentry = ERR_PTR(-EBUSY);
+		goto out;
+	}
+
+	ret = parse_rdtgroupfs_options(data);
+	if (ret) {
+		dentry = ERR_PTR(ret);
+		goto out_cdp;
+	}
+
+	closid_init();
+
+	ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
+	if (ret) {
+		dentry = ERR_PTR(ret);
+		goto out_cdp;
+	}
+
+	dentry = kernfs_mount(fs_type, flags, rdt_root,
+			      RDTGROUP_SUPER_MAGIC, NULL);
+	if (IS_ERR(dentry))
+		goto out_cdp;
+
+	static_branch_enable(&rdt_enable_key);
+	goto out;
+
+out_cdp:
+	cdp_disable();
+out:
+	mutex_unlock(&rdtgroup_mutex);
+
+	return dentry;
+}
+
+static int reset_all_cbms(struct rdt_resource *r)
+{
+	struct msr_param msr_param;
+	cpumask_var_t cpu_mask;
+	struct rdt_domain *d;
+	int i, cpu;
+
+	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	msr_param.res = r;
+	msr_param.low = 0;
+	msr_param.high = r->num_closid;
+
+	/*
+	 * Disable resource control for this resource by setting all
+	 * CBMs in all domains to the maximum mask value. Pick one CPU
+	 * from each domain to update the MSRs below.
+	 */
+	list_for_each_entry(d, &r->domains, list) {
+		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+
+		for (i = 0; i < r->num_closid; i++)
+			d->cbm[i] = r->max_cbm;
+	}
+	cpu = get_cpu();
+	/* Update CBM on this cpu if it's in cpu_mask. */
+	if (cpumask_test_cpu(cpu, cpu_mask))
+		rdt_cbm_update(&msr_param);
+	/* Update CBM on all other cpus in cpu_mask. */
+	smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+	put_cpu();
+
+	free_cpumask_var(cpu_mask);
+
+	return 0;
+}
+
+/*
+ * Move tasks from one to the other group. If @from is NULL, then all tasks
+ * in the systems are moved unconditionally (used for teardown).
+ *
+ * If @mask is not NULL the cpus on which moved tasks are running are set
+ * in that mask so the update smp function call is restricted to affected
+ * cpus.
+ */
+static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
+				 struct cpumask *mask)
+{
+	struct task_struct *p, *t;
+
+	read_lock(&tasklist_lock);
+	for_each_process_thread(p, t) {
+		if (!from || t->closid == from->closid) {
+			t->closid = to->closid;
+#ifdef CONFIG_SMP
+			/*
+			 * This is safe on x86 w/o barriers as the ordering
+			 * of writing to task_cpu() and t->on_cpu is
+			 * reverse to the reading here. The detection is
+			 * inaccurate as tasks might move or schedule
+			 * before the smp function call takes place. In
+			 * such a case the function call is pointless, but
+			 * there is no other side effect.
+			 */
+			if (mask && t->on_cpu)
+				cpumask_set_cpu(task_cpu(t), mask);
+#endif
+		}
+	}
+	read_unlock(&tasklist_lock);
+}
+
+/*
+ * Forcibly remove all of subdirectories under root.
+ */
+static void rmdir_all_sub(void)
+{
+	struct rdtgroup *rdtgrp, *tmp;
+
+	/* Move all tasks to the default resource group */
+	rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
+
+	list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
+		/* Remove each rdtgroup other than root */
+		if (rdtgrp == &rdtgroup_default)
+			continue;
+
+		/*
+		 * Give any CPUs back to the default group. We cannot copy
+		 * cpu_online_mask because a CPU might have executed the
+		 * offline callback already, but is still marked online.
+		 */
+		cpumask_or(&rdtgroup_default.cpu_mask,
+			   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+		kernfs_remove(rdtgrp->kn);
+		list_del(&rdtgrp->rdtgroup_list);
+		kfree(rdtgrp);
+	}
+	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
+	get_online_cpus();
+	rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid);
+	put_online_cpus();
+
+	kernfs_remove(kn_info);
+}
+
+static void rdt_kill_sb(struct super_block *sb)
+{
+	struct rdt_resource *r;
+
+	mutex_lock(&rdtgroup_mutex);
+
+	/*Put everything back to default values. */
+	for_each_enabled_rdt_resource(r)
+		reset_all_cbms(r);
+	cdp_disable();
+	rmdir_all_sub();
+	static_branch_disable(&rdt_enable_key);
+	kernfs_kill_sb(sb);
+	mutex_unlock(&rdtgroup_mutex);
+}
+
+static struct file_system_type rdt_fs_type = {
+	.name    = "resctrl",
+	.mount   = rdt_mount,
+	.kill_sb = rdt_kill_sb,
+};
+
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+			  umode_t mode)
+{
+	struct rdtgroup *parent, *rdtgrp;
+	struct kernfs_node *kn;
+	int ret, closid;
+
+	/* Only allow mkdir in the root directory */
+	if (parent_kn != rdtgroup_default.kn)
+		return -EPERM;
+
+	/* Do not accept '\n' to avoid unparsable situation. */
+	if (strchr(name, '\n'))
+		return -EINVAL;
+
+	parent = rdtgroup_kn_lock_live(parent_kn);
+	if (!parent) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	ret = closid_alloc();
+	if (ret < 0)
+		goto out_unlock;
+	closid = ret;
+
+	/* allocate the rdtgroup. */
+	rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
+	if (!rdtgrp) {
+		ret = -ENOSPC;
+		goto out_closid_free;
+	}
+	rdtgrp->closid = closid;
+	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+
+	/* kernfs creates the directory for rdtgrp */
+	kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp);
+	if (IS_ERR(kn)) {
+		ret = PTR_ERR(kn);
+		goto out_cancel_ref;
+	}
+	rdtgrp->kn = kn;
+
+	/*
+	 * kernfs_remove() will drop the reference count on "kn" which
+	 * will free it. But we still need it to stick around for the
+	 * rdtgroup_kn_unlock(kn} call below. Take one extra reference
+	 * here, which will be dropped inside rdtgroup_kn_unlock().
+	 */
+	kernfs_get(kn);
+
+	ret = rdtgroup_kn_set_ugid(kn);
+	if (ret)
+		goto out_destroy;
+
+	ret = rdtgroup_add_files(kn, rdtgroup_base_files,
+				 ARRAY_SIZE(rdtgroup_base_files));
+	if (ret)
+		goto out_destroy;
+
+	kernfs_activate(kn);
+
+	ret = 0;
+	goto out_unlock;
+
+out_destroy:
+	kernfs_remove(rdtgrp->kn);
+out_cancel_ref:
+	list_del(&rdtgrp->rdtgroup_list);
+	kfree(rdtgrp);
+out_closid_free:
+	closid_free(closid);
+out_unlock:
+	rdtgroup_kn_unlock(parent_kn);
+	return ret;
+}
+
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+	int ret, cpu, closid = rdtgroup_default.closid;
+	struct rdtgroup *rdtgrp;
+	cpumask_var_t tmpmask;
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	rdtgrp = rdtgroup_kn_lock_live(kn);
+	if (!rdtgrp) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	/* Give any tasks back to the default group */
+	rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
+
+	/* Give any CPUs back to the default group */
+	cpumask_or(&rdtgroup_default.cpu_mask,
+		   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+	/* Update per cpu closid of the moved CPUs first */
+	for_each_cpu(cpu, &rdtgrp->cpu_mask)
+		per_cpu(cpu_closid, cpu) = closid;
+	/*
+	 * Update the MSR on moved CPUs and CPUs which have moved
+	 * task running on them.
+	 */
+	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+	rdt_update_closid(tmpmask, NULL);
+
+	rdtgrp->flags = RDT_DELETED;
+	closid_free(rdtgrp->closid);
+	list_del(&rdtgrp->rdtgroup_list);
+
+	/*
+	 * one extra hold on this, will drop when we kfree(rdtgrp)
+	 * in rdtgroup_kn_unlock()
+	 */
+	kernfs_get(kn);
+	kernfs_remove(rdtgrp->kn);
+	ret = 0;
+out:
+	rdtgroup_kn_unlock(kn);
+	free_cpumask_var(tmpmask);
+	return ret;
+}
+
+static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
+{
+	if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled)
+		seq_puts(seq, ",cdp");
+	return 0;
+}
+
+static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
+	.mkdir		= rdtgroup_mkdir,
+	.rmdir		= rdtgroup_rmdir,
+	.show_options	= rdtgroup_show_options,
+};
+
+static int __init rdtgroup_setup_root(void)
+{
+	int ret;
+
+	rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
+				      KERNFS_ROOT_CREATE_DEACTIVATED,
+				      &rdtgroup_default);
+	if (IS_ERR(rdt_root))
+		return PTR_ERR(rdt_root);
+
+	mutex_lock(&rdtgroup_mutex);
+
+	rdtgroup_default.closid = 0;
+	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
+
+	ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files,
+				 ARRAY_SIZE(rdtgroup_base_files));
+	if (ret) {
+		kernfs_destroy_root(rdt_root);
+		goto out;
+	}
+
+	rdtgroup_default.kn = rdt_root->kn;
+	kernfs_activate(rdtgroup_default.kn);
+
+out:
+	mutex_unlock(&rdtgroup_mutex);
+
+	return ret;
+}
+
+/*
+ * rdtgroup_init - rdtgroup initialization
+ *
+ * Setup resctrl file system including set up root, create mount point,
+ * register rdtgroup filesystem, and initialize files under root directory.
+ *
+ * Return: 0 on success or -errno
+ */
+int __init rdtgroup_init(void)
+{
+	int ret = 0;
+
+	ret = rdtgroup_setup_root();
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+	if (ret)
+		goto cleanup_root;
+
+	ret = register_filesystem(&rdt_fs_type);
+	if (ret)
+		goto cleanup_mountpoint;
+
+	return 0;
+
+cleanup_mountpoint:
+	sysfs_remove_mount_point(fs_kobj, "resctrl");
+cleanup_root:
+	kernfs_destroy_root(rdt_root);
+
+	return ret;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c
new file mode 100644
index 000000000000..badd2b31a560
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c
@@ -0,0 +1,245 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ *    Tony Luck <tony.luck@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <asm/intel_rdt.h>
+
+/*
+ * Check whether a cache bit mask is valid. The SDM says:
+ *	Please note that all (and only) contiguous '1' combinations
+ *	are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
+ * Additionally Haswell requires at least two bits set.
+ */
+static bool cbm_validate(unsigned long var, struct rdt_resource *r)
+{
+	unsigned long first_bit, zero_bit;
+
+	if (var == 0 || var > r->max_cbm)
+		return false;
+
+	first_bit = find_first_bit(&var, r->cbm_len);
+	zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit);
+
+	if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len)
+		return false;
+
+	if ((zero_bit - first_bit) < r->min_cbm_bits)
+		return false;
+	return true;
+}
+
+/*
+ * Read one cache bit mask (hex). Check that it is valid for the current
+ * resource type.
+ */
+static int parse_cbm(char *buf, struct rdt_resource *r)
+{
+	unsigned long data;
+	int ret;
+
+	ret = kstrtoul(buf, 16, &data);
+	if (ret)
+		return ret;
+	if (!cbm_validate(data, r))
+		return -EINVAL;
+	r->tmp_cbms[r->num_tmp_cbms++] = data;
+
+	return 0;
+}
+
+/*
+ * For each domain in this resource we expect to find a series of:
+ *	id=mask
+ * separated by ";". The "id" is in decimal, and must appear in the
+ * right order.
+ */
+static int parse_line(char *line, struct rdt_resource *r)
+{
+	char *dom = NULL, *id;
+	struct rdt_domain *d;
+	unsigned long dom_id;
+
+	list_for_each_entry(d, &r->domains, list) {
+		dom = strsep(&line, ";");
+		if (!dom)
+			return -EINVAL;
+		id = strsep(&dom, "=");
+		if (kstrtoul(id, 10, &dom_id) || dom_id != d->id)
+			return -EINVAL;
+		if (parse_cbm(dom, r))
+			return -EINVAL;
+	}
+
+	/* Any garbage at the end of the line? */
+	if (line && line[0])
+		return -EINVAL;
+	return 0;
+}
+
+static int update_domains(struct rdt_resource *r, int closid)
+{
+	struct msr_param msr_param;
+	cpumask_var_t cpu_mask;
+	struct rdt_domain *d;
+	int cpu, idx = 0;
+
+	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	msr_param.low = closid;
+	msr_param.high = msr_param.low + 1;
+	msr_param.res = r;
+
+	list_for_each_entry(d, &r->domains, list) {
+		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+		d->cbm[msr_param.low] = r->tmp_cbms[idx++];
+	}
+	cpu = get_cpu();
+	/* Update CBM on this cpu if it's in cpu_mask. */
+	if (cpumask_test_cpu(cpu, cpu_mask))
+		rdt_cbm_update(&msr_param);
+	/* Update CBM on other cpus. */
+	smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+	put_cpu();
+
+	free_cpumask_var(cpu_mask);
+
+	return 0;
+}
+
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+	char *tok, *resname;
+	int closid, ret = 0;
+	u32 *l3_cbms = NULL;
+
+	/* Valid input requires a trailing newline */
+	if (nbytes == 0 || buf[nbytes - 1] != '\n')
+		return -EINVAL;
+	buf[nbytes - 1] = '\0';
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		rdtgroup_kn_unlock(of->kn);
+		return -ENOENT;
+	}
+
+	closid = rdtgrp->closid;
+
+	/* get scratch space to save all the masks while we validate input */
+	for_each_enabled_rdt_resource(r) {
+		r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms),
+				      GFP_KERNEL);
+		if (!r->tmp_cbms) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		r->num_tmp_cbms = 0;
+	}
+
+	while ((tok = strsep(&buf, "\n")) != NULL) {
+		resname = strsep(&tok, ":");
+		if (!tok) {
+			ret = -EINVAL;
+			goto out;
+		}
+		for_each_enabled_rdt_resource(r) {
+			if (!strcmp(resname, r->name) &&
+			    closid < r->num_closid) {
+				ret = parse_line(tok, r);
+				if (ret)
+					goto out;
+				break;
+			}
+		}
+		if (!r->name) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	/* Did the parser find all the masks we need? */
+	for_each_enabled_rdt_resource(r) {
+		if (r->num_tmp_cbms != r->num_domains) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	for_each_enabled_rdt_resource(r) {
+		ret = update_domains(r, closid);
+		if (ret)
+			goto out;
+	}
+
+out:
+	for_each_enabled_rdt_resource(r) {
+		kfree(r->tmp_cbms);
+		r->tmp_cbms = NULL;
+	}
+	rdtgroup_kn_unlock(of->kn);
+	return ret ?: nbytes;
+}
+
+static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
+{
+	struct rdt_domain *dom;
+	bool sep = false;
+
+	seq_printf(s, "%s:", r->name);
+	list_for_each_entry(dom, &r->domains, list) {
+		if (sep)
+			seq_puts(s, ";");
+		seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]);
+		sep = true;
+	}
+	seq_puts(s, "\n");
+}
+
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+			   struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+	int closid, ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (rdtgrp) {
+		closid = rdtgrp->closid;
+		for_each_enabled_rdt_resource(r) {
+			if (closid < r->num_closid)
+				show_doms(s, r, closid);
+		}
+	} else {
+		ret = -ENOENT;
+	}
+	rdtgroup_kn_unlock(of->kn);
+	return ret;
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 83f1a98d37db..2eee85379689 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -52,8 +52,11 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
 
 	if (severity >= GHES_SEV_RECOVERABLE)
 		m.status |= MCI_STATUS_UC;
-	if (severity >= GHES_SEV_PANIC)
+
+	if (severity >= GHES_SEV_PANIC) {
 		m.status |= MCI_STATUS_PCC;
+		m.tsc = rdtsc();
+	}
 
 	m.addr = mem_err->physical_addr;
 	mce_log(&m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
index 93d824ec3120..217cd4449bc9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -72,7 +72,7 @@ struct llist_node *mce_gen_pool_prepare_records(void)
 	return new_head.first;
 }
 
-void mce_gen_pool_process(void)
+void mce_gen_pool_process(struct work_struct *__unused)
 {
 	struct llist_node *head;
 	struct mce_evt_llist *node, *tmp;
@@ -85,7 +85,7 @@ void mce_gen_pool_process(void)
 	head = llist_reverse_order(head);
 	llist_for_each_entry_safe(node, tmp, head, llnode) {
 		mce = &node->mce;
-		atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+		blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
 		gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
 	}
 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 517619ea6498..99165b206df3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -152,7 +152,6 @@ static void raise_mce(struct mce *m)
 	if (context == MCJ_CTX_RANDOM)
 		return;
 
-#ifdef CONFIG_X86_LOCAL_APIC
 	if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
 		unsigned long start;
 		int cpu;
@@ -192,9 +191,7 @@ static void raise_mce(struct mce *m)
 		raise_local();
 		put_cpu();
 		put_online_cpus();
-	} else
-#endif
-	{
+	} else {
 		preempt_disable();
 		raise_local();
 		preempt_enable();
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index cd74a3f00aea..19592ba1a320 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -13,7 +13,7 @@ enum severity_level {
 	MCE_PANIC_SEVERITY,
 };
 
-extern struct atomic_notifier_head x86_mce_decoder_chain;
+extern struct blocking_notifier_head x86_mce_decoder_chain;
 
 #define ATTR_LEN		16
 #define INITIAL_CHECK_INTERVAL	5 * 60 /* 5 minutes */
@@ -31,7 +31,7 @@ struct mce_evt_llist {
 	struct mce mce;
 };
 
-void mce_gen_pool_process(void);
+void mce_gen_pool_process(struct work_struct *__unused);
 bool mce_gen_pool_empty(void);
 int mce_gen_pool_add(struct mce *mce);
 int mce_gen_pool_init(void);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 631356c8cca4..87cc9ab7a13c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -14,7 +14,7 @@
 #include <linux/init.h>
 #include <linux/debugfs.h>
 #include <asm/mce.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include "mce-internal.h"
 
@@ -311,7 +311,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e
 			*msg = s->msg;
 		s->covered = 1;
 		if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
-			if (panic_on_oops || tolerant < 1)
+			if (tolerant < 1)
 				return MCE_PANIC_SEVERITY;
 		}
 		return s->sev;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a7fdf453d895..af44ebeb593f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,7 @@
 #include <linux/export.h>
 #include <linux/jump_label.h>
 
+#include <asm/intel-family.h>
 #include <asm/processor.h>
 #include <asm/traps.h>
 #include <asm/tlbflush.h>
@@ -53,6 +54,8 @@
 
 static DEFINE_MUTEX(mce_chrdev_read_mutex);
 
+static int mce_chrdev_open_count;	/* #times opened */
+
 #define mce_log_get_idx_check(p) \
 ({ \
 	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
@@ -120,14 +123,13 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
  * CPU/chipset specific EDAC code can register a notifier call here to print
  * MCE errors in a human-readable form.
  */
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
 
 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
 {
 	memset(m, 0, sizeof(struct mce));
 	m->cpu = m->extcpu = smp_processor_id();
-	m->tsc = rdtsc();
 	/* We hope get_seconds stays lockless */
 	m->time = get_seconds();
 	m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -135,6 +137,9 @@ void mce_setup(struct mce *m)
 	m->socketid = cpu_data(m->extcpu).phys_proc_id;
 	m->apicid = cpu_data(m->extcpu).initial_apicid;
 	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+
+	if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
+		rdmsrl(MSR_PPIN, m->ppin);
 }
 
 DEFINE_PER_CPU(struct mce, injectm);
@@ -207,19 +212,23 @@ EXPORT_SYMBOL_GPL(mce_inject_log);
 
 static struct notifier_block mce_srao_nb;
 
+static atomic_t num_notifiers;
+
 void mce_register_decode_chain(struct notifier_block *nb)
 {
-	/* Ensure SRAO notifier has the highest priority in the decode chain. */
-	if (nb != &mce_srao_nb && nb->priority == INT_MAX)
-		nb->priority -= 1;
+	atomic_inc(&num_notifiers);
+
+	WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
 
-	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
+	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
 }
 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
 
 void mce_unregister_decode_chain(struct notifier_block *nb)
 {
-	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+	atomic_dec(&num_notifiers);
+
+	blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
 }
 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
 
@@ -270,17 +279,17 @@ struct mca_msr_regs msr_ops = {
 	.misc	= misc_reg
 };
 
-static void print_mce(struct mce *m)
+static void __print_mce(struct mce *m)
 {
-	int ret = 0;
-
-	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
-	       m->extcpu, m->mcgstatus, m->bank, m->status);
+	pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
+		 m->extcpu,
+		 (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
+		 m->mcgstatus, m->bank, m->status);
 
 	if (m->ip) {
 		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
 			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
-				m->cs, m->ip);
+			m->cs, m->ip);
 
 		if (m->cs == __KERNEL_CS)
 			print_symbol("{%s}", m->ip);
@@ -308,15 +317,11 @@ static void print_mce(struct mce *m)
 	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
 		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
 		cpu_data(m->extcpu).microcode);
+}
 
-	/*
-	 * Print out human-readable details about the MCE error,
-	 * (if the CPU has an implementation for that)
-	 */
-	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
-	if (ret == NOTIFY_STOP)
-		return;
-
+static void print_mce(struct mce *m)
+{
+	__print_mce(m);
 	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 }
 
@@ -499,7 +504,7 @@ int mce_available(struct cpuinfo_x86 *c)
 
 static void mce_schedule_work(void)
 {
-	if (!mce_gen_pool_empty() && keventd_up())
+	if (!mce_gen_pool_empty())
 		schedule_work(&mce_work);
 }
 
@@ -566,7 +571,37 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 }
 static struct notifier_block mce_srao_nb = {
 	.notifier_call	= srao_decode_notifier,
-	.priority = INT_MAX,
+	.priority	= MCE_PRIO_SRAO,
+};
+
+static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct mce *m = (struct mce *)data;
+
+	if (!m)
+		return NOTIFY_DONE;
+
+	/*
+	 * Run the default notifier if we have only the SRAO
+	 * notifier and us registered.
+	 */
+	if (atomic_read(&num_notifiers) > 2)
+		return NOTIFY_DONE;
+
+	/* Don't print when mcelog is running */
+	if (mce_chrdev_open_count > 0)
+		return NOTIFY_DONE;
+
+	__print_mce(m);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block mce_default_nb = {
+	.notifier_call	= mce_default_notifier,
+	/* lowest prio, we want it to run last. */
+	.priority	= MCE_PRIO_LOWEST,
 };
 
 /*
@@ -667,6 +702,9 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 
 	mce_gather_info(&m, NULL);
 
+	if (flags & MCP_TIMESTAMP)
+		m.tsc = rdtsc();
+
 	for (i = 0; i < mca_cfg.banks; i++) {
 		if (!mce_banks[i].ctl || !test_bit(i, *b))
 			continue;
@@ -674,14 +712,12 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		m.misc = 0;
 		m.addr = 0;
 		m.bank = i;
-		m.tsc = 0;
 
 		barrier();
 		m.status = mce_rdmsrl(msr_ops.status(i));
 		if (!(m.status & MCI_STATUS_VAL))
 			continue;
 
-
 		/*
 		 * Uncorrected or signalled events are handled by the exception
 		 * handler when it is enabled, so don't process those here.
@@ -696,9 +732,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 
 		mce_read_aux(&m, i);
 
-		if (!(flags & MCP_TIMESTAMP))
-			m.tsc = 0;
-
 		severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
 
 		if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
@@ -1109,6 +1142,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		goto out;
 
 	mce_gather_info(&m, regs);
+	m.tsc = rdtsc();
 
 	final = this_cpu_ptr(&mces_seen);
 	*final = m;
@@ -1275,41 +1309,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
 #endif
 
 /*
- * Action optional processing happens here (picking up
- * from the list of faulting pages that do_machine_check()
- * placed into the genpool).
- */
-static void mce_process_work(struct work_struct *dummy)
-{
-	mce_gen_pool_process();
-}
-
-#ifdef CONFIG_X86_MCE_INTEL
-/***
- * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
- * @cpu: The CPU on which the event occurred.
- * @status: Event status information
- *
- * This function should be called by the thermal interrupt after the
- * event has been processed and the decision was made to log the event
- * further.
- *
- * The status parameter will be saved to the 'status' field of 'struct mce'
- * and historically has been the register value of the
- * MSR_IA32_THERMAL_STATUS (Intel) msr.
- */
-void mce_log_therm_throt_event(__u64 status)
-{
-	struct mce m;
-
-	mce_setup(&m);
-	m.bank = MCE_THERMAL_BANK;
-	m.status = status;
-	mce_log(&m);
-}
-#endif /* CONFIG_X86_MCE_INTEL */
-
-/*
  * Periodic polling timer for "silent" machine check errors.  If the
  * poller finds an MCE, poll 2x faster.  When the poller finds no more
  * errors, poll 2x slower (up to check_interval seconds).
@@ -1326,20 +1325,15 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
 
 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
 
-static void __restart_timer(struct timer_list *t, unsigned long interval)
+static void __start_timer(struct timer_list *t, unsigned long interval)
 {
 	unsigned long when = jiffies + interval;
 	unsigned long flags;
 
 	local_irq_save(flags);
 
-	if (timer_pending(t)) {
-		if (time_before(when, t->expires))
-			mod_timer(t, when);
-	} else {
-		t->expires = round_jiffies(when);
-		add_timer_on(t, smp_processor_id());
-	}
+	if (!timer_pending(t) || time_before(when, t->expires))
+		mod_timer(t, round_jiffies(when));
 
 	local_irq_restore(flags);
 }
@@ -1355,7 +1349,7 @@ static void mce_timer_fn(unsigned long data)
 	iv = __this_cpu_read(mce_next_interval);
 
 	if (mce_available(this_cpu_ptr(&cpu_info))) {
-		machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
+		machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
 
 		if (mce_intel_cmci_poll()) {
 			iv = mce_adjust_timer(iv);
@@ -1374,7 +1368,7 @@ static void mce_timer_fn(unsigned long data)
 
 done:
 	__this_cpu_write(mce_next_interval, iv);
-	__restart_timer(t, iv);
+	__start_timer(t, iv);
 }
 
 /*
@@ -1385,7 +1379,7 @@ void mce_timer_kick(unsigned long interval)
 	struct timer_list *t = this_cpu_ptr(&mce_timer);
 	unsigned long iv = __this_cpu_read(mce_next_interval);
 
-	__restart_timer(t, interval);
+	__start_timer(t, interval);
 
 	if (interval < iv)
 		__this_cpu_write(mce_next_interval, interval);
@@ -1732,17 +1726,23 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
 	}
 }
 
-static void mce_start_timer(unsigned int cpu, struct timer_list *t)
+static void mce_start_timer(struct timer_list *t)
 {
 	unsigned long iv = check_interval * HZ;
 
 	if (mca_cfg.ignore_ce || !iv)
 		return;
 
-	per_cpu(mce_next_interval, cpu) = iv;
+	this_cpu_write(mce_next_interval, iv);
+	__start_timer(t, iv);
+}
+
+static void __mcheck_cpu_setup_timer(void)
+{
+	struct timer_list *t = this_cpu_ptr(&mce_timer);
+	unsigned int cpu = smp_processor_id();
 
-	t->expires = round_jiffies(jiffies + iv);
-	add_timer_on(t, cpu);
+	setup_pinned_timer(t, mce_timer_fn, cpu);
 }
 
 static void __mcheck_cpu_init_timer(void)
@@ -1751,7 +1751,7 @@ static void __mcheck_cpu_init_timer(void)
 	unsigned int cpu = smp_processor_id();
 
 	setup_pinned_timer(t, mce_timer_fn, cpu);
-	mce_start_timer(cpu, t);
+	mce_start_timer(t);
 }
 
 /* Handle unconfigured int18 (should never happen) */
@@ -1796,7 +1796,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(c);
 	__mcheck_cpu_init_clear_banks();
-	__mcheck_cpu_init_timer();
+	__mcheck_cpu_setup_timer();
 }
 
 /*
@@ -1823,7 +1823,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c)
  */
 
 static DEFINE_SPINLOCK(mce_chrdev_state_lock);
-static int mce_chrdev_open_count;	/* #times opened */
 static int mce_chrdev_open_exclu;	/* already open exclusive? */
 
 static int mce_chrdev_open(struct inode *inode, struct file *file)
@@ -2138,9 +2137,10 @@ int __init mcheck_init(void)
 {
 	mcheck_intel_therm_init();
 	mce_register_decode_chain(&mce_srao_nb);
+	mce_register_decode_chain(&mce_default_nb);
 	mcheck_vendor_init_severity();
 
-	INIT_WORK(&mce_work, mce_process_work);
+	INIT_WORK(&mce_work, mce_gen_pool_process);
 	init_irq_work(&mce_irq_work, mce_irq_work_cb);
 
 	return 0;
@@ -2255,8 +2255,6 @@ static struct bus_type mce_subsys = {
 
 DEFINE_PER_CPU(struct device *, mce_device);
 
-void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-
 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
 {
 	return container_of(attr, struct mce_bank, attr);
@@ -2409,6 +2407,10 @@ static int mce_device_create(unsigned int cpu)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
+	dev = per_cpu(mce_device, cpu);
+	if (dev)
+		return 0;
+
 	dev = kzalloc(sizeof *dev, GFP_KERNEL);
 	if (!dev)
 		return -ENOMEM;
@@ -2468,28 +2470,25 @@ static void mce_device_remove(unsigned int cpu)
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
+static void mce_disable_cpu(void)
 {
-	unsigned long action = *(unsigned long *)h;
-
 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
 		return;
 
-	if (!(action & CPU_TASKS_FROZEN))
+	if (!cpuhp_tasks_frozen)
 		cmci_clear();
 
 	vendor_disable_error_reporting();
 }
 
-static void mce_reenable_cpu(void *h)
+static void mce_reenable_cpu(void)
 {
-	unsigned long action = *(unsigned long *)h;
 	int i;
 
 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
 		return;
 
-	if (!(action & CPU_TASKS_FROZEN))
+	if (!cpuhp_tasks_frozen)
 		cmci_reenable();
 	for (i = 0; i < mca_cfg.banks; i++) {
 		struct mce_bank *b = &mce_banks[i];
@@ -2499,45 +2498,43 @@ static void mce_reenable_cpu(void *h)
 	}
 }
 
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
-mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+static int mce_cpu_dead(unsigned int cpu)
 {
-	unsigned int cpu = (unsigned long)hcpu;
-	struct timer_list *t = &per_cpu(mce_timer, cpu);
+	mce_intel_hcpu_update(cpu);
 
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_ONLINE:
-		mce_device_create(cpu);
-		if (threshold_cpu_callback)
-			threshold_cpu_callback(action, cpu);
-		break;
-	case CPU_DEAD:
-		if (threshold_cpu_callback)
-			threshold_cpu_callback(action, cpu);
-		mce_device_remove(cpu);
-		mce_intel_hcpu_update(cpu);
+	/* intentionally ignoring frozen here */
+	if (!cpuhp_tasks_frozen)
+		cmci_rediscover();
+	return 0;
+}
 
-		/* intentionally ignoring frozen here */
-		if (!(action & CPU_TASKS_FROZEN))
-			cmci_rediscover();
-		break;
-	case CPU_DOWN_PREPARE:
-		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
-		del_timer_sync(t);
-		break;
-	case CPU_DOWN_FAILED:
-		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
-		mce_start_timer(cpu, t);
-		break;
-	}
+static int mce_cpu_online(unsigned int cpu)
+{
+	struct timer_list *t = this_cpu_ptr(&mce_timer);
+	int ret;
 
-	return NOTIFY_OK;
+	mce_device_create(cpu);
+
+	ret = mce_threshold_create_device(cpu);
+	if (ret) {
+		mce_device_remove(cpu);
+		return ret;
+	}
+	mce_reenable_cpu();
+	mce_start_timer(t);
+	return 0;
 }
 
-static struct notifier_block mce_cpu_notifier = {
-	.notifier_call = mce_cpu_callback,
-};
+static int mce_cpu_pre_down(unsigned int cpu)
+{
+	struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+	mce_disable_cpu();
+	del_timer_sync(t);
+	mce_threshold_remove_device(cpu);
+	mce_device_remove(cpu);
+	return 0;
+}
 
 static __init void mce_init_banks(void)
 {
@@ -2559,8 +2556,8 @@ static __init void mce_init_banks(void)
 
 static __init int mcheck_init_device(void)
 {
+	enum cpuhp_state hp_online;
 	int err;
-	int i = 0;
 
 	if (!mce_available(&boot_cpu_data)) {
 		err = -EIO;
@@ -2578,23 +2575,16 @@ static __init int mcheck_init_device(void)
 	if (err)
 		goto err_out_mem;
 
-	cpu_notifier_register_begin();
-	for_each_online_cpu(i) {
-		err = mce_device_create(i);
-		if (err) {
-			/*
-			 * Register notifier anyway (and do not unreg it) so
-			 * that we don't leave undeleted timers, see notifier
-			 * callback above.
-			 */
-			__register_hotcpu_notifier(&mce_cpu_notifier);
-			cpu_notifier_register_done();
-			goto err_device_create;
-		}
-	}
+	err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
+				mce_cpu_dead);
+	if (err)
+		goto err_out_mem;
 
-	__register_hotcpu_notifier(&mce_cpu_notifier);
-	cpu_notifier_register_done();
+	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
+				mce_cpu_online, mce_cpu_pre_down);
+	if (err < 0)
+		goto err_out_online;
+	hp_online = err;
 
 	register_syscore_ops(&mce_syscore_ops);
 
@@ -2607,16 +2597,10 @@ static __init int mcheck_init_device(void)
 
 err_register:
 	unregister_syscore_ops(&mce_syscore_ops);
+	cpuhp_remove_state(hp_online);
 
-err_device_create:
-	/*
-	 * We didn't keep track of which devices were created above, but
-	 * even if we had, the set of online cpus might have changed.
-	 * Play safe and remove for every possible cpu, since
-	 * mce_device_remove() will do the right thing.
-	 */
-	for_each_possible_cpu(i)
-		mce_device_remove(i);
+err_out_online:
+	cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
 
 err_out_mem:
 	free_cpumask_var(mce_device_initialized);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 9b5403462936..6e4a047e4b68 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -24,7 +24,6 @@
 
 #include <asm/amd_nb.h>
 #include <asm/apic.h>
-#include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/trace/irq_vectors.h>
@@ -55,11 +54,13 @@
 /* Threshold LVT offset is at MSR0xC0000410[15:12] */
 #define SMCA_THR_LVT_OFF	0xF000
 
+static bool thresholding_en;
+
 static const char * const th_names[] = {
 	"load_store",
 	"insn_fetch",
 	"combined_unit",
-	"",
+	"decode_unit",
 	"northbridge",
 	"execution_unit",
 };
@@ -69,7 +70,12 @@ static const char * const smca_umc_block_names[] = {
 	"misc_umc"
 };
 
-struct smca_bank_name smca_bank_names[] = {
+struct smca_bank_name {
+	const char *name;	/* Short name for sysfs */
+	const char *long_name;	/* Long name for pretty-printing */
+};
+
+static struct smca_bank_name smca_names[] = {
 	[SMCA_LS]	= { "load_store",	"Load Store Unit" },
 	[SMCA_IF]	= { "insn_fetch",	"Instruction Fetch Unit" },
 	[SMCA_L2_CACHE]	= { "l2_cache",		"L2 Cache" },
@@ -84,9 +90,25 @@ struct smca_bank_name smca_bank_names[] = {
 	[SMCA_PSP]	= { "psp",		"Platform Security Processor" },
 	[SMCA_SMU]	= { "smu",		"System Management Unit" },
 };
-EXPORT_SYMBOL_GPL(smca_bank_names);
 
-static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
+const char *smca_get_name(enum smca_bank_types t)
+{
+	if (t >= N_SMCA_BANK_TYPES)
+		return NULL;
+
+	return smca_names[t].name;
+}
+
+const char *smca_get_long_name(enum smca_bank_types t)
+{
+	if (t >= N_SMCA_BANK_TYPES)
+		return NULL;
+
+	return smca_names[t].long_name;
+}
+EXPORT_SYMBOL_GPL(smca_get_long_name);
+
+static struct smca_hwid smca_hwid_mcatypes[] = {
 	/* { bank_type, hwid_mcatype, xec_bitmap } */
 
 	/* ZN Core (HWID=0xB0) MCA types */
@@ -116,7 +138,7 @@ static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
 	{ SMCA_SMU,	 HWID_MCATYPE(0x01, 0x0), 0x1 },
 };
 
-struct smca_bank_info smca_banks[MAX_NR_BANKS];
+struct smca_bank smca_banks[MAX_NR_BANKS];
 EXPORT_SYMBOL_GPL(smca_banks);
 
 /*
@@ -142,35 +164,35 @@ static void default_deferred_error_interrupt(void)
 }
 void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
 
-/*
- * CPU Initialization
- */
-
 static void get_smca_bank_info(unsigned int bank)
 {
 	unsigned int i, hwid_mcatype, cpu = smp_processor_id();
-	struct smca_hwid_mcatype *type;
-	u32 high, instanceId;
-	u16 hwid, mcatype;
+	struct smca_hwid *s_hwid;
+	u32 high, instance_id;
 
 	/* Collect bank_info using CPU 0 for now. */
 	if (cpu)
 		return;
 
-	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) {
+	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) {
 		pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
 		return;
 	}
 
-	hwid = high & MCI_IPID_HWID;
-	mcatype = (high & MCI_IPID_MCATYPE) >> 16;
-	hwid_mcatype = HWID_MCATYPE(hwid, mcatype);
+	hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
+				    (high & MCI_IPID_MCATYPE) >> 16);
 
 	for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
-		type = &smca_hwid_mcatypes[i];
-		if (hwid_mcatype == type->hwid_mcatype) {
-			smca_banks[bank].type = type;
-			smca_banks[bank].type_instance = instanceId;
+		s_hwid = &smca_hwid_mcatypes[i];
+		if (hwid_mcatype == s_hwid->hwid_mcatype) {
+
+			WARN(smca_banks[bank].hwid,
+			     "Bank %s already initialized!\n",
+			     smca_get_name(s_hwid->bank_type));
+
+			smca_banks[bank].hwid = s_hwid;
+			smca_banks[bank].id = instance_id;
+			smca_banks[bank].sysfs_id = s_hwid->count++;
 			break;
 		}
 	}
@@ -533,6 +555,206 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 		deferred_error_interrupt_enable(c);
 }
 
+int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
+{
+	u64 dram_base_addr, dram_limit_addr, dram_hole_base;
+	/* We start from the normalized address */
+	u64 ret_addr = norm_addr;
+
+	u32 tmp;
+
+	u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
+	u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
+	u8 intlv_addr_sel, intlv_addr_bit;
+	u8 num_intlv_bits, hashed_bit;
+	u8 lgcy_mmio_hole_en, base = 0;
+	u8 cs_mask, cs_id = 0;
+	bool hash_enabled = false;
+
+	/* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
+	if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
+		goto out_err;
+
+	/* Remove HiAddrOffset from normalized address, if enabled: */
+	if (tmp & BIT(0)) {
+		u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
+
+		if (norm_addr >= hi_addr_offset) {
+			ret_addr -= hi_addr_offset;
+			base = 1;
+		}
+	}
+
+	/* Read D18F0x110 (DramBaseAddress). */
+	if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
+		goto out_err;
+
+	/* Check if address range is valid. */
+	if (!(tmp & BIT(0))) {
+		pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
+			__func__, tmp);
+		goto out_err;
+	}
+
+	lgcy_mmio_hole_en = tmp & BIT(1);
+	intlv_num_chan	  = (tmp >> 4) & 0xF;
+	intlv_addr_sel	  = (tmp >> 8) & 0x7;
+	dram_base_addr	  = (tmp & GENMASK_ULL(31, 12)) << 16;
+
+	/* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
+	if (intlv_addr_sel > 3) {
+		pr_err("%s: Invalid interleave address select %d.\n",
+			__func__, intlv_addr_sel);
+		goto out_err;
+	}
+
+	/* Read D18F0x114 (DramLimitAddress). */
+	if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
+		goto out_err;
+
+	intlv_num_sockets = (tmp >> 8) & 0x1;
+	intlv_num_dies	  = (tmp >> 10) & 0x3;
+	dram_limit_addr	  = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
+
+	intlv_addr_bit = intlv_addr_sel + 8;
+
+	/* Re-use intlv_num_chan by setting it equal to log2(#channels) */
+	switch (intlv_num_chan) {
+	case 0:	intlv_num_chan = 0; break;
+	case 1: intlv_num_chan = 1; break;
+	case 3: intlv_num_chan = 2; break;
+	case 5:	intlv_num_chan = 3; break;
+	case 7:	intlv_num_chan = 4; break;
+
+	case 8: intlv_num_chan = 1;
+		hash_enabled = true;
+		break;
+	default:
+		pr_err("%s: Invalid number of interleaved channels %d.\n",
+			__func__, intlv_num_chan);
+		goto out_err;
+	}
+
+	num_intlv_bits = intlv_num_chan;
+
+	if (intlv_num_dies > 2) {
+		pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
+			__func__, intlv_num_dies);
+		goto out_err;
+	}
+
+	num_intlv_bits += intlv_num_dies;
+
+	/* Add a bit if sockets are interleaved. */
+	num_intlv_bits += intlv_num_sockets;
+
+	/* Assert num_intlv_bits <= 4 */
+	if (num_intlv_bits > 4) {
+		pr_err("%s: Invalid interleave bits %d.\n",
+			__func__, num_intlv_bits);
+		goto out_err;
+	}
+
+	if (num_intlv_bits > 0) {
+		u64 temp_addr_x, temp_addr_i, temp_addr_y;
+		u8 die_id_bit, sock_id_bit, cs_fabric_id;
+
+		/*
+		 * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
+		 * This is the fabric id for this coherent slave. Use
+		 * umc/channel# as instance id of the coherent slave
+		 * for FICAA.
+		 */
+		if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
+			goto out_err;
+
+		cs_fabric_id = (tmp >> 8) & 0xFF;
+		die_id_bit   = 0;
+
+		/* If interleaved over more than 1 channel: */
+		if (intlv_num_chan) {
+			die_id_bit = intlv_num_chan;
+			cs_mask	   = (1 << die_id_bit) - 1;
+			cs_id	   = cs_fabric_id & cs_mask;
+		}
+
+		sock_id_bit = die_id_bit;
+
+		/* Read D18F1x208 (SystemFabricIdMask). */
+		if (intlv_num_dies || intlv_num_sockets)
+			if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
+				goto out_err;
+
+		/* If interleaved over more than 1 die. */
+		if (intlv_num_dies) {
+			sock_id_bit  = die_id_bit + intlv_num_dies;
+			die_id_shift = (tmp >> 24) & 0xF;
+			die_id_mask  = (tmp >> 8) & 0xFF;
+
+			cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
+		}
+
+		/* If interleaved over more than 1 socket. */
+		if (intlv_num_sockets) {
+			socket_id_shift	= (tmp >> 28) & 0xF;
+			socket_id_mask	= (tmp >> 16) & 0xFF;
+
+			cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
+		}
+
+		/*
+		 * The pre-interleaved address consists of XXXXXXIIIYYYYY
+		 * where III is the ID for this CS, and XXXXXXYYYYY are the
+		 * address bits from the post-interleaved address.
+		 * "num_intlv_bits" has been calculated to tell us how many "I"
+		 * bits there are. "intlv_addr_bit" tells us how many "Y" bits
+		 * there are (where "I" starts).
+		 */
+		temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
+		temp_addr_i = (cs_id << intlv_addr_bit);
+		temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
+		ret_addr    = temp_addr_x | temp_addr_i | temp_addr_y;
+	}
+
+	/* Add dram base address */
+	ret_addr += dram_base_addr;
+
+	/* If legacy MMIO hole enabled */
+	if (lgcy_mmio_hole_en) {
+		if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
+			goto out_err;
+
+		dram_hole_base = tmp & GENMASK(31, 24);
+		if (ret_addr >= dram_hole_base)
+			ret_addr += (BIT_ULL(32) - dram_hole_base);
+	}
+
+	if (hash_enabled) {
+		/* Save some parentheses and grab ls-bit at the end. */
+		hashed_bit =	(ret_addr >> 12) ^
+				(ret_addr >> 18) ^
+				(ret_addr >> 21) ^
+				(ret_addr >> 30) ^
+				cs_id;
+
+		hashed_bit &= BIT(0);
+
+		if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
+			ret_addr ^= BIT(intlv_addr_bit);
+	}
+
+	/* Is calculated system address is above DRAM limit address? */
+	if (ret_addr > dram_limit_addr)
+		goto out_err;
+
+	*sys_addr = ret_addr;
+	return 0;
+
+out_err:
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
+
 static void
 __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
 {
@@ -556,7 +778,8 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
 	mce_setup(&m);
 
 	m.status = status;
-	m.bank = bank;
+	m.bank   = bank;
+	m.tsc	 = rdtsc();
 
 	if (threshold_err)
 		m.misc = misc;
@@ -593,14 +816,14 @@ static inline void __smp_deferred_error_interrupt(void)
 	deferred_error_int_vector();
 }
 
-asmlinkage __visible void smp_deferred_error_interrupt(void)
+asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void)
 {
 	entering_irq();
 	__smp_deferred_error_interrupt();
 	exiting_ack_irq();
 }
 
-asmlinkage __visible void smp_trace_deferred_error_interrupt(void)
+asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
 {
 	entering_irq();
 	trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
@@ -645,6 +868,7 @@ static void amd_threshold_interrupt(void)
 {
 	u32 low = 0, high = 0, address = 0;
 	unsigned int bank, block, cpu = smp_processor_id();
+	struct thresh_restart tr;
 
 	/* assume first bank caused it */
 	for (bank = 0; bank < mca_cfg.banks; ++bank) {
@@ -681,6 +905,11 @@ static void amd_threshold_interrupt(void)
 
 log:
 	__log_error(bank, false, true, ((u64)high << 32) | low);
+
+	/* Reset threshold block after logging error. */
+	memset(&tr, 0, sizeof(tr));
+	tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
+	threshold_restart_bank(&tr);
 }
 
 /*
@@ -826,10 +1055,10 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
 		return th_names[bank];
 	}
 
-	if (!smca_banks[bank].type)
+	if (!smca_banks[bank].hwid)
 		return NULL;
 
-	bank_type = smca_banks[bank].type->bank_type;
+	bank_type = smca_banks[bank].hwid->bank_type;
 
 	if (b && bank_type == SMCA_UMC) {
 		if (b->block < ARRAY_SIZE(smca_umc_block_names))
@@ -837,9 +1066,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
 		return NULL;
 	}
 
+	if (smca_banks[bank].hwid->count == 1)
+		return smca_get_name(bank_type);
+
 	snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
-		 "%s_%x", smca_bank_names[bank_type].name,
-			  smca_banks[bank].type_instance);
+		 "%s_%x", smca_get_name(bank_type),
+			  smca_banks[bank].sysfs_id);
 	return buf_mcatype;
 }
 
@@ -955,6 +1187,9 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
 	const char *name = get_name(bank, NULL);
 	int err = 0;
 
+	if (!dev)
+		return -ENODEV;
+
 	if (is_shared_bank(bank)) {
 		nb = node_to_amd_nb(amd_get_nb_id(cpu));
 
@@ -1010,31 +1245,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
 	return err;
 }
 
-/* create dir/files for all valid threshold banks */
-static int threshold_create_device(unsigned int cpu)
-{
-	unsigned int bank;
-	struct threshold_bank **bp;
-	int err = 0;
-
-	bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
-		     GFP_KERNEL);
-	if (!bp)
-		return -ENOMEM;
-
-	per_cpu(threshold_banks, cpu) = bp;
-
-	for (bank = 0; bank < mca_cfg.banks; ++bank) {
-		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
-			continue;
-		err = threshold_create_bank(cpu, bank);
-		if (err)
-			return err;
-	}
-
-	return err;
-}
-
 static void deallocate_threshold_block(unsigned int cpu,
 						 unsigned int bank)
 {
@@ -1102,48 +1312,71 @@ free_out:
 	per_cpu(threshold_banks, cpu)[bank] = NULL;
 }
 
-static void threshold_remove_device(unsigned int cpu)
+int mce_threshold_remove_device(unsigned int cpu)
 {
 	unsigned int bank;
 
+	if (!thresholding_en)
+		return 0;
+
 	for (bank = 0; bank < mca_cfg.banks; ++bank) {
 		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
 			continue;
 		threshold_remove_bank(cpu, bank);
 	}
 	kfree(per_cpu(threshold_banks, cpu));
+	per_cpu(threshold_banks, cpu) = NULL;
+	return 0;
 }
 
-/* get notified when a cpu comes on/off */
-static void
-amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
+/* create dir/files for all valid threshold banks */
+int mce_threshold_create_device(unsigned int cpu)
 {
-	switch (action) {
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		threshold_create_device(cpu);
-		break;
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		threshold_remove_device(cpu);
-		break;
-	default:
-		break;
+	unsigned int bank;
+	struct threshold_bank **bp;
+	int err = 0;
+
+	if (!thresholding_en)
+		return 0;
+
+	bp = per_cpu(threshold_banks, cpu);
+	if (bp)
+		return 0;
+
+	bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
+		     GFP_KERNEL);
+	if (!bp)
+		return -ENOMEM;
+
+	per_cpu(threshold_banks, cpu) = bp;
+
+	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+		if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+			continue;
+		err = threshold_create_bank(cpu, bank);
+		if (err)
+			goto err;
 	}
+	return err;
+err:
+	mce_threshold_remove_device(cpu);
+	return err;
 }
 
 static __init int threshold_init_device(void)
 {
 	unsigned lcpu = 0;
 
+	if (mce_threshold_vector == amd_threshold_interrupt)
+		thresholding_en = true;
+
 	/* to hit CPUs online before the notifier is up */
 	for_each_online_cpu(lcpu) {
-		int err = threshold_create_device(lcpu);
+		int err = mce_threshold_create_device(lcpu);
 
 		if (err)
 			return err;
 	}
-	threshold_cpu_callback = amd_64_threshold_cpu_callback;
 
 	return 0;
 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 1defb8ea882c..190b3e6cef4d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -11,6 +11,8 @@
 #include <linux/sched.h>
 #include <linux/cpumask.h>
 #include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/intel-family.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -130,7 +132,7 @@ bool mce_intel_cmci_poll(void)
 	 * Reset the counter if we've logged an error in the last poll
 	 * during the storm.
 	 */
-	if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
+	if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
 		this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
 	else
 		this_cpu_dec(cmci_backoff_cnt);
@@ -342,7 +344,7 @@ void cmci_recheck(void)
 		return;
 
 	local_irq_save(flags);
-	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+	machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
 	local_irq_restore(flags);
 }
 
@@ -464,11 +466,46 @@ static void intel_clear_lmce(void)
 	wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
 }
 
+static void intel_ppin_init(struct cpuinfo_x86 *c)
+{
+	unsigned long long val;
+
+	/*
+	 * Even if testing the presence of the MSR would be enough, we don't
+	 * want to risk the situation where other models reuse this MSR for
+	 * other purposes.
+	 */
+	switch (c->x86_model) {
+	case INTEL_FAM6_IVYBRIDGE_X:
+	case INTEL_FAM6_HASWELL_X:
+	case INTEL_FAM6_BROADWELL_XEON_D:
+	case INTEL_FAM6_BROADWELL_X:
+	case INTEL_FAM6_SKYLAKE_X:
+		if (rdmsrl_safe(MSR_PPIN_CTL, &val))
+			return;
+
+		if ((val & 3UL) == 1UL) {
+			/* PPIN available but disabled: */
+			return;
+		}
+
+		/* If PPIN is disabled, but not locked, try to enable: */
+		if (!(val & 3UL)) {
+			wrmsrl_safe(MSR_PPIN_CTL,  val | 2UL);
+			rdmsrl_safe(MSR_PPIN_CTL, &val);
+		}
+
+		if ((val & 3UL) == 2UL)
+			set_cpu_cap(c, X86_FEATURE_INTEL_PPIN);
+	}
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
 	intel_init_cmci();
 	intel_init_lmce();
+	intel_ppin_init(c);
 }
 
 void mce_intel_feature_clear(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 6b9dc4d18ccc..d7cc190ae457 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -6,7 +6,7 @@
  *
  * Maintains a counter in /sys that keeps track of the number of thermal
  * events, such that the user knows how bad the thermal problem might be
- * (since the logging to syslog and mcelog is rate limited).
+ * (since the logging to syslog is rate limited).
  *
  * Author: Dmitriy Zavin (dmitriyz@google.com)
  *
@@ -26,7 +26,6 @@
 
 #include <asm/processor.h>
 #include <asm/apic.h>
-#include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/trace/irq_vectors.h>
@@ -142,13 +141,8 @@ static struct attribute_group thermal_attr_group = {
  * IRQ has been acknowledged.
  *
  * It will take care of rate limiting and printing messages to the syslog.
- *
- * Returns: 0 : Event should NOT be further logged, i.e. still in
- *              "timeout" from previous log message.
- *          1 : Event should be logged further, and a message has been
- *              printed to the syslog.
  */
-static int therm_throt_process(bool new_event, int event, int level)
+static void therm_throt_process(bool new_event, int event, int level)
 {
 	struct _thermal_state *state;
 	unsigned int this_cpu = smp_processor_id();
@@ -163,16 +157,16 @@ static int therm_throt_process(bool new_event, int event, int level)
 		else if (event == POWER_LIMIT_EVENT)
 			state = &pstate->core_power_limit;
 		else
-			 return 0;
+			return;
 	} else if (level == PACKAGE_LEVEL) {
 		if (event == THERMAL_THROTTLING_EVENT)
 			state = &pstate->package_throttle;
 		else if (event == POWER_LIMIT_EVENT)
 			state = &pstate->package_power_limit;
 		else
-			return 0;
+			return;
 	} else
-		return 0;
+		return;
 
 	old_event = state->new_event;
 	state->new_event = new_event;
@@ -182,7 +176,7 @@ static int therm_throt_process(bool new_event, int event, int level)
 
 	if (time_before64(now, state->next_check) &&
 			state->count != state->last_count)
-		return 0;
+		return;
 
 	state->next_check = now + CHECK_INTERVAL;
 	state->last_count = state->count;
@@ -194,16 +188,14 @@ static int therm_throt_process(bool new_event, int event, int level)
 				this_cpu,
 				level == CORE_LEVEL ? "Core" : "Package",
 				state->count);
-		return 1;
+		return;
 	}
 	if (old_event) {
 		if (event == THERMAL_THROTTLING_EVENT)
 			pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
 				level == CORE_LEVEL ? "Core" : "Package");
-		return 1;
+		return;
 	}
-
-	return 0;
 }
 
 static int thresh_event_valid(int level, int event)
@@ -271,58 +263,32 @@ static void thermal_throttle_remove_dev(struct device *dev)
 }
 
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
-thermal_throttle_cpu_callback(struct notifier_block *nfb,
-			      unsigned long action,
-			      void *hcpu)
+static int thermal_throttle_online(unsigned int cpu)
 {
-	unsigned int cpu = (unsigned long)hcpu;
-	struct device *dev;
-	int err = 0;
-
-	dev = get_cpu_device(cpu);
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		err = thermal_throttle_add_dev(dev, cpu);
-		WARN_ON(err);
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		thermal_throttle_remove_dev(dev);
-		break;
-	}
-	return notifier_from_errno(err);
+	struct device *dev = get_cpu_device(cpu);
+
+	return thermal_throttle_add_dev(dev, cpu);
 }
 
-static struct notifier_block thermal_throttle_cpu_notifier =
+static int thermal_throttle_offline(unsigned int cpu)
 {
-	.notifier_call = thermal_throttle_cpu_callback,
-};
+	struct device *dev = get_cpu_device(cpu);
+
+	thermal_throttle_remove_dev(dev);
+	return 0;
+}
 
 static __init int thermal_throttle_init_device(void)
 {
-	unsigned int cpu = 0;
-	int err;
+	int ret;
 
 	if (!atomic_read(&therm_throt_en))
 		return 0;
 
-	cpu_notifier_register_begin();
-
-	/* connect live CPUs to sysfs */
-	for_each_online_cpu(cpu) {
-		err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
-		WARN_ON(err);
-	}
-
-	__register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
-	cpu_notifier_register_done();
-
-	return 0;
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
+				thermal_throttle_online,
+				thermal_throttle_offline);
+	return ret < 0 ? ret : 0;
 }
 device_initcall(thermal_throttle_init_device);
 
@@ -392,10 +358,9 @@ static void intel_thermal_interrupt(void)
 	/* Check for violation of core thermal thresholds*/
 	notify_thresholds(msr_val);
 
-	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
-				THERMAL_THROTTLING_EVENT,
-				CORE_LEVEL) != 0)
-		mce_log_therm_throt_event(msr_val);
+	therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+			    THERMAL_THROTTLING_EVENT,
+			    CORE_LEVEL);
 
 	if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
 		therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
@@ -431,14 +396,16 @@ static inline void __smp_thermal_interrupt(void)
 	smp_thermal_vector();
 }
 
-asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry
+smp_thermal_interrupt(struct pt_regs *regs)
 {
 	entering_irq();
 	__smp_thermal_interrupt();
 	exiting_ack_irq();
 }
 
-asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry
+smp_trace_thermal_interrupt(struct pt_regs *regs)
 {
 	entering_irq();
 	trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index fcf9ae9384f4..bb0e75eed10a 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -6,7 +6,6 @@
 
 #include <asm/irq_vectors.h>
 #include <asm/apic.h>
-#include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/trace/irq_vectors.h>
 
@@ -24,14 +23,14 @@ static inline void __smp_threshold_interrupt(void)
 	mce_threshold_vector();
 }
 
-asmlinkage __visible void smp_threshold_interrupt(void)
+asmlinkage __visible void __irq_entry smp_threshold_interrupt(void)
 {
 	entering_irq();
 	__smp_threshold_interrupt();
 	exiting_ack_irq();
 }
 
-asmlinkage __visible void smp_trace_threshold_interrupt(void)
+asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void)
 {
 	entering_irq();
 	trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
index 220b1a508513..ba12e8aa4a45 100644
--- a/arch/x86/kernel/cpu/microcode/Makefile
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -1,4 +1,4 @@
 microcode-y				:= core.o
 obj-$(CONFIG_MICROCODE)			+= microcode.o
-microcode-$(CONFIG_MICROCODE_INTEL)	+= intel.o intel_lib.o
+microcode-$(CONFIG_MICROCODE_INTEL)	+= intel.o
 microcode-$(CONFIG_MICROCODE_AMD)	+= amd.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 017bda12caae..7889ae492af0 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -5,6 +5,7 @@
  *  CPUs and later.
  *
  *  Copyright (C) 2008-2011 Advanced Micro Devices Inc.
+ *	          2013-2016 Borislav Petkov <bp@alien8.de>
  *
  *  Author: Peter Oruba <peter.oruba@amd.com>
  *
@@ -39,100 +40,149 @@
 
 static struct equiv_cpu_entry *equiv_cpu_table;
 
-struct ucode_patch {
-	struct list_head plist;
-	void *data;
-	u32 patch_id;
-	u16 equiv_cpu;
-};
-
-static LIST_HEAD(pcache);
-
 /*
  * This points to the current valid container of microcode patches which we will
- * save from the initrd before jettisoning its contents.
+ * save from the initrd/builtin before jettisoning its contents. @mc is the
+ * microcode patch we found to match.
  */
-static u8 *container;
-static size_t container_size;
-static bool ucode_builtin;
+struct cont_desc {
+	struct microcode_amd *mc;
+	u32		     cpuid_1_eax;
+	u32		     psize;
+	u8		     *data;
+	size_t		     size;
+};
 
 static u32 ucode_new_rev;
 static u8 amd_ucode_patch[PATCH_MAX_SIZE];
-static u16 this_equiv_id;
 
-static struct cpio_data ucode_cpio;
+/*
+ * Microcode patch container file is prepended to the initrd in cpio
+ * format. See Documentation/x86/early-microcode.txt
+ */
+static const char
+ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
 
-static struct cpio_data __init find_ucode_in_initrd(void)
+static u16 find_equiv_id(struct equiv_cpu_entry *equiv_table, u32 sig)
 {
-#ifdef CONFIG_BLK_DEV_INITRD
-	char *path;
-	void *start;
-	size_t size;
+	for (; equiv_table && equiv_table->installed_cpu; equiv_table++) {
+		if (sig == equiv_table->installed_cpu)
+			return equiv_table->equiv_cpu;
+	}
 
-	/*
-	 * Microcode patch container file is prepended to the initrd in cpio
-	 * format. See Documentation/x86/early-microcode.txt
-	 */
-	static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
+	return 0;
+}
 
-#ifdef CONFIG_X86_32
-	struct boot_params *p;
+/*
+ * This scans the ucode blob for the proper container as we can have multiple
+ * containers glued together. Returns the equivalence ID from the equivalence
+ * table or 0 if none found.
+ * Returns the amount of bytes consumed while scanning. @desc contains all the
+ * data we're going to use in later stages of the application.
+ */
+static ssize_t parse_container(u8 *ucode, ssize_t size, struct cont_desc *desc)
+{
+	struct equiv_cpu_entry *eq;
+	ssize_t orig_size = size;
+	u32 *hdr = (u32 *)ucode;
+	u16 eq_id;
+	u8 *buf;
 
-	/*
-	 * On 32-bit, early load occurs before paging is turned on so we need
-	 * to use physical addresses.
-	 */
-	p       = (struct boot_params *)__pa_nodebug(&boot_params);
-	path    = (char *)__pa_nodebug(ucode_path);
-	start   = (void *)p->hdr.ramdisk_image;
-	size    = p->hdr.ramdisk_size;
-#else
-	path    = ucode_path;
-	start   = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
-	size    = boot_params.hdr.ramdisk_size;
-#endif /* !CONFIG_X86_32 */
+	/* Am I looking at an equivalence table header? */
+	if (hdr[0] != UCODE_MAGIC ||
+	    hdr[1] != UCODE_EQUIV_CPU_TABLE_TYPE ||
+	    hdr[2] == 0)
+		return CONTAINER_HDR_SZ;
 
-	return find_cpio_data(path, start, size, NULL);
-#else
-	return (struct cpio_data){ NULL, 0, "" };
-#endif
-}
+	buf = ucode;
 
-static size_t compute_container_size(u8 *data, u32 total_size)
-{
-	size_t size = 0;
-	u32 *header = (u32 *)data;
+	eq = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ);
 
-	if (header[0] != UCODE_MAGIC ||
-	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
-	    header[2] == 0)                            /* size */
-		return size;
+	/* Find the equivalence ID of our CPU in this table: */
+	eq_id = find_equiv_id(eq, desc->cpuid_1_eax);
 
-	size = header[2] + CONTAINER_HDR_SZ;
-	total_size -= size;
-	data += size;
+	buf  += hdr[2] + CONTAINER_HDR_SZ;
+	size -= hdr[2] + CONTAINER_HDR_SZ;
 
-	while (total_size) {
-		u16 patch_size;
+	/*
+	 * Scan through the rest of the container to find where it ends. We do
+	 * some basic sanity-checking too.
+	 */
+	while (size > 0) {
+		struct microcode_amd *mc;
+		u32 patch_size;
 
-		header = (u32 *)data;
+		hdr = (u32 *)buf;
 
-		if (header[0] != UCODE_UCODE_TYPE)
+		if (hdr[0] != UCODE_UCODE_TYPE)
 			break;
 
-		/*
-		 * Sanity-check patch size.
-		 */
-		patch_size = header[1];
+		/* Sanity-check patch size. */
+		patch_size = hdr[1];
 		if (patch_size > PATCH_MAX_SIZE)
 			break;
 
-		size	   += patch_size + SECTION_HDR_SIZE;
-		data	   += patch_size + SECTION_HDR_SIZE;
-		total_size -= patch_size + SECTION_HDR_SIZE;
+		/* Skip patch section header: */
+		buf  += SECTION_HDR_SIZE;
+		size -= SECTION_HDR_SIZE;
+
+		mc = (struct microcode_amd *)buf;
+		if (eq_id == mc->hdr.processor_rev_id) {
+			desc->psize = patch_size;
+			desc->mc = mc;
+		}
+
+		buf  += patch_size;
+		size -= patch_size;
 	}
 
-	return size;
+	/*
+	 * If we have found a patch (desc->mc), it means we're looking at the
+	 * container which has a patch for this CPU so return 0 to mean, @ucode
+	 * already points to the proper container. Otherwise, we return the size
+	 * we scanned so that we can advance to the next container in the
+	 * buffer.
+	 */
+	if (desc->mc) {
+		desc->data = ucode;
+		desc->size = orig_size - size;
+
+		return 0;
+	}
+
+	return orig_size - size;
+}
+
+/*
+ * Scan the ucode blob for the proper container as we can have multiple
+ * containers glued together.
+ */
+static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc)
+{
+	ssize_t rem = size;
+
+	while (rem >= 0) {
+		ssize_t s = parse_container(ucode, rem, desc);
+		if (!s)
+			return;
+
+		ucode += s;
+		rem   -= s;
+	}
+}
+
+static int __apply_microcode_amd(struct microcode_amd *mc)
+{
+	u32 rev, dummy;
+
+	native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code);
+
+	/* verify patch application was successful */
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+	if (rev != mc->hdr.patch_id)
+		return -1;
+
+	return 0;
 }
 
 /*
@@ -143,128 +193,50 @@ static size_t compute_container_size(u8 *data, u32 total_size)
  * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
  * load_microcode_amd() to save equivalent cpu table and microcode patches in
  * kernel heap memory.
+ *
+ * Returns true if container found (sets @desc), false otherwise.
  */
-static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
+static bool
+apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch)
 {
-	struct equiv_cpu_entry *eq;
-	size_t *cont_sz;
-	u32 *header;
-	u8  *data, **cont;
+	struct cont_desc desc = { 0 };
 	u8 (*patch)[PATCH_MAX_SIZE];
-	u16 eq_id = 0;
-	int offset, left;
-	u32 rev, eax, ebx, ecx, edx;
-	u32 *new_rev;
+	struct microcode_amd *mc;
+	u32 rev, dummy, *new_rev;
+	bool ret = false;
 
 #ifdef CONFIG_X86_32
 	new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
-	cont_sz = (size_t *)__pa_nodebug(&container_size);
-	cont	= (u8 **)__pa_nodebug(&container);
 	patch	= (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
 #else
 	new_rev = &ucode_new_rev;
-	cont_sz = &container_size;
-	cont	= &container;
 	patch	= &amd_ucode_patch;
 #endif
 
-	data   = ucode;
-	left   = size;
-	header = (u32 *)data;
-
-	/* find equiv cpu table */
-	if (header[0] != UCODE_MAGIC ||
-	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
-	    header[2] == 0)                            /* size */
-		return;
+	desc.cpuid_1_eax = cpuid_1_eax;
 
-	eax = 0x00000001;
-	ecx = 0;
-	native_cpuid(&eax, &ebx, &ecx, &edx);
+	scan_containers(ucode, size, &desc);
 
-	while (left > 0) {
-		eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
-
-		*cont = data;
-
-		/* Advance past the container header */
-		offset = header[2] + CONTAINER_HDR_SZ;
-		data  += offset;
-		left  -= offset;
-
-		eq_id = find_equiv_id(eq, eax);
-		if (eq_id) {
-			this_equiv_id = eq_id;
-			*cont_sz = compute_container_size(*cont, left + offset);
-
-			/*
-			 * truncate how much we need to iterate over in the
-			 * ucode update loop below
-			 */
-			left = *cont_sz - offset;
-			break;
-		}
+	mc = desc.mc;
+	if (!mc)
+		return ret;
 
-		/*
-		 * support multiple container files appended together. if this
-		 * one does not have a matching equivalent cpu entry, we fast
-		 * forward to the next container file.
-		 */
-		while (left > 0) {
-			header = (u32 *)data;
-			if (header[0] == UCODE_MAGIC &&
-			    header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
-				break;
-
-			offset = header[1] + SECTION_HDR_SIZE;
-			data  += offset;
-			left  -= offset;
-		}
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+	if (rev >= mc->hdr.patch_id)
+		return ret;
 
-		/* mark where the next microcode container file starts */
-		offset    = data - (u8 *)ucode;
-		ucode     = data;
-	}
+	if (!__apply_microcode_amd(mc)) {
+		*new_rev = mc->hdr.patch_id;
+		ret      = true;
 
-	if (!eq_id) {
-		*cont = NULL;
-		*cont_sz = 0;
-		return;
+		if (save_patch)
+			memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE));
 	}
 
-	if (check_current_patch_level(&rev, true))
-		return;
-
-	while (left > 0) {
-		struct microcode_amd *mc;
-
-		header = (u32 *)data;
-		if (header[0] != UCODE_UCODE_TYPE || /* type */
-		    header[1] == 0)                  /* size */
-			break;
-
-		mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
-
-		if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
-
-			if (!__apply_microcode_amd(mc)) {
-				rev = mc->hdr.patch_id;
-				*new_rev = rev;
-
-				if (save_patch)
-					memcpy(patch, mc,
-					       min_t(u32, header[1], PATCH_MAX_SIZE));
-			}
-		}
-
-		offset  = header[1] + SECTION_HDR_SIZE;
-		data   += offset;
-		left   -= offset;
-	}
+	return ret;
 }
 
-static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
-					      unsigned int family)
+static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
 {
 #ifdef CONFIG_X86_64
 	char fw_name[36] = "amd-ucode/microcode_amd.bin";
@@ -279,207 +251,113 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
 #endif
 }
 
-void __init load_ucode_amd_bsp(unsigned int family)
+void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
 {
+	struct ucode_cpu_info *uci;
 	struct cpio_data cp;
-	bool *builtin;
-	void **data;
-	size_t *size;
-
-#ifdef CONFIG_X86_32
-	data =  (void **)__pa_nodebug(&ucode_cpio.data);
-	size = (size_t *)__pa_nodebug(&ucode_cpio.size);
-	builtin = (bool *)__pa_nodebug(&ucode_builtin);
-#else
-	data = &ucode_cpio.data;
-	size = &ucode_cpio.size;
-	builtin = &ucode_builtin;
-#endif
+	const char *path;
+	bool use_pa;
 
-	*builtin = load_builtin_amd_microcode(&cp, family);
-	if (!*builtin)
-		cp = find_ucode_in_initrd();
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		uci	= (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info);
+		path	= (const char *)__pa_nodebug(ucode_path);
+		use_pa	= true;
+	} else {
+		uci     = ucode_cpu_info;
+		path	= ucode_path;
+		use_pa	= false;
+	}
 
-	if (!(cp.data && cp.size))
-		return;
+	if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)))
+		cp = find_microcode_in_initrd(path, use_pa);
 
-	*data = cp.data;
-	*size = cp.size;
+	/* Needed in load_microcode_amd() */
+	uci->cpu_sig.sig = cpuid_1_eax;
 
-	apply_ucode_in_initrd(cp.data, cp.size, true);
+	*ret = cp;
 }
 
-#ifdef CONFIG_X86_32
-/*
- * On 32-bit, since AP's early load occurs before paging is turned on, we
- * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
- * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
- * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
- * which is used upon resume from suspend.
- */
-void load_ucode_amd_ap(void)
+void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax)
 {
-	struct microcode_amd *mc;
-	size_t *usize;
-	void **ucode;
-
-	mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
-	if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
-		__apply_microcode_amd(mc);
-		return;
-	}
+	struct cpio_data cp = { };
 
-	ucode = (void *)__pa_nodebug(&container);
-	usize = (size_t *)__pa_nodebug(&container_size);
-
-	if (!*ucode || !*usize)
+	__load_ucode_amd(cpuid_1_eax, &cp);
+	if (!(cp.data && cp.size))
 		return;
 
-	apply_ucode_in_initrd(*ucode, *usize, false);
+	apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true);
 }
 
-static void __init collect_cpu_sig_on_bsp(void *arg)
+void load_ucode_amd_ap(unsigned int cpuid_1_eax)
 {
-	unsigned int cpu = smp_processor_id();
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	uci->cpu_sig.sig = cpuid_eax(0x00000001);
-}
-
-static void __init get_bsp_sig(void)
-{
-	unsigned int bsp = boot_cpu_data.cpu_index;
-	struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
-
-	if (!uci->cpu_sig.sig)
-		smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
-}
-#else
-void load_ucode_amd_ap(void)
-{
-	unsigned int cpu = smp_processor_id();
-	struct equiv_cpu_entry *eq;
 	struct microcode_amd *mc;
-	u8 *cont = container;
-	u32 rev, eax;
-	u16 eq_id;
-
-	/* Exit if called on the BSP. */
-	if (!cpu)
-		return;
-
-	if (!container)
-		return;
-
-	/*
-	 * 64-bit runs with paging enabled, thus early==false.
-	 */
-	if (check_current_patch_level(&rev, false))
-		return;
-
-	/* Add CONFIG_RANDOMIZE_MEMORY offset. */
-	if (!ucode_builtin)
-		cont += PAGE_OFFSET - __PAGE_OFFSET_BASE;
-
-	eax = cpuid_eax(0x00000001);
-	eq  = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ);
+	struct cpio_data cp;
+	u32 *new_rev, rev, dummy;
 
-	eq_id = find_equiv_id(eq, eax);
-	if (!eq_id)
-		return;
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		mc	= (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
+		new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+	} else {
+		mc	= (struct microcode_amd *)amd_ucode_patch;
+		new_rev = &ucode_new_rev;
+	}
 
-	if (eq_id == this_equiv_id) {
-		mc = (struct microcode_amd *)amd_ucode_patch;
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
 
-		if (mc && rev < mc->hdr.patch_id) {
-			if (!__apply_microcode_amd(mc))
-				ucode_new_rev = mc->hdr.patch_id;
+	/* Check whether we have saved a new patch already: */
+	if (*new_rev && rev < mc->hdr.patch_id) {
+		if (!__apply_microcode_amd(mc)) {
+			*new_rev = mc->hdr.patch_id;
+			return;
 		}
+	}
 
-	} else {
-		if (!ucode_cpio.data)
-			return;
+	__load_ucode_amd(cpuid_1_eax, &cp);
+	if (!(cp.data && cp.size))
+		return;
 
-		/*
-		 * AP has a different equivalence ID than BSP, looks like
-		 * mixed-steppings silicon so go through the ucode blob anew.
-		 */
-		apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
-	}
+	apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false);
 }
-#endif
 
-int __init save_microcode_in_initrd_amd(void)
+static enum ucode_state
+load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size);
+
+int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
 {
-	unsigned long cont;
-	int retval = 0;
+	struct cont_desc desc = { 0 };
 	enum ucode_state ret;
-	u8 *cont_va;
-	u32 eax;
+	struct cpio_data cp;
 
-	if (!container)
+	cp = find_microcode_in_initrd(ucode_path, false);
+	if (!(cp.data && cp.size))
 		return -EINVAL;
 
-#ifdef CONFIG_X86_32
-	get_bsp_sig();
-	cont	= (unsigned long)container;
-	cont_va = __va(container);
-#else
-	/*
-	 * We need the physical address of the container for both bitness since
-	 * boot_params.hdr.ramdisk_image is a physical address.
-	 */
-	cont    = __pa_nodebug(container);
-	cont_va = container;
-#endif
-
-	/*
-	 * Take into account the fact that the ramdisk might get relocated and
-	 * therefore we need to recompute the container's position in virtual
-	 * memory space.
-	 */
-	if (relocated_ramdisk)
-		container = (u8 *)(__va(relocated_ramdisk) +
-			     (cont - boot_params.hdr.ramdisk_image));
-	else
-		container = cont_va;
-
-	/* Add CONFIG_RANDOMIZE_MEMORY offset. */
-	if (!ucode_builtin)
-		container += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+	desc.cpuid_1_eax = cpuid_1_eax;
 
-	eax   = cpuid_eax(0x00000001);
-	eax   = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+	scan_containers(cp.data, cp.size, &desc);
+	if (!desc.mc)
+		return -EINVAL;
 
-	ret = load_microcode_amd(smp_processor_id(), eax, container, container_size);
+	ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax),
+				 desc.data, desc.size);
 	if (ret != UCODE_OK)
-		retval = -EINVAL;
-
-	/*
-	 * This will be freed any msec now, stash patches for the current
-	 * family and switch to patch cache for cpu hotplug, etc later.
-	 */
-	container = NULL;
-	container_size = 0;
+		return -EINVAL;
 
-	return retval;
+	return 0;
 }
 
 void reload_ucode_amd(void)
 {
 	struct microcode_amd *mc;
-	u32 rev;
+	u32 rev, dummy;
 
-	/*
-	 * early==false because this is a syscore ->resume path and by
-	 * that time paging is long enabled.
-	 */
-	if (check_current_patch_level(&rev, false))
+	mc = (struct microcode_amd *)amd_ucode_patch;
+	if (!mc)
 		return;
 
-	mc = (struct microcode_amd *)amd_ucode_patch;
+	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
 
-	if (mc && rev < mc->hdr.patch_id) {
+	if (rev < mc->hdr.patch_id) {
 		if (!__apply_microcode_amd(mc)) {
 			ucode_new_rev = mc->hdr.patch_id;
 			pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
@@ -513,7 +391,7 @@ static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
 {
 	struct ucode_patch *p;
 
-	list_for_each_entry(p, &pcache, plist)
+	list_for_each_entry(p, &microcode_cache, plist)
 		if (p->equiv_cpu == equiv_cpu)
 			return p;
 	return NULL;
@@ -523,7 +401,7 @@ static void update_cache(struct ucode_patch *new_patch)
 {
 	struct ucode_patch *p;
 
-	list_for_each_entry(p, &pcache, plist) {
+	list_for_each_entry(p, &microcode_cache, plist) {
 		if (p->equiv_cpu == new_patch->equiv_cpu) {
 			if (p->patch_id >= new_patch->patch_id)
 				/* we already have the latest patch */
@@ -536,14 +414,14 @@ static void update_cache(struct ucode_patch *new_patch)
 		}
 	}
 	/* no patch found, add it */
-	list_add_tail(&new_patch->plist, &pcache);
+	list_add_tail(&new_patch->plist, &microcode_cache);
 }
 
 static void free_cache(void)
 {
 	struct ucode_patch *p, *tmp;
 
-	list_for_each_entry_safe(p, tmp, &pcache, plist) {
+	list_for_each_entry_safe(p, tmp, &microcode_cache, plist) {
 		__list_del(p->plist.prev, p->plist.next);
 		kfree(p->data);
 		kfree(p);
@@ -616,74 +494,13 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size,
 	return patch_size;
 }
 
-/*
- * Those patch levels cannot be updated to newer ones and thus should be final.
- */
-static u32 final_levels[] = {
-	0x01000098,
-	0x0100009f,
-	0x010000af,
-	0, /* T-101 terminator */
-};
-
-/*
- * Check the current patch level on this CPU.
- *
- * @rev: Use it to return the patch level. It is set to 0 in the case of
- * error.
- *
- * Returns:
- *  - true: if update should stop
- *  - false: otherwise
- */
-bool check_current_patch_level(u32 *rev, bool early)
-{
-	u32 lvl, dummy, i;
-	bool ret = false;
-	u32 *levels;
-
-	native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
-
-	if (IS_ENABLED(CONFIG_X86_32) && early)
-		levels = (u32 *)__pa_nodebug(&final_levels);
-	else
-		levels = final_levels;
-
-	for (i = 0; levels[i]; i++) {
-		if (lvl == levels[i]) {
-			lvl = 0;
-			ret = true;
-			break;
-		}
-	}
-
-	if (rev)
-		*rev = lvl;
-
-	return ret;
-}
-
-int __apply_microcode_amd(struct microcode_amd *mc_amd)
-{
-	u32 rev, dummy;
-
-	native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
-
-	/* verify patch application was successful */
-	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-	if (rev != mc_amd->hdr.patch_id)
-		return -1;
-
-	return 0;
-}
-
-int apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
 {
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	struct microcode_amd *mc_amd;
 	struct ucode_cpu_info *uci;
 	struct ucode_patch *p;
-	u32 rev;
+	u32 rev, dummy;
 
 	BUG_ON(raw_smp_processor_id() != cpu);
 
@@ -696,8 +513,7 @@ int apply_microcode_amd(int cpu)
 	mc_amd  = p->data;
 	uci->mc = p->data;
 
-	if (check_current_patch_level(&rev, false))
-		return -1;
+	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
 
 	/* need to apply patch? */
 	if (rev >= mc_amd->hdr.patch_id) {
@@ -860,7 +676,8 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
 	return UCODE_OK;
 }
 
-enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
+static enum ucode_state
+load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
 {
 	enum ucode_state ret;
 
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 5ce5155f0695..b4a4cd39b358 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
  *	      2006	Shaohua Li <shaohua.li@intel.com>
- *	      2013-2015	Borislav Petkov <bp@alien8.de>
+ *	      2013-2016	Borislav Petkov <bp@alien8.de>
  *
  * X86 CPU microcode early update for Linux:
  *
@@ -39,11 +39,16 @@
 #include <asm/microcode.h>
 #include <asm/processor.h>
 #include <asm/cmdline.h>
+#include <asm/setup.h>
 
-#define MICROCODE_VERSION	"2.01"
+#define DRIVER_VERSION	"2.2"
 
 static struct microcode_ops	*microcode_ops;
-static bool dis_ucode_ldr;
+static bool dis_ucode_ldr = true;
+
+bool initrd_gone;
+
+LIST_HEAD(microcode_cache);
 
 /*
  * Synchronization.
@@ -61,15 +66,47 @@ static DEFINE_MUTEX(microcode_mutex);
 
 struct ucode_cpu_info		ucode_cpu_info[NR_CPUS];
 
-/*
- * Operations that are run on a target cpu:
- */
-
 struct cpu_info_ctx {
 	struct cpu_signature	*cpu_sig;
 	int			err;
 };
 
+/*
+ * Those patch levels cannot be updated to newer ones and thus should be final.
+ */
+static u32 final_levels[] = {
+	0x01000098,
+	0x0100009f,
+	0x010000af,
+	0, /* T-101 terminator */
+};
+
+/*
+ * Check the current patch level on this CPU.
+ *
+ * Returns:
+ *  - true: if update should stop
+ *  - false: otherwise
+ */
+static bool amd_check_current_patch_level(void)
+{
+	u32 lvl, dummy, i;
+	u32 *levels;
+
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
+
+	if (IS_ENABLED(CONFIG_X86_32))
+		levels = (u32 *)__pa_nodebug(&final_levels);
+	else
+		levels = final_levels;
+
+	for (i = 0; levels[i]; i++) {
+		if (lvl == levels[i])
+			return true;
+	}
+	return false;
+}
+
 static bool __init check_loader_disabled_bsp(void)
 {
 	static const char *__dis_opt_str = "dis_ucode_ldr";
@@ -85,8 +122,24 @@ static bool __init check_loader_disabled_bsp(void)
 	bool *res = &dis_ucode_ldr;
 #endif
 
-	if (cmdline_find_option_bool(cmdline, option))
-		*res = true;
+	if (!have_cpuid_p())
+		return *res;
+
+	/*
+	 * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
+	 * completely accurate as xen pv guests don't see that CPUID bit set but
+	 * that's good enough as they don't land on the BSP path anyway.
+	 */
+	if (native_cpuid_ecx(1) & BIT(31))
+		return *res;
+
+	if (x86_cpuid_vendor() == X86_VENDOR_AMD) {
+		if (amd_check_current_patch_level())
+			return *res;
+	}
+
+	if (cmdline_find_option_bool(cmdline, option) <= 0)
+		*res = false;
 
 	return *res;
 }
@@ -112,26 +165,21 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name)
 
 void __init load_ucode_bsp(void)
 {
-	int vendor;
-	unsigned int family;
+	unsigned int cpuid_1_eax;
 
 	if (check_loader_disabled_bsp())
 		return;
 
-	if (!have_cpuid_p())
-		return;
+	cpuid_1_eax = native_cpuid_eax(1);
 
-	vendor = x86_cpuid_vendor();
-	family = x86_cpuid_family();
-
-	switch (vendor) {
+	switch (x86_cpuid_vendor()) {
 	case X86_VENDOR_INTEL:
-		if (family >= 6)
+		if (x86_family(cpuid_1_eax) >= 6)
 			load_ucode_intel_bsp();
 		break;
 	case X86_VENDOR_AMD:
-		if (family >= 0x10)
-			load_ucode_amd_bsp(family);
+		if (x86_family(cpuid_1_eax) >= 0x10)
+			load_ucode_amd_bsp(cpuid_1_eax);
 		break;
 	default:
 		break;
@@ -149,25 +197,21 @@ static bool check_loader_disabled_ap(void)
 
 void load_ucode_ap(void)
 {
-	int vendor, family;
+	unsigned int cpuid_1_eax;
 
 	if (check_loader_disabled_ap())
 		return;
 
-	if (!have_cpuid_p())
-		return;
+	cpuid_1_eax = native_cpuid_eax(1);
 
-	vendor = x86_cpuid_vendor();
-	family = x86_cpuid_family();
-
-	switch (vendor) {
+	switch (x86_cpuid_vendor()) {
 	case X86_VENDOR_INTEL:
-		if (family >= 6)
+		if (x86_family(cpuid_1_eax) >= 6)
 			load_ucode_intel_ap();
 		break;
 	case X86_VENDOR_AMD:
-		if (family >= 0x10)
-			load_ucode_amd_ap();
+		if (x86_family(cpuid_1_eax) >= 0x10)
+			load_ucode_amd_ap(cpuid_1_eax);
 		break;
 	default:
 		break;
@@ -177,21 +221,81 @@ void load_ucode_ap(void)
 static int __init save_microcode_in_initrd(void)
 {
 	struct cpuinfo_x86 *c = &boot_cpu_data;
+	int ret = -EINVAL;
 
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
 		if (c->x86 >= 6)
-			return save_microcode_in_initrd_intel();
+			ret = save_microcode_in_initrd_intel();
 		break;
 	case X86_VENDOR_AMD:
 		if (c->x86 >= 0x10)
-			return save_microcode_in_initrd_amd();
+			return save_microcode_in_initrd_amd(cpuid_eax(1));
 		break;
 	default:
 		break;
 	}
 
-	return -EINVAL;
+	initrd_gone = true;
+
+	return ret;
+}
+
+struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+	unsigned long start = 0;
+	size_t size;
+
+#ifdef CONFIG_X86_32
+	struct boot_params *params;
+
+	if (use_pa)
+		params = (struct boot_params *)__pa_nodebug(&boot_params);
+	else
+		params = &boot_params;
+
+	size = params->hdr.ramdisk_size;
+
+	/*
+	 * Set start only if we have an initrd image. We cannot use initrd_start
+	 * because it is not set that early yet.
+	 */
+	if (size)
+		start = params->hdr.ramdisk_image;
+
+# else /* CONFIG_X86_64 */
+	size  = (unsigned long)boot_params.ext_ramdisk_size << 32;
+	size |= boot_params.hdr.ramdisk_size;
+
+	if (size) {
+		start  = (unsigned long)boot_params.ext_ramdisk_image << 32;
+		start |= boot_params.hdr.ramdisk_image;
+
+		start += PAGE_OFFSET;
+	}
+# endif
+
+	/*
+	 * Fixup the start address: after reserve_initrd() runs, initrd_start
+	 * has the virtual address of the beginning of the initrd. It also
+	 * possibly relocates the ramdisk. In either case, initrd_start contains
+	 * the updated address so use that instead.
+	 *
+	 * initrd_gone is for the hotplug case where we've thrown out initrd
+	 * already.
+	 */
+	if (!use_pa) {
+		if (initrd_gone)
+			return (struct cpio_data){ NULL, 0, "" };
+		if (initrd_start)
+			start = initrd_start;
+	}
+
+	return find_cpio_data(path, (void *)start, size, NULL);
+#else /* !CONFIG_BLK_DEV_INITRD */
+	return (struct cpio_data){ NULL, 0, "" };
+#endif
 }
 
 void reload_early_microcode(void)
@@ -453,16 +557,17 @@ static struct attribute_group mc_attr_group = {
 
 static void microcode_fini_cpu(int cpu)
 {
-	microcode_ops->microcode_fini_cpu(cpu);
+	if (microcode_ops->microcode_fini_cpu)
+		microcode_ops->microcode_fini_cpu(cpu);
 }
 
 static enum ucode_state microcode_resume_cpu(int cpu)
 {
-	pr_debug("CPU%d updated upon resume\n", cpu);
-
 	if (apply_microcode_on_target(cpu))
 		return UCODE_ERROR;
 
+	pr_debug("CPU%d updated upon resume\n", cpu);
+
 	return UCODE_OK;
 }
 
@@ -496,6 +601,9 @@ static enum ucode_state microcode_update_cpu(int cpu)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
+	/* Refresh CPU microcode revision after resume. */
+	collect_cpu_info(cpu);
+
 	if (uci->valid)
 		return microcode_resume_cpu(cpu);
 
@@ -579,12 +687,7 @@ static int mc_cpu_down_prep(unsigned int cpu)
 	/* Suspend is in progress, only remove the interface */
 	sysfs_remove_group(&dev->kobj, &mc_attr_group);
 	pr_debug("CPU%d removed\n", cpu);
-	/*
-	 * When a CPU goes offline, don't free up or invalidate the copy of
-	 * the microcode in kernel memory, so that we can reuse it when the
-	 * CPU comes back online without unnecessarily requesting the userspace
-	 * for it again.
-	 */
+
 	return 0;
 }
 
@@ -649,8 +752,7 @@ int __init microcode_init(void)
 	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
 				  mc_cpu_online, mc_cpu_down_prep);
 
-	pr_info("Microcode Update Driver: v" MICROCODE_VERSION
-		" <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
+	pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);
 
 	return 0;
 
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index cdc0deab00c9..8325d8a09ab0 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -39,125 +39,83 @@
 #include <asm/setup.h>
 #include <asm/msr.h>
 
-/*
- * Temporary microcode blobs pointers storage. We note here during early load
- * the pointers to microcode blobs we've got from whatever storage (detached
- * initrd, builtin). Later on, we put those into final storage
- * mc_saved_data.mc_saved.
- *
- * Important: those are offsets from the beginning of initrd or absolute
- * addresses within the kernel image when built-in.
- */
-static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT];
-
-static struct mc_saved_data {
-	unsigned int num_saved;
-	struct microcode_intel **mc_saved;
-} mc_saved_data;
+static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
 
-/* Microcode blobs within the initrd. 0 if builtin. */
-static struct ucode_blobs {
-	unsigned long start;
-	bool valid;
-} blobs;
+/* Current microcode patch used in early patching on the APs. */
+struct microcode_intel *intel_ucode_patch;
 
-/* Go through saved patches and find the one suitable for the current CPU. */
-static enum ucode_state
-find_microcode_patch(struct microcode_intel **saved,
-		     unsigned int num_saved, struct ucode_cpu_info *uci)
+static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
+					unsigned int s2, unsigned int p2)
 {
-	struct microcode_intel *ucode_ptr, *new_mc = NULL;
-	struct microcode_header_intel *mc_hdr;
-	int new_rev, ret, i;
-
-	new_rev = uci->cpu_sig.rev;
-
-	for (i = 0; i < num_saved; i++) {
-		ucode_ptr = saved[i];
-		mc_hdr	  = (struct microcode_header_intel *)ucode_ptr;
-
-		ret = has_newer_microcode(ucode_ptr,
-					  uci->cpu_sig.sig,
-					  uci->cpu_sig.pf,
-					  new_rev);
-		if (!ret)
-			continue;
-
-		new_rev = mc_hdr->rev;
-		new_mc  = ucode_ptr;
-	}
+	if (s1 != s2)
+		return false;
 
-	if (!new_mc)
-		return UCODE_NFOUND;
+	/* Processor flags are either both 0 ... */
+	if (!p1 && !p2)
+		return true;
 
-	uci->mc = (struct microcode_intel *)new_mc;
-	return UCODE_OK;
+	/* ... or they intersect. */
+	return p1 & p2;
 }
 
-static inline void
-copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs,
-	  unsigned long off, int num_saved)
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int find_matching_signature(void *mc, unsigned int csig, int cpf)
 {
+	struct microcode_header_intel *mc_hdr = mc;
+	struct extended_sigtable *ext_hdr;
+	struct extended_signature *ext_sig;
 	int i;
 
-	for (i = 0; i < num_saved; i++)
-		mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off);
-}
-
-#ifdef CONFIG_X86_32
-static void
-microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs)
-{
-	int i;
-	struct microcode_intel ***mc_saved;
+	if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+		return 1;
 
-	mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved);
+	/* Look for ext. headers: */
+	if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
+		return 0;
 
-	for (i = 0; i < mcs->num_saved; i++) {
-		struct microcode_intel *p;
+	ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+	ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
 
-		p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i);
-		mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
+	for (i = 0; i < ext_hdr->count; i++) {
+		if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+			return 1;
+		ext_sig++;
 	}
+	return 0;
 }
-#endif
 
-static enum ucode_state
-load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
-	       unsigned long offset, struct ucode_cpu_info *uci)
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
 {
-	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
-	unsigned int count = mcs->num_saved;
+	struct microcode_header_intel *mc_hdr = mc;
 
-	if (!mcs->mc_saved) {
-		copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count);
+	if (mc_hdr->rev <= new_rev)
+		return 0;
 
-		return find_microcode_patch(mc_saved_tmp, count, uci);
-	} else {
-#ifdef CONFIG_X86_32
-		microcode_phys(mc_saved_tmp, mcs);
-		return find_microcode_patch(mc_saved_tmp, count, uci);
-#else
-		return find_microcode_patch(mcs->mc_saved, count, uci);
-#endif
-	}
+	return find_matching_signature(mc, csig, cpf);
 }
 
 /*
  * Given CPU signature and a microcode patch, this function finds if the
  * microcode patch has matching family and model with the CPU.
+ *
+ * %true - if there's a match
+ * %false - otherwise
  */
-static enum ucode_state
-matching_model_microcode(struct microcode_header_intel *mc_header,
-			unsigned long sig)
+static bool microcode_matches(struct microcode_header_intel *mc_header,
+			      unsigned long sig)
 {
-	unsigned int fam, model;
-	unsigned int fam_ucode, model_ucode;
-	struct extended_sigtable *ext_header;
 	unsigned long total_size = get_totalsize(mc_header);
 	unsigned long data_size = get_datasize(mc_header);
-	int ext_sigcount, i;
+	struct extended_sigtable *ext_header;
+	unsigned int fam_ucode, model_ucode;
 	struct extended_signature *ext_sig;
+	unsigned int fam, model;
+	int ext_sigcount, i;
 
 	fam   = x86_family(sig);
 	model = x86_model(sig);
@@ -166,11 +124,11 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
 	model_ucode = x86_model(mc_header->sig);
 
 	if (fam == fam_ucode && model == model_ucode)
-		return UCODE_OK;
+		return true;
 
 	/* Look for ext. headers: */
 	if (total_size <= data_size + MC_HEADER_SIZE)
-		return UCODE_NFOUND;
+		return false;
 
 	ext_header   = (void *) mc_header + data_size + MC_HEADER_SIZE;
 	ext_sig      = (void *)ext_header + EXT_HEADER_SIZE;
@@ -181,192 +139,242 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
 		model_ucode = x86_model(ext_sig->sig);
 
 		if (fam == fam_ucode && model == model_ucode)
-			return UCODE_OK;
+			return true;
 
 		ext_sig++;
 	}
-	return UCODE_NFOUND;
+	return false;
 }
 
-static int
-save_microcode(struct mc_saved_data *mcs,
-	       struct microcode_intel **mc_saved_src,
-	       unsigned int num_saved)
+static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
 {
-	int i, j;
-	struct microcode_intel **saved_ptr;
-	int ret;
+	struct ucode_patch *p;
 
-	if (!num_saved)
-		return -EINVAL;
+	p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL);
+	if (!p)
+		return ERR_PTR(-ENOMEM);
 
-	/*
-	 * Copy new microcode data.
-	 */
-	saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL);
-	if (!saved_ptr)
-		return -ENOMEM;
-
-	for (i = 0; i < num_saved; i++) {
-		struct microcode_header_intel *mc_hdr;
-		struct microcode_intel *mc;
-		unsigned long size;
-
-		if (!mc_saved_src[i]) {
-			ret = -EINVAL;
-			goto err;
-		}
+	p->data = kmemdup(data, size, GFP_KERNEL);
+	if (!p->data) {
+		kfree(p);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return p;
+}
+
+static void save_microcode_patch(void *data, unsigned int size)
+{
+	struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+	struct ucode_patch *iter, *tmp, *p;
+	bool prev_found = false;
+	unsigned int sig, pf;
+
+	mc_hdr = (struct microcode_header_intel *)data;
+
+	list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
+		mc_saved_hdr = (struct microcode_header_intel *)iter->data;
+		sig	     = mc_saved_hdr->sig;
+		pf	     = mc_saved_hdr->pf;
+
+		if (find_matching_signature(data, sig, pf)) {
+			prev_found = true;
 
-		mc     = mc_saved_src[i];
-		mc_hdr = &mc->hdr;
-		size   = get_totalsize(mc_hdr);
+			if (mc_hdr->rev <= mc_saved_hdr->rev)
+				continue;
 
-		saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL);
-		if (!saved_ptr[i]) {
-			ret = -ENOMEM;
-			goto err;
+			p = __alloc_microcode_buf(data, size);
+			if (IS_ERR(p))
+				pr_err("Error allocating buffer %p\n", data);
+			else
+				list_replace(&iter->plist, &p->plist);
 		}
 	}
 
 	/*
-	 * Point to newly saved microcode.
+	 * There weren't any previous patches found in the list cache; save the
+	 * newly found.
 	 */
-	mcs->mc_saved  = saved_ptr;
-	mcs->num_saved = num_saved;
-
-	return 0;
-
-err:
-	for (j = 0; j <= i; j++)
-		kfree(saved_ptr[j]);
-	kfree(saved_ptr);
-
-	return ret;
+	if (!prev_found) {
+		p = __alloc_microcode_buf(data, size);
+		if (IS_ERR(p))
+			pr_err("Error allocating buffer for %p\n", data);
+		else
+			list_add_tail(&p->plist, &microcode_cache);
+	}
 }
 
-/*
- * A microcode patch in ucode_ptr is saved into mc_saved
- * - if it has matching signature and newer revision compared to an existing
- *   patch mc_saved.
- * - or if it is a newly discovered microcode patch.
- *
- * The microcode patch should have matching model with CPU.
- *
- * Returns: The updated number @num_saved of saved microcode patches.
- */
-static unsigned int _save_mc(struct microcode_intel **mc_saved,
-			     u8 *ucode_ptr, unsigned int num_saved)
+static int microcode_sanity_check(void *mc, int print_err)
 {
-	struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
-	unsigned int sig, pf;
-	int found = 0, i;
+	unsigned long total_size, data_size, ext_table_size;
+	struct microcode_header_intel *mc_header = mc;
+	struct extended_sigtable *ext_header = NULL;
+	u32 sum, orig_sum, ext_sigcount = 0, i;
+	struct extended_signature *ext_sig;
 
-	mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+	total_size = get_totalsize(mc_header);
+	data_size = get_datasize(mc_header);
 
-	for (i = 0; i < num_saved; i++) {
-		mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
-		sig	     = mc_saved_hdr->sig;
-		pf	     = mc_saved_hdr->pf;
+	if (data_size + MC_HEADER_SIZE > total_size) {
+		if (print_err)
+			pr_err("Error: bad microcode data file size.\n");
+		return -EINVAL;
+	}
 
-		if (!find_matching_signature(ucode_ptr, sig, pf))
-			continue;
+	if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+		if (print_err)
+			pr_err("Error: invalid/unknown microcode update format.\n");
+		return -EINVAL;
+	}
 
-		found = 1;
+	ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+	if (ext_table_size) {
+		u32 ext_table_sum = 0;
+		u32 *ext_tablep;
 
-		if (mc_hdr->rev <= mc_saved_hdr->rev)
-			continue;
+		if ((ext_table_size < EXT_HEADER_SIZE)
+		 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+			if (print_err)
+				pr_err("Error: truncated extended signature table.\n");
+			return -EINVAL;
+		}
+
+		ext_header = mc + MC_HEADER_SIZE + data_size;
+		if (ext_table_size != exttable_size(ext_header)) {
+			if (print_err)
+				pr_err("Error: extended signature table size mismatch.\n");
+			return -EFAULT;
+		}
+
+		ext_sigcount = ext_header->count;
 
 		/*
-		 * Found an older ucode saved earlier. Replace it with
-		 * this newer one.
+		 * Check extended table checksum: the sum of all dwords that
+		 * comprise a valid table must be 0.
 		 */
-		mc_saved[i] = (struct microcode_intel *)ucode_ptr;
-		break;
+		ext_tablep = (u32 *)ext_header;
+
+		i = ext_table_size / sizeof(u32);
+		while (i--)
+			ext_table_sum += ext_tablep[i];
+
+		if (ext_table_sum) {
+			if (print_err)
+				pr_warn("Bad extended signature table checksum, aborting.\n");
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * Calculate the checksum of update data and header. The checksum of
+	 * valid update data and header including the extended signature table
+	 * must be 0.
+	 */
+	orig_sum = 0;
+	i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+	while (i--)
+		orig_sum += ((u32 *)mc)[i];
+
+	if (orig_sum) {
+		if (print_err)
+			pr_err("Bad microcode data checksum, aborting.\n");
+		return -EINVAL;
 	}
 
-	/* Newly detected microcode, save it to memory. */
-	if (i >= num_saved && !found)
-		mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
+	if (!ext_table_size)
+		return 0;
 
-	return num_saved;
+	/*
+	 * Check extended signature checksum: 0 => valid.
+	 */
+	for (i = 0; i < ext_sigcount; i++) {
+		ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+			  EXT_SIGNATURE_SIZE * i;
+
+		sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+		      (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+		if (sum) {
+			if (print_err)
+				pr_err("Bad extended signature checksum, aborting.\n");
+			return -EINVAL;
+		}
+	}
+	return 0;
 }
 
 /*
  * Get microcode matching with BSP's model. Only CPUs with the same model as
  * BSP can stay in the platform.
  */
-static enum ucode_state __init
-get_matching_model_microcode(unsigned long start, void *data, size_t size,
-			     struct mc_saved_data *mcs, unsigned long *mc_ptrs,
-			     struct ucode_cpu_info *uci)
+static struct microcode_intel *
+scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
 {
-	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
 	struct microcode_header_intel *mc_header;
-	unsigned int num_saved = mcs->num_saved;
-	enum ucode_state state = UCODE_OK;
-	unsigned int leftover = size;
-	u8 *ucode_ptr = data;
+	struct microcode_intel *patch = NULL;
 	unsigned int mc_size;
-	int i;
-
-	while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) {
 
-		if (leftover < sizeof(mc_header))
+	while (size) {
+		if (size < sizeof(struct microcode_header_intel))
 			break;
 
-		mc_header = (struct microcode_header_intel *)ucode_ptr;
+		mc_header = (struct microcode_header_intel *)data;
 
 		mc_size = get_totalsize(mc_header);
-		if (!mc_size || mc_size > leftover ||
-			microcode_sanity_check(ucode_ptr, 0) < 0)
+		if (!mc_size ||
+		    mc_size > size ||
+		    microcode_sanity_check(data, 0) < 0)
 			break;
 
-		leftover -= mc_size;
+		size -= mc_size;
 
-		/*
-		 * Since APs with same family and model as the BSP may boot in
-		 * the platform, we need to find and save microcode patches
-		 * with the same family and model as the BSP.
-		 */
-		if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) {
-			ucode_ptr += mc_size;
+		if (!microcode_matches(mc_header, uci->cpu_sig.sig)) {
+			data += mc_size;
 			continue;
 		}
 
-		num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved);
+		if (save) {
+			save_microcode_patch(data, mc_size);
+			goto next;
+		}
 
-		ucode_ptr += mc_size;
-	}
 
-	if (leftover) {
-		state = UCODE_ERROR;
-		return state;
-	}
+		if (!patch) {
+			if (!has_newer_microcode(data,
+						 uci->cpu_sig.sig,
+						 uci->cpu_sig.pf,
+						 uci->cpu_sig.rev))
+				goto next;
 
-	if (!num_saved) {
-		state = UCODE_NFOUND;
-		return state;
-	}
+		} else {
+			struct microcode_header_intel *phdr = &patch->hdr;
+
+			if (!has_newer_microcode(data,
+						 phdr->sig,
+						 phdr->pf,
+						 phdr->rev))
+				goto next;
+		}
 
-	for (i = 0; i < num_saved; i++)
-		mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start;
+		/* We have a newer patch, save it. */
+		patch = data;
 
-	mcs->num_saved = num_saved;
+next:
+		data += mc_size;
+	}
+
+	if (size)
+		return NULL;
 
-	return state;
+	return patch;
 }
 
 static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 {
 	unsigned int val[2];
 	unsigned int family, model;
-	struct cpu_signature csig;
+	struct cpu_signature csig = { 0 };
 	unsigned int eax, ebx, ecx, edx;
 
-	csig.sig = 0;
-	csig.pf = 0;
-	csig.rev = 0;
-
 	memset(uci, 0, sizeof(*uci));
 
 	eax = 0x00000001;
@@ -374,23 +382,16 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 	native_cpuid(&eax, &ebx, &ecx, &edx);
 	csig.sig = eax;
 
-	family = x86_family(csig.sig);
-	model  = x86_model(csig.sig);
+	family = x86_family(eax);
+	model  = x86_model(eax);
 
 	if ((model >= 5) || (family > 6)) {
 		/* get processor flags from MSR 0x17 */
 		native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
 		csig.pf = 1 << ((val[1] >> 18) & 7);
 	}
-	native_wrmsrl(MSR_IA32_UCODE_REV, 0);
 
-	/* As documented in the SDM: Do a CPUID 1 here */
-	sync_core();
-
-	/* get the current revision from MSR 0x8B */
-	native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
-	csig.rev = val[1];
+	csig.rev = intel_get_microcode_revision();
 
 	uci->cpu_sig = csig;
 	uci->valid = 1;
@@ -401,40 +402,41 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 static void show_saved_mc(void)
 {
 #ifdef DEBUG
-	int i, j;
+	int i = 0, j;
 	unsigned int sig, pf, rev, total_size, data_size, date;
 	struct ucode_cpu_info uci;
+	struct ucode_patch *p;
 
-	if (!mc_saved_data.num_saved) {
+	if (list_empty(&microcode_cache)) {
 		pr_debug("no microcode data saved.\n");
 		return;
 	}
-	pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved);
 
 	collect_cpu_info_early(&uci);
 
-	sig = uci.cpu_sig.sig;
-	pf = uci.cpu_sig.pf;
-	rev = uci.cpu_sig.rev;
+	sig	= uci.cpu_sig.sig;
+	pf	= uci.cpu_sig.pf;
+	rev	= uci.cpu_sig.rev;
 	pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
 
-	for (i = 0; i < mc_saved_data.num_saved; i++) {
+	list_for_each_entry(p, &microcode_cache, plist) {
 		struct microcode_header_intel *mc_saved_header;
 		struct extended_sigtable *ext_header;
-		int ext_sigcount;
 		struct extended_signature *ext_sig;
+		int ext_sigcount;
 
-		mc_saved_header = (struct microcode_header_intel *)
-				  mc_saved_data.mc_saved[i];
-		sig = mc_saved_header->sig;
-		pf = mc_saved_header->pf;
-		rev = mc_saved_header->rev;
-		total_size = get_totalsize(mc_saved_header);
-		data_size = get_datasize(mc_saved_header);
-		date = mc_saved_header->date;
+		mc_saved_header = (struct microcode_header_intel *)p->data;
+
+		sig	= mc_saved_header->sig;
+		pf	= mc_saved_header->pf;
+		rev	= mc_saved_header->rev;
+		date	= mc_saved_header->date;
+
+		total_size	= get_totalsize(mc_saved_header);
+		data_size	= get_datasize(mc_saved_header);
 
 		pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n",
-			 i, sig, pf, rev, total_size,
+			 i++, sig, pf, rev, total_size,
 			 date & 0xffff,
 			 date >> 24,
 			 (date >> 16) & 0xff);
@@ -443,7 +445,7 @@ static void show_saved_mc(void)
 		if (total_size <= data_size + MC_HEADER_SIZE)
 			continue;
 
-		ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
+		ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE;
 		ext_sigcount = ext_header->count;
 		ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
 
@@ -456,85 +458,43 @@ static void show_saved_mc(void)
 
 			ext_sig++;
 		}
-
 	}
 #endif
 }
 
 /*
- * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
- * hot added or resumes.
- *
- * Please make sure this mc should be a valid microcode patch before calling
- * this function.
+ * Save this microcode patch. It will be loaded early when a CPU is
+ * hot-added or resumes.
  */
-static void save_mc_for_early(u8 *mc)
+static void save_mc_for_early(u8 *mc, unsigned int size)
 {
 #ifdef CONFIG_HOTPLUG_CPU
 	/* Synchronization during CPU hotplug. */
 	static DEFINE_MUTEX(x86_cpu_microcode_mutex);
 
-	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
-	unsigned int mc_saved_count_init;
-	unsigned int num_saved;
-	struct microcode_intel **mc_saved;
-	int ret, i;
-
 	mutex_lock(&x86_cpu_microcode_mutex);
 
-	mc_saved_count_init = mc_saved_data.num_saved;
-	num_saved = mc_saved_data.num_saved;
-	mc_saved = mc_saved_data.mc_saved;
-
-	if (mc_saved && num_saved)
-		memcpy(mc_saved_tmp, mc_saved,
-		       num_saved * sizeof(struct microcode_intel *));
-	/*
-	 * Save the microcode patch mc in mc_save_tmp structure if it's a newer
-	 * version.
-	 */
-	num_saved = _save_mc(mc_saved_tmp, mc, num_saved);
-
-	/*
-	 * Save the mc_save_tmp in global mc_saved_data.
-	 */
-	ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved);
-	if (ret) {
-		pr_err("Cannot save microcode patch.\n");
-		goto out;
-	}
-
+	save_microcode_patch(mc, size);
 	show_saved_mc();
 
-	/*
-	 * Free old saved microcode data.
-	 */
-	if (mc_saved) {
-		for (i = 0; i < mc_saved_count_init; i++)
-			kfree(mc_saved[i]);
-		kfree(mc_saved);
-	}
-
-out:
 	mutex_unlock(&x86_cpu_microcode_mutex);
 #endif
 }
 
-static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
+static bool load_builtin_intel_microcode(struct cpio_data *cp)
 {
-#ifdef CONFIG_X86_64
-	unsigned int eax = 0x00000001, ebx, ecx = 0, edx;
+	unsigned int eax = 1, ebx, ecx = 0, edx;
 	char name[30];
 
+	if (IS_ENABLED(CONFIG_X86_32))
+		return false;
+
 	native_cpuid(&eax, &ebx, &ecx, &edx);
 
 	sprintf(name, "intel-ucode/%02x-%02x-%02x",
 		      x86_family(eax), x86_model(eax), x86_stepping(eax));
 
 	return get_builtin_firmware(cp, name);
-#else
-	return false;
-#endif
 }
 
 /*
@@ -570,8 +530,7 @@ void show_ucode_info_early(void)
 }
 
 /*
- * At this point, we can not call printk() yet. Keep microcode patch number in
- * mc_saved_data.mc_saved and delay printing microcode info in
+ * At this point, we can not call printk() yet. Delay printing microcode info in
  * show_ucode_info_early() until printk() works.
  */
 static void print_ucode(struct ucode_cpu_info *uci)
@@ -616,7 +575,7 @@ static inline void print_ucode(struct ucode_cpu_info *uci)
 static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
 {
 	struct microcode_intel *mc;
-	unsigned int val[2];
+	u32 rev;
 
 	mc = uci->mc;
 	if (!mc)
@@ -624,21 +583,16 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
 
 	/* write microcode via MSR 0x79 */
 	native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
-	native_wrmsrl(MSR_IA32_UCODE_REV, 0);
-
-	/* As documented in the SDM: Do a CPUID 1 here */
-	sync_core();
 
-	/* get the current revision from MSR 0x8B */
-	native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-	if (val[1] != mc->hdr.rev)
+	rev = intel_get_microcode_revision();
+	if (rev != mc->hdr.rev)
 		return -1;
 
 #ifdef CONFIG_X86_64
 	/* Flush global tlb. This is precaution. */
 	flush_tlb_early();
 #endif
-	uci->cpu_sig.rev = val[1];
+	uci->cpu_sig.rev = rev;
 
 	if (early)
 		print_ucode(uci);
@@ -648,206 +602,133 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
 	return 0;
 }
 
-/*
- * This function converts microcode patch offsets previously stored in
- * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data.
- */
 int __init save_microcode_in_initrd_intel(void)
 {
-	struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
-	unsigned int count = mc_saved_data.num_saved;
-	unsigned long offset = 0;
-	int ret;
+	struct ucode_cpu_info uci;
+	struct cpio_data cp;
+
+	if (!load_builtin_intel_microcode(&cp))
+		cp = find_microcode_in_initrd(ucode_path, false);
 
-	if (!count)
+	if (!(cp.data && cp.size))
 		return 0;
 
-	/*
-	 * We have found a valid initrd but it might've been relocated in the
-	 * meantime so get its updated address.
-	 */
-	if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid)
-		offset = initrd_start;
+	collect_cpu_info_early(&uci);
 
-	copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count);
+	scan_microcode(cp.data, cp.size, &uci, true);
 
-	ret = save_microcode(&mc_saved_data, mc_saved, count);
-	if (ret)
-		pr_err("Cannot save microcode patches from initrd.\n");
-	else
-		show_saved_mc();
+	show_saved_mc();
 
-	return ret;
+	return 0;
 }
 
-static __init enum ucode_state
-__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp)
+/*
+ * @res_patch, output: a pointer to the patch we found.
+ */
+static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
 {
-#ifdef CONFIG_BLK_DEV_INITRD
-	static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
-	char *p = IS_ENABLED(CONFIG_X86_32) ? (char *)__pa_nodebug(ucode_name)
-						    : ucode_name;
-# ifdef CONFIG_X86_32
-	unsigned long start = 0, size;
-	struct boot_params *params;
-
-	params = (struct boot_params *)__pa_nodebug(&boot_params);
-	size   = params->hdr.ramdisk_size;
-
-	/*
-	 * Set start only if we have an initrd image. We cannot use initrd_start
-	 * because it is not set that early yet.
-	 */
-	start = (size ? params->hdr.ramdisk_image : 0);
-
-# else /* CONFIG_X86_64 */
-	unsigned long start = 0, size;
-
-	size  = (u64)boot_params.ext_ramdisk_size << 32;
-	size |= boot_params.hdr.ramdisk_size;
-
-	if (size) {
-		start  = (u64)boot_params.ext_ramdisk_image << 32;
-		start |= boot_params.hdr.ramdisk_image;
+	static const char *path;
+	struct cpio_data cp;
+	bool use_pa;
 
-		start += PAGE_OFFSET;
+	if (IS_ENABLED(CONFIG_X86_32)) {
+		path	  = (const char *)__pa_nodebug(ucode_path);
+		use_pa	  = true;
+	} else {
+		path	  = ucode_path;
+		use_pa	  = false;
 	}
-# endif
-
-	*cd = find_cpio_data(p, (void *)start, size, NULL);
-	if (cd->data) {
-		blbp->start = start;
-		blbp->valid = true;
 
-		return UCODE_OK;
-	} else
-#endif /* CONFIG_BLK_DEV_INITRD */
-		return UCODE_ERROR;
-}
+	/* try built-in microcode first */
+	if (!load_builtin_intel_microcode(&cp))
+		cp = find_microcode_in_initrd(path, use_pa);
 
-static __init enum ucode_state
-scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
-	       struct ucode_cpu_info *uci, struct ucode_blobs *blbp)
-{
-	struct cpio_data cd = { NULL, 0, "" };
-	enum ucode_state ret;
+	if (!(cp.data && cp.size))
+		return NULL;
 
-	/* try built-in microcode first */
-	if (load_builtin_intel_microcode(&cd))
-		/*
-		 * Invalidate blobs as we might've gotten an initrd too,
-		 * supplied by the boot loader, by mistake or simply forgotten
-		 * there. That's fine, we ignore it since we've found builtin
-		 * microcode already.
-		 */
-		blbp->valid = false;
-	else {
-		ret = __scan_microcode_initrd(&cd, blbp);
-		if (ret != UCODE_OK)
-			return ret;
-	}
+	collect_cpu_info_early(uci);
 
-	return get_matching_model_microcode(blbp->start, cd.data, cd.size,
-					    mcs, mc_ptrs, uci);
+	return scan_microcode(cp.data, cp.size, uci, false);
 }
 
-static void __init
-_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
-		      struct ucode_blobs *blbp)
+void __init load_ucode_intel_bsp(void)
 {
+	struct microcode_intel *patch;
 	struct ucode_cpu_info uci;
-	enum ucode_state ret;
-
-	collect_cpu_info_early(&uci);
 
-	ret = scan_microcode(mcs, mc_ptrs, &uci, blbp);
-	if (ret != UCODE_OK)
+	patch = __load_ucode_intel(&uci);
+	if (!patch)
 		return;
 
-	ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci);
-	if (ret != UCODE_OK)
-		return;
+	uci.mc = patch;
 
 	apply_microcode_early(&uci, true);
 }
 
-void __init load_ucode_intel_bsp(void)
+void load_ucode_intel_ap(void)
 {
-	struct ucode_blobs *blobs_p;
-	struct mc_saved_data *mcs;
-	unsigned long *ptrs;
+	struct microcode_intel *patch, **iup;
+	struct ucode_cpu_info uci;
 
-#ifdef CONFIG_X86_32
-	mcs	= (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
-	ptrs	= (unsigned long *)__pa_nodebug(&mc_tmp_ptrs);
-	blobs_p	= (struct ucode_blobs *)__pa_nodebug(&blobs);
-#else
-	mcs	= &mc_saved_data;
-	ptrs	= mc_tmp_ptrs;
-	blobs_p = &blobs;
-#endif
+	if (IS_ENABLED(CONFIG_X86_32))
+		iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch);
+	else
+		iup = &intel_ucode_patch;
+
+reget:
+	if (!*iup) {
+		patch = __load_ucode_intel(&uci);
+		if (!patch)
+			return;
+
+		*iup = patch;
+	}
+
+	uci.mc = *iup;
 
-	_load_ucode_intel_bsp(mcs, ptrs, blobs_p);
+	if (apply_microcode_early(&uci, true)) {
+		/* Mixed-silicon system? Try to refetch the proper patch: */
+		*iup = NULL;
+
+		goto reget;
+	}
 }
 
-void load_ucode_intel_ap(void)
+static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
 {
-	struct ucode_blobs *blobs_p;
-	unsigned long *ptrs, start = 0;
-	struct mc_saved_data *mcs;
-	struct ucode_cpu_info uci;
-	enum ucode_state ret;
-
-#ifdef CONFIG_X86_32
-	mcs	= (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
-	ptrs	= (unsigned long *)__pa_nodebug(mc_tmp_ptrs);
-	blobs_p	= (struct ucode_blobs *)__pa_nodebug(&blobs);
-#else
-	mcs	= &mc_saved_data;
-	ptrs	= mc_tmp_ptrs;
-	blobs_p = &blobs;
-#endif
+	struct microcode_header_intel *phdr;
+	struct ucode_patch *iter, *tmp;
 
-	/*
-	 * If there is no valid ucode previously saved in memory, no need to
-	 * update ucode on this AP.
-	 */
-	if (!mcs->num_saved)
-		return;
+	list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
 
-	if (blobs_p->valid) {
-		start = blobs_p->start;
+		phdr = (struct microcode_header_intel *)iter->data;
 
-		/*
-		 * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles
-		 * physmem mapping too and there we have the initrd.
-		 */
-		start += PAGE_OFFSET - __PAGE_OFFSET_BASE;
-	}
+		if (phdr->rev <= uci->cpu_sig.rev)
+			continue;
 
-	collect_cpu_info_early(&uci);
-	ret = load_microcode(mcs, ptrs, start, &uci);
-	if (ret != UCODE_OK)
-		return;
+		if (!find_matching_signature(phdr,
+					     uci->cpu_sig.sig,
+					     uci->cpu_sig.pf))
+			continue;
 
-	apply_microcode_early(&uci, true);
+		return iter->data;
+	}
+	return NULL;
 }
 
 void reload_ucode_intel(void)
 {
+	struct microcode_intel *p;
 	struct ucode_cpu_info uci;
-	enum ucode_state ret;
-
-	if (!mc_saved_data.num_saved)
-		return;
 
 	collect_cpu_info_early(&uci);
 
-	ret = find_microcode_patch(mc_saved_data.mc_saved,
-				   mc_saved_data.num_saved, &uci);
-	if (ret != UCODE_OK)
+	p = find_patch(&uci);
+	if (!p)
 		return;
 
+	uci.mc = p;
+
 	apply_microcode_early(&uci, false);
 }
 
@@ -879,31 +760,13 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
 	return 0;
 }
 
-/*
- * return 0 - no update found
- * return 1 - found update
- */
-static int get_matching_mc(struct microcode_intel *mc, int cpu)
-{
-	struct cpu_signature cpu_sig;
-	unsigned int csig, cpf, crev;
-
-	collect_cpu_info(cpu, &cpu_sig);
-
-	csig = cpu_sig.sig;
-	cpf = cpu_sig.pf;
-	crev = cpu_sig.rev;
-
-	return has_newer_microcode(mc, csig, cpf, crev);
-}
-
 static int apply_microcode_intel(int cpu)
 {
 	struct microcode_intel *mc;
 	struct ucode_cpu_info *uci;
 	struct cpuinfo_x86 *c;
-	unsigned int val[2];
 	static int prev_rev;
+	u32 rev;
 
 	/* We should bind the task to the CPU */
 	if (WARN_ON(raw_smp_processor_id() != cpu))
@@ -911,46 +774,37 @@ static int apply_microcode_intel(int cpu)
 
 	uci = ucode_cpu_info + cpu;
 	mc = uci->mc;
-	if (!mc)
-		return 0;
-
-	/*
-	 * Microcode on this CPU could be updated earlier. Only apply the
-	 * microcode patch in mc when it is newer than the one on this
-	 * CPU.
-	 */
-	if (!get_matching_mc(mc, cpu))
-		return 0;
+	if (!mc) {
+		/* Look for a newer patch in our cache: */
+		mc = find_patch(uci);
+		if (!mc)
+			return 0;
+	}
 
 	/* write microcode via MSR 0x79 */
 	wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
-	wrmsrl(MSR_IA32_UCODE_REV, 0);
-
-	/* As documented in the SDM: Do a CPUID 1 here */
-	sync_core();
 
-	/* get the current revision from MSR 0x8B */
-	rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+	rev = intel_get_microcode_revision();
 
-	if (val[1] != mc->hdr.rev) {
+	if (rev != mc->hdr.rev) {
 		pr_err("CPU%d update to revision 0x%x failed\n",
 		       cpu, mc->hdr.rev);
 		return -1;
 	}
 
-	if (val[1] != prev_rev) {
+	if (rev != prev_rev) {
 		pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
-			val[1],
+			rev,
 			mc->hdr.date & 0xffff,
 			mc->hdr.date >> 24,
 			(mc->hdr.date >> 16) & 0xff);
-		prev_rev = val[1];
+		prev_rev = rev;
 	}
 
 	c = &cpu_data(cpu);
 
-	uci->cpu_sig.rev = val[1];
-	c->microcode = val[1];
+	uci->cpu_sig.rev = rev;
+	c->microcode = rev;
 
 	return 0;
 }
@@ -962,8 +816,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 	u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
 	int new_rev = uci->cpu_sig.rev;
 	unsigned int leftover = size;
-	enum ucode_state state = UCODE_OK;
-	unsigned int curr_mc_size = 0;
+	unsigned int curr_mc_size = 0, new_mc_size = 0;
 	unsigned int csig, cpf;
 
 	while (leftover) {
@@ -1004,6 +857,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 			vfree(new_mc);
 			new_rev = mc_header.rev;
 			new_mc  = mc;
+			new_mc_size = mc_size;
 			mc = NULL;	/* trigger new vmalloc */
 		}
 
@@ -1015,14 +869,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 
 	if (leftover) {
 		vfree(new_mc);
-		state = UCODE_ERROR;
-		goto out;
+		return UCODE_ERROR;
 	}
 
-	if (!new_mc) {
-		state = UCODE_NFOUND;
-		goto out;
-	}
+	if (!new_mc)
+		return UCODE_NFOUND;
 
 	vfree(uci->mc);
 	uci->mc = (struct microcode_intel *)new_mc;
@@ -1032,12 +883,12 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 	 * permanent memory. So it will be loaded early when a CPU is hot added
 	 * or resumes.
 	 */
-	save_mc_for_early(new_mc);
+	save_mc_for_early(new_mc, new_mc_size);
 
 	pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
 		 cpu, new_rev, uci->cpu_sig.rev);
-out:
-	return state;
+
+	return UCODE_OK;
 }
 
 static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -1081,20 +932,11 @@ request_microcode_user(int cpu, const void __user *buf, size_t size)
 	return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
 }
 
-static void microcode_fini_cpu(int cpu)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	vfree(uci->mc);
-	uci->mc = NULL;
-}
-
 static struct microcode_ops microcode_intel_ops = {
 	.request_microcode_user		  = request_microcode_user,
 	.request_microcode_fw             = request_microcode_fw,
 	.collect_cpu_info                 = collect_cpu_info,
 	.apply_microcode                  = apply_microcode_intel,
-	.microcode_fini_cpu               = microcode_fini_cpu,
 };
 
 struct microcode_ops * __init init_intel_microcode(void)
@@ -1109,4 +951,3 @@ struct microcode_ops * __init init_intel_microcode(void)
 
 	return &microcode_intel_ops;
 }
-
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
deleted file mode 100644
index 406cb6c0d9dd..000000000000
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- *	Intel CPU Microcode Update Driver for Linux
- *
- *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
- *			   H Peter Anvin" <hpa@zytor.com>
- *
- *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
- *
- *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *	Software Developer's Manual
- *	Order Number 253668 or free download from:
- *
- *	http://developer.intel.com/Assets/PDF/manual/253668.pdf
- *
- *	For more information, go to http://www.urbanmyth.org/microcode
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- */
-#include <linux/firmware.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-
-#include <asm/microcode_intel.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-
-static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
-					unsigned int s2, unsigned int p2)
-{
-	if (s1 != s2)
-		return false;
-
-	/* Processor flags are either both 0 ... */
-	if (!p1 && !p2)
-		return true;
-
-	/* ... or they intersect. */
-	return p1 & p2;
-}
-
-int microcode_sanity_check(void *mc, int print_err)
-{
-	unsigned long total_size, data_size, ext_table_size;
-	struct microcode_header_intel *mc_header = mc;
-	struct extended_sigtable *ext_header = NULL;
-	u32 sum, orig_sum, ext_sigcount = 0, i;
-	struct extended_signature *ext_sig;
-
-	total_size = get_totalsize(mc_header);
-	data_size = get_datasize(mc_header);
-
-	if (data_size + MC_HEADER_SIZE > total_size) {
-		if (print_err)
-			pr_err("Error: bad microcode data file size.\n");
-		return -EINVAL;
-	}
-
-	if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
-		if (print_err)
-			pr_err("Error: invalid/unknown microcode update format.\n");
-		return -EINVAL;
-	}
-
-	ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
-	if (ext_table_size) {
-		u32 ext_table_sum = 0;
-		u32 *ext_tablep;
-
-		if ((ext_table_size < EXT_HEADER_SIZE)
-		 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
-			if (print_err)
-				pr_err("Error: truncated extended signature table.\n");
-			return -EINVAL;
-		}
-
-		ext_header = mc + MC_HEADER_SIZE + data_size;
-		if (ext_table_size != exttable_size(ext_header)) {
-			if (print_err)
-				pr_err("Error: extended signature table size mismatch.\n");
-			return -EFAULT;
-		}
-
-		ext_sigcount = ext_header->count;
-
-		/*
-		 * Check extended table checksum: the sum of all dwords that
-		 * comprise a valid table must be 0.
-		 */
-		ext_tablep = (u32 *)ext_header;
-
-		i = ext_table_size / sizeof(u32);
-		while (i--)
-			ext_table_sum += ext_tablep[i];
-
-		if (ext_table_sum) {
-			if (print_err)
-				pr_warn("Bad extended signature table checksum, aborting.\n");
-			return -EINVAL;
-		}
-	}
-
-	/*
-	 * Calculate the checksum of update data and header. The checksum of
-	 * valid update data and header including the extended signature table
-	 * must be 0.
-	 */
-	orig_sum = 0;
-	i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
-	while (i--)
-		orig_sum += ((u32 *)mc)[i];
-
-	if (orig_sum) {
-		if (print_err)
-			pr_err("Bad microcode data checksum, aborting.\n");
-		return -EINVAL;
-	}
-
-	if (!ext_table_size)
-		return 0;
-
-	/*
-	 * Check extended signature checksum: 0 => valid.
-	 */
-	for (i = 0; i < ext_sigcount; i++) {
-		ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
-			  EXT_SIGNATURE_SIZE * i;
-
-		sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
-		      (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
-		if (sum) {
-			if (print_err)
-				pr_err("Bad extended signature checksum, aborting.\n");
-			return -EINVAL;
-		}
-	}
-	return 0;
-}
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int find_matching_signature(void *mc, unsigned int csig, int cpf)
-{
-	struct microcode_header_intel *mc_hdr = mc;
-	struct extended_sigtable *ext_hdr;
-	struct extended_signature *ext_sig;
-	int i;
-
-	if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
-		return 1;
-
-	/* Look for ext. headers: */
-	if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
-		return 0;
-
-	ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
-	ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
-
-	for (i = 0; i < ext_hdr->count; i++) {
-		if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
-			return 1;
-		ext_sig++;
-	}
-	return 0;
-}
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
-{
-	struct microcode_header_intel *mc_hdr = mc;
-
-	if (mc_hdr->rev <= new_rev)
-		return 0;
-
-	return find_matching_signature(mc, csig, cpf);
-}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 8f44c5a50ab8..b5375b9497b3 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -25,12 +25,12 @@
 #include <asm/hyperv.h>
 #include <asm/mshyperv.h>
 #include <asm/desc.h>
-#include <asm/idle.h>
 #include <asm/irq_regs.h>
 #include <asm/i8259.h>
 #include <asm/apic.h>
 #include <asm/timer.h>
 #include <asm/reboot.h>
+#include <asm/nmi.h>
 
 struct ms_hyperv_info ms_hyperv;
 EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -133,33 +133,38 @@ static uint32_t  __init ms_hyperv_platform(void)
 	return 0;
 }
 
-static cycle_t read_hv_clock(struct clocksource *arg)
+static unsigned char hv_get_nmi_reason(void)
 {
-	cycle_t current_tick;
-	/*
-	 * Read the partition counter to get the current tick count. This count
-	 * is set to 0 when the partition is created and is incremented in
-	 * 100 nanosecond units.
-	 */
-	rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
-	return current_tick;
+	return 0;
 }
 
-static struct clocksource hyperv_cs = {
-	.name		= "hyperv_clocksource",
-	.rating		= 400, /* use this when running on Hyperv*/
-	.read		= read_hv_clock,
-	.mask		= CLOCKSOURCE_MASK(64),
-	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static unsigned char hv_get_nmi_reason(void)
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
+ * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle
+ * unknown NMI on the first CPU which gets it.
+ */
+static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
 {
-	return 0;
+	static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+
+	if (!unknown_nmi_panic)
+		return NMI_DONE;
+
+	if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1)
+		return NMI_HANDLED;
+
+	return NMI_DONE;
 }
+#endif
 
 static void __init ms_hyperv_init_platform(void)
 {
+	int hv_host_info_eax;
+	int hv_host_info_ebx;
+	int hv_host_info_ecx;
+	int hv_host_info_edx;
+
 	/*
 	 * Extract the features and hints
 	 */
@@ -170,6 +175,21 @@ static void __init ms_hyperv_init_platform(void)
 	pr_info("HyperV: features 0x%x, hints 0x%x\n",
 		ms_hyperv.features, ms_hyperv.hints);
 
+	/*
+	 * Extract host information.
+	 */
+	if (cpuid_eax(HVCPUID_VENDOR_MAXFUNCTION) >= HVCPUID_VERSION) {
+		hv_host_info_eax = cpuid_eax(HVCPUID_VERSION);
+		hv_host_info_ebx = cpuid_ebx(HVCPUID_VERSION);
+		hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION);
+		hv_host_info_edx = cpuid_edx(HVCPUID_VERSION);
+
+		pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n",
+			hv_host_info_eax, hv_host_info_ebx >> 16,
+			hv_host_info_ebx & 0xFFFF, hv_host_info_ecx,
+			hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
+	}
+
 #ifdef CONFIG_X86_LOCAL_APIC
 	if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
 		/*
@@ -183,10 +203,10 @@ static void __init ms_hyperv_init_platform(void)
 		pr_info("HyperV: LAPIC Timer Frequency: %#x\n",
 			lapic_timer_frequency);
 	}
-#endif
 
-	if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
-		clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
+	register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
+			     "hv_nmi_unknown");
+#endif
 
 #ifdef CONFIG_X86_IO_APIC
 	no_timer_check = 1;
@@ -204,6 +224,13 @@ static void __init ms_hyperv_init_platform(void)
 	 */
 	if (efi_enabled(EFI_BOOT))
 		x86_platform.get_nmi_reason = hv_get_nmi_reason;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+	/*
+	 * Setup the hook to get control post apic initialization.
+	 */
+	x86_platform.apic_post_init = hyperv_init;
+#endif
 }
 
 const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 1db8dc490b66..d9794060fe22 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -17,11 +17,20 @@ struct cpuid_bit {
 	u32 sub_leaf;
 };
 
-enum cpuid_regs {
-	CR_EAX = 0,
-	CR_ECX,
-	CR_EDX,
-	CR_EBX
+/* Please keep the leaf sorted by cpuid_bit.level for faster search. */
+static const struct cpuid_bit cpuid_bits[] = {
+	{ X86_FEATURE_APERFMPERF,       CPUID_ECX,  0, 0x00000006, 0 },
+	{ X86_FEATURE_EPB,		CPUID_ECX,  3, 0x00000006, 0 },
+	{ X86_FEATURE_INTEL_PT,		CPUID_EBX, 25, 0x00000007, 0 },
+	{ X86_FEATURE_AVX512_4VNNIW,    CPUID_EDX,  2, 0x00000007, 0 },
+	{ X86_FEATURE_AVX512_4FMAPS,    CPUID_EDX,  3, 0x00000007, 0 },
+	{ X86_FEATURE_CAT_L3,		CPUID_EBX,  1, 0x00000010, 0 },
+	{ X86_FEATURE_CAT_L2,		CPUID_EBX,  2, 0x00000010, 0 },
+	{ X86_FEATURE_CDP_L3,		CPUID_ECX,  2, 0x00000010, 1 },
+	{ X86_FEATURE_HW_PSTATE,	CPUID_EDX,  7, 0x80000007, 0 },
+	{ X86_FEATURE_CPB,		CPUID_EDX,  9, 0x80000007, 0 },
+	{ X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
+	{ 0, 0, 0, 0, 0 }
 };
 
 void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
@@ -30,18 +39,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	u32 regs[4];
 	const struct cpuid_bit *cb;
 
-	static const struct cpuid_bit cpuid_bits[] = {
-		{ X86_FEATURE_INTEL_PT,		CR_EBX,25, 0x00000007, 0 },
-		{ X86_FEATURE_AVX512_4VNNIW,	CR_EDX, 2, 0x00000007, 0 },
-		{ X86_FEATURE_AVX512_4FMAPS,	CR_EDX, 3, 0x00000007, 0 },
-		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006, 0 },
-		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },
-		{ X86_FEATURE_HW_PSTATE,	CR_EDX, 7, 0x80000007, 0 },
-		{ X86_FEATURE_CPB,		CR_EDX, 9, 0x80000007, 0 },
-		{ X86_FEATURE_PROC_FEEDBACK,	CR_EDX,11, 0x80000007, 0 },
-		{ 0, 0, 0, 0, 0 }
-	};
-
 	for (cb = cpuid_bits; cb->feature; cb++) {
 
 		/* Verify that the level is valid */
@@ -50,10 +47,35 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		    max_level > (cb->level | 0xffff))
 			continue;
 
-		cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
-			    &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
+		cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX],
+			    &regs[CPUID_EBX], &regs[CPUID_ECX],
+			    &regs[CPUID_EDX]);
 
 		if (regs[cb->reg] & (1 << cb->bit))
 			set_cpu_cap(c, cb->feature);
 	}
 }
+
+u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf,
+			     enum cpuid_regs_idx reg)
+{
+	const struct cpuid_bit *cb;
+	u32 cpuid_val = 0;
+
+	for (cb = cpuid_bits; cb->feature; cb++) {
+
+		if (level > cb->level)
+			continue;
+
+		if (level < cb->level)
+			break;
+
+		if (reg == cb->reg && sub_leaf == cb->sub_leaf) {
+			if (cpu_has(&boot_cpu_data, cb->feature))
+				cpuid_val |= BIT(cb->bit);
+		}
+	}
+
+	return cpuid_val;
+}
+EXPORT_SYMBOL_GPL(get_scattered_cpuid_leaf);
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 34178564be2a..d77d07ab310b 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,4 +1,6 @@
 #include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/mm.h>
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 5130985b758b..22403a28caf5 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,12 +24,16 @@
 #include <linux/dmi.h>
 #include <linux/init.h>
 #include <linux/export.h>
+#include <linux/clocksource.h>
 #include <asm/div64.h>
 #include <asm/x86_init.h>
 #include <asm/hypervisor.h>
 #include <asm/timer.h>
 #include <asm/apic.h>
 
+#undef pr_fmt
+#define pr_fmt(fmt)	"vmware: " fmt
+
 #define CPUID_VMWARE_INFO_LEAF	0x40000000
 #define VMWARE_HYPERVISOR_MAGIC	0x564D5868
 #define VMWARE_HYPERVISOR_PORT	0x5658
@@ -48,6 +52,8 @@
 			"2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) :	\
 			"memory");
 
+static unsigned long vmware_tsc_khz __ro_after_init;
+
 static inline int __vmware_platform(void)
 {
 	uint32_t eax, ebx, ecx, edx;
@@ -57,35 +63,80 @@ static inline int __vmware_platform(void)
 
 static unsigned long vmware_get_tsc_khz(void)
 {
-	uint64_t tsc_hz, lpj;
-	uint32_t eax, ebx, ecx, edx;
+	return vmware_tsc_khz;
+}
 
-	VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+#ifdef CONFIG_PARAVIRT
+static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
+static int vmw_sched_clock __initdata = 1;
 
-	tsc_hz = eax | (((uint64_t)ebx) << 32);
-	do_div(tsc_hz, 1000);
-	BUG_ON(tsc_hz >> 32);
-	pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
-			 (unsigned long) tsc_hz / 1000,
-			 (unsigned long) tsc_hz % 1000);
-
-	if (!preset_lpj) {
-		lpj = ((u64)tsc_hz * 1000);
-		do_div(lpj, HZ);
-		preset_lpj = lpj;
-	}
+static __init int setup_vmw_sched_clock(char *s)
+{
+	vmw_sched_clock = 0;
+	return 0;
+}
+early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
 
-	return tsc_hz;
+static unsigned long long vmware_sched_clock(void)
+{
+	unsigned long long ns;
+
+	ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
+			     vmware_cyc2ns.cyc2ns_shift);
+	ns -= vmware_cyc2ns.cyc2ns_offset;
+	return ns;
 }
 
+static void __init vmware_sched_clock_setup(void)
+{
+	struct cyc2ns_data *d = &vmware_cyc2ns;
+	unsigned long long tsc_now = rdtsc();
+
+	clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift,
+			       vmware_tsc_khz, NSEC_PER_MSEC, 0);
+	d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
+					   d->cyc2ns_shift);
+
+	pv_time_ops.sched_clock = vmware_sched_clock;
+	pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset);
+}
+
+static void __init vmware_paravirt_ops_setup(void)
+{
+	pv_info.name = "VMware hypervisor";
+	pv_cpu_ops.io_delay = paravirt_nop;
+
+	if (vmware_tsc_khz && vmw_sched_clock)
+		vmware_sched_clock_setup();
+}
+#else
+#define vmware_paravirt_ops_setup() do {} while (0)
+#endif
+
 static void __init vmware_platform_setup(void)
 {
 	uint32_t eax, ebx, ecx, edx;
+	uint64_t lpj, tsc_khz;
 
 	VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
 
 	if (ebx != UINT_MAX) {
+		lpj = tsc_khz = eax | (((uint64_t)ebx) << 32);
+		do_div(tsc_khz, 1000);
+		WARN_ON(tsc_khz >> 32);
+		pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
+			(unsigned long) tsc_khz / 1000,
+			(unsigned long) tsc_khz % 1000);
+
+		if (!preset_lpj) {
+			do_div(lpj, HZ);
+			preset_lpj = lpj;
+		}
+
+		vmware_tsc_khz = tsc_khz;
 		x86_platform.calibrate_tsc = vmware_get_tsc_khz;
+		x86_platform.calibrate_cpu = vmware_get_tsc_khz;
+
 #ifdef CONFIG_X86_LOCAL_APIC
 		/* Skip lapic calibration since we know the bus frequency. */
 		lapic_timer_frequency = ecx / HZ;
@@ -96,6 +147,8 @@ static void __init vmware_platform_setup(void)
 		pr_warn("Failed to get TSC freq from the hypervisor\n");
 	}
 
+	vmware_paravirt_ops_setup();
+
 #ifdef CONFIG_X86_IO_APIC
 	no_timer_check = 1;
 #endif
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2836de390f95..0931a105ffe1 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -45,10 +45,7 @@
 #include <asm/msr.h>
 
 static struct class *cpuid_class;
-
-struct cpuid_regs {
-	u32 eax, ebx, ecx, edx;
-};
+static enum cpuhp_state cpuhp_cpuid_state;
 
 static void cpuid_smp_cpuid(void *cmd_block)
 {
@@ -115,7 +112,7 @@ static const struct file_operations cpuid_fops = {
 	.open = cpuid_open,
 };
 
-static int cpuid_device_create(int cpu)
+static int cpuid_device_create(unsigned int cpu)
 {
 	struct device *dev;
 
@@ -124,35 +121,12 @@ static int cpuid_device_create(int cpu)
 	return PTR_ERR_OR_ZERO(dev);
 }
 
-static void cpuid_device_destroy(int cpu)
+static int cpuid_device_destroy(unsigned int cpu)
 {
 	device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+	return 0;
 }
 
-static int cpuid_class_cpu_callback(struct notifier_block *nfb,
-				    unsigned long action, void *hcpu)
-{
-	unsigned int cpu = (unsigned long)hcpu;
-	int err = 0;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-		err = cpuid_device_create(cpu);
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-		cpuid_device_destroy(cpu);
-		break;
-	}
-	return notifier_from_errno(err);
-}
-
-static struct notifier_block cpuid_class_cpu_notifier =
-{
-	.notifier_call = cpuid_class_cpu_callback,
-};
-
 static char *cpuid_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
@@ -160,15 +134,13 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode)
 
 static int __init cpuid_init(void)
 {
-	int i, err = 0;
-	i = 0;
+	int err;
 
 	if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
 			      "cpu/cpuid", &cpuid_fops)) {
 		printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
 		       CPUID_MAJOR);
-		err = -EBUSY;
-		goto out;
+		return -EBUSY;
 	}
 	cpuid_class = class_create(THIS_MODULE, "cpuid");
 	if (IS_ERR(cpuid_class)) {
@@ -177,45 +149,28 @@ static int __init cpuid_init(void)
 	}
 	cpuid_class->devnode = cpuid_devnode;
 
-	cpu_notifier_register_begin();
-	for_each_online_cpu(i) {
-		err = cpuid_device_create(i);
-		if (err != 0)
-			goto out_class;
-	}
-	__register_hotcpu_notifier(&cpuid_class_cpu_notifier);
-	cpu_notifier_register_done();
+	err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online",
+				cpuid_device_create, cpuid_device_destroy);
+	if (err < 0)
+		goto out_class;
 
-	err = 0;
-	goto out;
+	cpuhp_cpuid_state = err;
+	return 0;
 
 out_class:
-	i = 0;
-	for_each_online_cpu(i) {
-		cpuid_device_destroy(i);
-	}
-	cpu_notifier_register_done();
 	class_destroy(cpuid_class);
 out_chrdev:
 	__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
-out:
 	return err;
 }
+module_init(cpuid_init);
 
 static void __exit cpuid_exit(void)
 {
-	int cpu = 0;
-
-	cpu_notifier_register_begin();
-	for_each_online_cpu(cpu)
-		cpuid_device_destroy(cpu);
+	cpuhp_remove_state(cpuhp_cpuid_state);
 	class_destroy(cpuid_class);
 	__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
-	__unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
-	cpu_notifier_register_done();
 }
-
-module_init(cpuid_init);
 module_exit(cpuid_exit);
 
 MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 650830e39e3a..3741461c63a0 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -631,9 +631,9 @@ static int determine_backup_region(u64 start, u64 end, void *arg)
 
 int crash_load_segments(struct kimage *image)
 {
-	unsigned long src_start, src_sz, elf_sz;
-	void *elf_addr;
 	int ret;
+	struct kexec_buf kbuf = { .image = image, .buf_min = 0,
+				  .buf_max = ULONG_MAX, .top_down = false };
 
 	/*
 	 * Determine and load a segment for backup area. First 640K RAM
@@ -647,43 +647,44 @@ int crash_load_segments(struct kimage *image)
 	if (ret < 0)
 		return ret;
 
-	src_start = image->arch.backup_src_start;
-	src_sz = image->arch.backup_src_sz;
-
 	/* Add backup segment. */
-	if (src_sz) {
+	if (image->arch.backup_src_sz) {
+		kbuf.buffer = &crash_zero_bytes;
+		kbuf.bufsz = sizeof(crash_zero_bytes);
+		kbuf.memsz = image->arch.backup_src_sz;
+		kbuf.buf_align = PAGE_SIZE;
 		/*
 		 * Ideally there is no source for backup segment. This is
 		 * copied in purgatory after crash. Just add a zero filled
 		 * segment for now to make sure checksum logic works fine.
 		 */
-		ret = kexec_add_buffer(image, (char *)&crash_zero_bytes,
-				       sizeof(crash_zero_bytes), src_sz,
-				       PAGE_SIZE, 0, -1, 0,
-				       &image->arch.backup_load_addr);
+		ret = kexec_add_buffer(&kbuf);
 		if (ret)
 			return ret;
+		image->arch.backup_load_addr = kbuf.mem;
 		pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
-			 image->arch.backup_load_addr, src_start, src_sz);
+			 image->arch.backup_load_addr,
+			 image->arch.backup_src_start, kbuf.memsz);
 	}
 
 	/* Prepare elf headers and add a segment */
-	ret = prepare_elf_headers(image, &elf_addr, &elf_sz);
+	ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz);
 	if (ret)
 		return ret;
 
-	image->arch.elf_headers = elf_addr;
-	image->arch.elf_headers_sz = elf_sz;
+	image->arch.elf_headers = kbuf.buffer;
+	image->arch.elf_headers_sz = kbuf.bufsz;
 
-	ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz,
-			ELF_CORE_HEADER_ALIGN, 0, -1, 0,
-			&image->arch.elf_load_addr);
+	kbuf.memsz = kbuf.bufsz;
+	kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+	ret = kexec_add_buffer(&kbuf);
 	if (ret) {
 		vfree((void *)image->arch.elf_headers);
 		return ret;
 	}
+	image->arch.elf_load_addr = kbuf.mem;
 	pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-		 image->arch.elf_load_addr, elf_sz, elf_sz);
+		 image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz);
 
 	return ret;
 }
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 11891ca7b716..538fedea9b3f 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -10,7 +10,7 @@
 #include <linux/highmem.h>
 #include <linux/crash_dump.h>
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 static void *kdump_buf_page;
 
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index f6dfd9334b67..f9c324e08d85 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -1,9 +1,10 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
 #include <linux/init_task.h>
 #include <linux/fs.h>
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 85f854b98a9d..09d4ac0d2661 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -10,6 +10,8 @@
 #include <linux/kdebug.h>
 #include <linux/module.h>
 #include <linux/ptrace.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/ftrace.h>
 #include <linux/kexec.h>
 #include <linux/bug.h>
@@ -22,7 +24,6 @@
 int panic_on_unrecovered_nmi;
 int panic_on_io_nmi;
 unsigned int code_bytes = 64;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
 static int die_counter;
 
 bool in_task_stack(unsigned long *stack, struct task_struct *task,
@@ -46,14 +47,7 @@ static void printk_stack_address(unsigned long address, int reliable,
 				 char *log_lvl)
 {
 	touch_nmi_watchdog();
-	printk("%s [<%p>] %s%pB\n",
-		log_lvl, (void *)address, reliable ? "" : "? ",
-		(void *)address);
-}
-
-void printk_address(unsigned long address)
-{
-	pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address);
+	printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
 }
 
 void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
@@ -67,6 +61,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	printk("%sCall Trace:\n", log_lvl);
 
 	unwind_start(&state, task, regs, stack);
+	stack = stack ? : get_stack_pointer(task, regs);
 
 	/*
 	 * Iterate through the stacks, starting with the current stack pointer.
@@ -82,8 +77,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	 * - softirq stack
 	 * - hardirq stack
 	 */
-	for (; stack; stack = stack_info.next_sp) {
-		const char *str_begin, *str_end;
+	for (regs = NULL; stack; stack = stack_info.next_sp) {
+		const char *stack_name;
 
 		/*
 		 * If we overflowed the task stack into a guard page, jump back
@@ -95,9 +90,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		if (get_stack_info(stack, task, &stack_info, &visit_mask))
 			break;
 
-		stack_type_str(stack_info.type, &str_begin, &str_end);
-		if (str_begin)
-			printk("%s <%s> ", log_lvl, str_begin);
+		stack_name = stack_type_name(stack_info.type);
+		if (stack_name)
+			printk("%s <%s>\n", log_lvl, stack_name);
 
 		/*
 		 * Scan the stack, printing any text addresses we find.  At the
@@ -119,6 +114,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			if (!__kernel_text_address(addr))
 				continue;
 
+			/*
+			 * Don't print regs->ip again if it was already printed
+			 * by __show_regs() below.
+			 */
+			if (regs && stack == &regs->ip) {
+				unwind_next_frame(&state);
+				continue;
+			}
+
 			if (stack == ret_addr_p)
 				reliable = 1;
 
@@ -146,10 +150,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 			 * of the addresses will just be printed as unreliable.
 			 */
 			unwind_next_frame(&state);
+
+			/* if the frame has entry regs, print them */
+			regs = unwind_get_entry_regs(&state);
+			if (regs)
+				__show_regs(regs, 0);
 		}
 
-		if (str_end)
-			printk("%s <%s> ", log_lvl, str_end);
+		if (stack_name)
+			printk("%s </%s>\n", log_lvl, stack_name);
 	}
 }
 
@@ -164,12 +173,12 @@ void show_stack(struct task_struct *task, unsigned long *sp)
 	if (!sp && task == current)
 		sp = get_stack_pointer(current, NULL);
 
-	show_stack_log_lvl(task, NULL, sp, "");
+	show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT);
 }
 
 void show_stack_regs(struct pt_regs *regs)
 {
-	show_stack_log_lvl(current, regs, NULL, "");
+	show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
 }
 
 static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -261,14 +270,11 @@ int __die(const char *str, struct pt_regs *regs, long err)
 		sp = kernel_stack_pointer(regs);
 		savesegment(ss, ss);
 	}
-	printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
-	print_symbol("%s", regs->ip);
-	printk(" SS:ESP %04x:%08lx\n", ss, sp);
+	printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n",
+	       (void *)regs->ip, ss, sp);
 #else
 	/* Executive summary in case the oops scrolled away */
-	printk(KERN_ALERT "RIP ");
-	printk_address(regs->ip);
-	printk(" RSP <%016lx>\n", regs->sp);
+	printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp);
 #endif
 	return 0;
 }
@@ -291,22 +297,6 @@ void die(const char *str, struct pt_regs *regs, long err)
 	oops_end(flags, regs, sig);
 }
 
-static int __init kstack_setup(char *s)
-{
-	ssize_t ret;
-	unsigned long val;
-
-	if (!s)
-		return -EINVAL;
-
-	ret = kstrtoul(s, 0, &val);
-	if (ret)
-		return ret;
-	kstack_depth_to_print = val;
-	return 0;
-}
-early_param("kstack", kstack_setup);
-
 static int __init code_bytes_setup(char *s)
 {
 	ssize_t ret;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 06eb322b5f9f..b0b3a3df7c20 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -2,6 +2,7 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
  */
+#include <linux/sched/debug.h>
 #include <linux/kallsyms.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
@@ -16,18 +17,15 @@
 
 #include <asm/stacktrace.h>
 
-void stack_type_str(enum stack_type type, const char **begin, const char **end)
+const char *stack_type_name(enum stack_type type)
 {
-	switch (type) {
-	case STACK_TYPE_IRQ:
-	case STACK_TYPE_SOFTIRQ:
-		*begin = "IRQ";
-		*end   = "EOI";
-		break;
-	default:
-		*begin = NULL;
-		*end   = NULL;
-	}
+	if (type == STACK_TYPE_IRQ)
+		return "IRQ";
+
+	if (type == STACK_TYPE_SOFTIRQ)
+		return "SOFTIRQ";
+
+	return NULL;
 }
 
 static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
@@ -109,8 +107,10 @@ recursion_check:
 	 * just break out and report an unknown stack type.
 	 */
 	if (visit_mask) {
-		if (*visit_mask & (1UL << info->type))
+		if (*visit_mask & (1UL << info->type)) {
+			printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
 			goto unknown;
+		}
 		*visit_mask |= 1UL << info->type;
 	}
 
@@ -121,36 +121,6 @@ unknown:
 	return -EINVAL;
 }
 
-void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-			unsigned long *sp, char *log_lvl)
-{
-	unsigned long *stack;
-	int i;
-
-	if (!try_get_task_stack(task))
-		return;
-
-	sp = sp ? : get_stack_pointer(task, regs);
-
-	stack = sp;
-	for (i = 0; i < kstack_depth_to_print; i++) {
-		if (kstack_end(stack))
-			break;
-		if ((i % STACKSLOTS_PER_LINE) == 0) {
-			if (i != 0)
-				pr_cont("\n");
-			printk("%s %08lx", log_lvl, *stack++);
-		} else
-			pr_cont(" %08lx", *stack++);
-		touch_nmi_watchdog();
-	}
-	pr_cont("\n");
-	show_trace_log_lvl(task, regs, sp, log_lvl);
-
-	put_task_stack(task);
-}
-
-
 void show_regs(struct pt_regs *regs)
 {
 	int i;
@@ -168,8 +138,7 @@ void show_regs(struct pt_regs *regs)
 		unsigned char c;
 		u8 *ip;
 
-		pr_emerg("Stack:\n");
-		show_stack_log_lvl(current, regs, NULL, KERN_EMERG);
+		show_trace_log_lvl(current, regs, NULL, KERN_EMERG);
 
 		pr_emerg("Code:");
 
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 36cf1a498227..a8b117e93b46 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -2,6 +2,7 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
  */
+#include <linux/sched/debug.h>
 #include <linux/kallsyms.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
@@ -28,23 +29,17 @@ static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = {
 	[DEBUG_STACK - 1]			= DEBUG_STKSZ
 };
 
-void stack_type_str(enum stack_type type, const char **begin, const char **end)
+const char *stack_type_name(enum stack_type type)
 {
 	BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
 
-	switch (type) {
-	case STACK_TYPE_IRQ:
-		*begin = "IRQ";
-		*end   = "EOI";
-		break;
-	case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST:
-		*begin = exception_stack_names[type - STACK_TYPE_EXCEPTION];
-		*end   = "EOE";
-		break;
-	default:
-		*begin = NULL;
-		*end   = NULL;
-	}
+	if (type == STACK_TYPE_IRQ)
+		return "IRQ";
+
+	if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
+		return exception_stack_names[type - STACK_TYPE_EXCEPTION];
+
+	return NULL;
 }
 
 static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
@@ -128,8 +123,10 @@ recursion_check:
 	 * just break out and report an unknown stack type.
 	 */
 	if (visit_mask) {
-		if (*visit_mask & (1UL << info->type))
+		if (*visit_mask & (1UL << info->type)) {
+			printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
 			goto unknown;
+		}
 		*visit_mask |= 1UL << info->type;
 	}
 
@@ -140,56 +137,6 @@ unknown:
 	return -EINVAL;
 }
 
-void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-			unsigned long *sp, char *log_lvl)
-{
-	unsigned long *irq_stack_end;
-	unsigned long *irq_stack;
-	unsigned long *stack;
-	int i;
-
-	if (!try_get_task_stack(task))
-		return;
-
-	irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr);
-	irq_stack     = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long));
-
-	sp = sp ? : get_stack_pointer(task, regs);
-
-	stack = sp;
-	for (i = 0; i < kstack_depth_to_print; i++) {
-		unsigned long word;
-
-		if (stack >= irq_stack && stack <= irq_stack_end) {
-			if (stack == irq_stack_end) {
-				stack = (unsigned long *) (irq_stack_end[-1]);
-				pr_cont(" <EOI> ");
-			}
-		} else {
-		if (kstack_end(stack))
-			break;
-		}
-
-		if (probe_kernel_address(stack, word))
-			break;
-
-		if ((i % STACKSLOTS_PER_LINE) == 0) {
-			if (i != 0)
-				pr_cont("\n");
-			printk("%s %016lx", log_lvl, word);
-		} else
-			pr_cont(" %016lx", word);
-
-		stack++;
-		touch_nmi_watchdog();
-	}
-
-	pr_cont("\n");
-	show_trace_log_lvl(task, regs, sp, log_lvl);
-
-	put_task_stack(task);
-}
-
 void show_regs(struct pt_regs *regs)
 {
 	int i;
@@ -207,8 +154,7 @@ void show_regs(struct pt_regs *regs)
 		unsigned char c;
 		u8 *ip;
 
-		printk(KERN_DEFAULT "Stack:\n");
-		show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT);
+		show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
 
 		printk(KERN_DEFAULT "Code: ");
 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 90e8dde3ec26..b2bbad6ebe4d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -580,24 +580,19 @@ static void __init update_e820_saved(void)
 }
 #define MAX_GAP_END 0x100000000ull
 /*
- * Search for a gap in the e820 memory space from start_addr to end_addr.
+ * Search for a gap in the e820 memory space from 0 to MAX_GAP_END.
  */
-__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
-		unsigned long start_addr, unsigned long long end_addr)
+static int __init e820_search_gap(unsigned long *gapstart,
+		unsigned long *gapsize)
 {
-	unsigned long long last;
+	unsigned long long last = MAX_GAP_END;
 	int i = e820->nr_map;
 	int found = 0;
 
-	last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
-
 	while (--i >= 0) {
 		unsigned long long start = e820->map[i].addr;
 		unsigned long long end = start + e820->map[i].size;
 
-		if (end < start_addr)
-			continue;
-
 		/*
 		 * Since "last" is at most 4GB, we know we'll
 		 * fit in 32 bits if this condition is true
@@ -628,18 +623,19 @@ __init void e820_setup_gap(void)
 	unsigned long gapstart, gapsize;
 	int found;
 
-	gapstart = 0x10000000;
 	gapsize = 0x400000;
-	found  = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
+	found  = e820_search_gap(&gapstart, &gapsize);
 
-#ifdef CONFIG_X86_64
 	if (!found) {
+#ifdef CONFIG_X86_64
 		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
 		printk(KERN_ERR
 	"e820: cannot find a gap in the 32bit address range\n"
 	"e820: PCI devices with unassigned 32bit BARs may break!\n");
-	}
+#else
+		gapstart = 0x10000000;
 #endif
+	}
 
 	/*
 	 * e820_reserve_resources_late protect stolen RAM already
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
index aad34aafc0e0..d913047f832c 100644
--- a/arch/x86/kernel/fpu/bugs.c
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -23,17 +23,12 @@ static double __initdata y = 3145727.0;
  */
 void __init fpu__init_check_bugs(void)
 {
-	u32 cr0_saved;
 	s32 fdiv_bug;
 
 	/* kernel_fpu_begin/end() relies on patched alternative instructions. */
 	if (!boot_cpu_has(X86_FEATURE_FPU))
 		return;
 
-	/* We might have CR0::TS set already, clear it: */
-	cr0_saved = read_cr0();
-	write_cr0(cr0_saved & ~X86_CR0_TS);
-
 	kernel_fpu_begin();
 
 	/*
@@ -56,8 +51,6 @@ void __init fpu__init_check_bugs(void)
 
 	kernel_fpu_end();
 
-	write_cr0(cr0_saved);
-
 	if (fdiv_bug) {
 		set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
 		pr_warn("Hmm, FPU with FDIV bug\n");
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index ebb4e95fbd74..e1114f070c2d 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -58,27 +58,9 @@ static bool kernel_fpu_disabled(void)
 	return this_cpu_read(in_kernel_fpu);
 }
 
-/*
- * Were we in an interrupt that interrupted kernel mode?
- *
- * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that
- * pair does nothing at all: the thread must not have fpu (so
- * that we don't try to save the FPU state), and TS must
- * be set (so that the clts/stts pair does nothing that is
- * visible in the interrupted kernel thread).
- *
- * Except for the eagerfpu case when we return true; in the likely case
- * the thread has FPU but we are not going to set/clear TS.
- */
 static bool interrupted_kernel_fpu_idle(void)
 {
-	if (kernel_fpu_disabled())
-		return false;
-
-	if (use_eager_fpu())
-		return true;
-
-	return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS);
+	return !kernel_fpu_disabled();
 }
 
 /*
@@ -125,8 +107,7 @@ void __kernel_fpu_begin(void)
 		 */
 		copy_fpregs_to_fpstate(fpu);
 	} else {
-		this_cpu_write(fpu_fpregs_owner_ctx, NULL);
-		__fpregs_activate_hw();
+		__cpu_invalidate_fpregs_state();
 	}
 }
 EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -137,8 +118,6 @@ void __kernel_fpu_end(void)
 
 	if (fpu->fpregs_active)
 		copy_kernel_to_fpregs(&fpu->state);
-	else
-		__fpregs_deactivate_hw();
 
 	kernel_fpu_enable();
 }
@@ -159,35 +138,6 @@ void kernel_fpu_end(void)
 EXPORT_SYMBOL_GPL(kernel_fpu_end);
 
 /*
- * CR0::TS save/restore functions:
- */
-int irq_ts_save(void)
-{
-	/*
-	 * If in process context and not atomic, we can take a spurious DNA fault.
-	 * Otherwise, doing clts() in process context requires disabling preemption
-	 * or some heavy lifting like kernel_fpu_begin()
-	 */
-	if (!in_atomic())
-		return 0;
-
-	if (read_cr0() & X86_CR0_TS) {
-		clts();
-		return 1;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(irq_ts_save);
-
-void irq_ts_restore(int TS_state)
-{
-	if (TS_state)
-		stts();
-}
-EXPORT_SYMBOL_GPL(irq_ts_restore);
-
-/*
  * Save the FPU state (mark it for reload if necessary):
  *
  * This only ever gets called for the current task.
@@ -200,10 +150,7 @@ void fpu__save(struct fpu *fpu)
 	trace_x86_fpu_before_save(fpu);
 	if (fpu->fpregs_active) {
 		if (!copy_fpregs_to_fpstate(fpu)) {
-			if (use_eager_fpu())
-				copy_kernel_to_fpregs(&fpu->state);
-			else
-				fpregs_deactivate(fpu);
+			copy_kernel_to_fpregs(&fpu->state);
 		}
 	}
 	trace_x86_fpu_after_save(fpu);
@@ -231,13 +178,8 @@ void fpstate_init(union fpregs_state *state)
 
 	memset(state, 0, fpu_kernel_xstate_size);
 
-	/*
-	 * XRSTORS requires that this bit is set in xcomp_bv, or
-	 * it will #GP. Make sure it is replaced after the memset().
-	 */
 	if (static_cpu_has(X86_FEATURE_XSAVES))
-		state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT;
-
+		fpstate_init_xstate(&state->xsave);
 	if (static_cpu_has(X86_FEATURE_FXSR))
 		fpstate_init_fxstate(&state->fxsave);
 	else
@@ -247,7 +189,6 @@ EXPORT_SYMBOL_GPL(fpstate_init);
 
 int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 {
-	dst_fpu->counter = 0;
 	dst_fpu->fpregs_active = 0;
 	dst_fpu->last_cpu = -1;
 
@@ -260,8 +201,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 	 * Don't let 'init optimized' areas of the XSAVE area
 	 * leak into the child task:
 	 */
-	if (use_eager_fpu())
-		memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+	memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
 
 	/*
 	 * Save current FPU registers directly into the child
@@ -283,10 +223,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
 		memcpy(&src_fpu->state, &dst_fpu->state,
 		       fpu_kernel_xstate_size);
 
-		if (use_eager_fpu())
-			copy_kernel_to_fpregs(&src_fpu->state);
-		else
-			fpregs_deactivate(src_fpu);
+		copy_kernel_to_fpregs(&src_fpu->state);
 	}
 	preempt_enable();
 
@@ -366,7 +303,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
 
 	if (fpu->fpstate_active) {
 		/* Invalidate any lazy state: */
-		fpu->last_cpu = -1;
+		__fpu_invalidate_fpregs_state(fpu);
 	} else {
 		fpstate_init(&fpu->state);
 		trace_x86_fpu_init_state(fpu);
@@ -409,7 +346,7 @@ void fpu__current_fpstate_write_begin(void)
 	 * ensures we will not be lazy and skip a XRSTOR in the
 	 * future.
 	 */
-	fpu->last_cpu = -1;
+	__fpu_invalidate_fpregs_state(fpu);
 }
 
 /*
@@ -459,7 +396,6 @@ void fpu__restore(struct fpu *fpu)
 	trace_x86_fpu_before_restore(fpu);
 	fpregs_activate(fpu);
 	copy_kernel_to_fpregs(&fpu->state);
-	fpu->counter++;
 	trace_x86_fpu_after_restore(fpu);
 	kernel_fpu_enable();
 }
@@ -477,7 +413,6 @@ EXPORT_SYMBOL_GPL(fpu__restore);
 void fpu__drop(struct fpu *fpu)
 {
 	preempt_disable();
-	fpu->counter = 0;
 
 	if (fpu->fpregs_active) {
 		/* Ignore delayed exceptions from user space */
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 2f2b8c7ccb85..c2f8dde3255c 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -7,21 +7,10 @@
 #include <asm/cmdline.h>
 
 #include <linux/sched.h>
+#include <linux/sched/task.h>
 #include <linux/init.h>
 
 /*
- * Initialize the TS bit in CR0 according to the style of context-switches
- * we are using:
- */
-static void fpu__init_cpu_ctx_switch(void)
-{
-	if (!boot_cpu_has(X86_FEATURE_EAGER_FPU))
-		stts();
-	else
-		clts();
-}
-
-/*
  * Initialize the registers found in all CPUs, CR0 and CR4:
  */
 static void fpu__init_cpu_generic(void)
@@ -58,16 +47,9 @@ void fpu__init_cpu(void)
 {
 	fpu__init_cpu_generic();
 	fpu__init_cpu_xstate();
-	fpu__init_cpu_ctx_switch();
 }
 
-/*
- * The earliest FPU detection code.
- *
- * Set the X86_FEATURE_FPU CPU-capability bit based on
- * trying to execute an actual sequence of FPU instructions:
- */
-static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+static bool fpu__probe_without_cpuid(void)
 {
 	unsigned long cr0;
 	u16 fsw, fcw;
@@ -78,18 +60,25 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
 	cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
 	write_cr0(cr0);
 
-	if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
-		asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
-			     : "+m" (fsw), "+m" (fcw));
+	asm volatile("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw));
+
+	pr_info("x86/fpu: Probing for FPU: FSW=0x%04hx FCW=0x%04hx\n", fsw, fcw);
 
-		if (fsw == 0 && (fcw & 0x103f) == 0x003f)
-			set_cpu_cap(c, X86_FEATURE_FPU);
+	return fsw == 0 && (fcw & 0x103f) == 0x003f;
+}
+
+static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+{
+	if (!boot_cpu_has(X86_FEATURE_CPUID) &&
+	    !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
+		if (fpu__probe_without_cpuid())
+			setup_force_cpu_cap(X86_FEATURE_FPU);
 		else
-			clear_cpu_cap(c, X86_FEATURE_FPU);
+			setup_clear_cpu_cap(X86_FEATURE_FPU);
 	}
 
 #ifndef CONFIG_MATH_EMULATION
-	if (!boot_cpu_has(X86_FEATURE_FPU)) {
+	if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_FPU)) {
 		pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n");
 		for (;;)
 			asm volatile("hlt");
@@ -233,82 +222,16 @@ static void __init fpu__init_system_xstate_size_legacy(void)
 }
 
 /*
- * FPU context switching strategies:
- *
- * Against popular belief, we don't do lazy FPU saves, due to the
- * task migration complications it brings on SMP - we only do
- * lazy FPU restores.
- *
- * 'lazy' is the traditional strategy, which is based on setting
- * CR0::TS to 1 during context-switch (instead of doing a full
- * restore of the FPU state), which causes the first FPU instruction
- * after the context switch (whenever it is executed) to fault - at
- * which point we lazily restore the FPU state into FPU registers.
- *
- * Tasks are of course under no obligation to execute FPU instructions,
- * so it can easily happen that another context-switch occurs without
- * a single FPU instruction being executed. If we eventually switch
- * back to the original task (that still owns the FPU) then we have
- * not only saved the restores along the way, but we also have the
- * FPU ready to be used for the original task.
- *
- * 'lazy' is deprecated because it's almost never a performance win
- * and it's much more complicated than 'eager'.
- *
- * 'eager' switching is by default on all CPUs, there we switch the FPU
- * state during every context switch, regardless of whether the task
- * has used FPU instructions in that time slice or not. This is done
- * because modern FPU context saving instructions are able to optimize
- * state saving and restoration in hardware: they can detect both
- * unused and untouched FPU state and optimize accordingly.
- *
- * [ Note that even in 'lazy' mode we might optimize context switches
- *   to use 'eager' restores, if we detect that a task is using the FPU
- *   frequently. See the fpu->counter logic in fpu/internal.h for that. ]
- */
-static enum { ENABLE, DISABLE } eagerfpu = ENABLE;
-
-/*
  * Find supported xfeatures based on cpu features and command-line input.
  * This must be called after fpu__init_parse_early_param() is called and
  * xfeatures_mask is enumerated.
  */
 u64 __init fpu__get_supported_xfeatures_mask(void)
 {
-	/* Support all xfeatures known to us */
-	if (eagerfpu != DISABLE)
-		return XCNTXT_MASK;
-
-	/* Warning of xfeatures being disabled for no eagerfpu mode */
-	if (xfeatures_mask & XFEATURE_MASK_EAGER) {
-		pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
-			xfeatures_mask & XFEATURE_MASK_EAGER);
-	}
-
-	/* Return a mask that masks out all features requiring eagerfpu mode */
-	return ~XFEATURE_MASK_EAGER;
+	return XCNTXT_MASK;
 }
 
-/*
- * Disable features dependent on eagerfpu.
- */
-static void __init fpu__clear_eager_fpu_features(void)
-{
-	setup_clear_cpu_cap(X86_FEATURE_MPX);
-}
-
-/*
- * Pick the FPU context switching strategy:
- *
- * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
- * the following is true:
- *
- * (1) the cpu has xsaveopt, as it has the optimization and doing eager
- *     FPU switching has a relatively low cost compared to a plain xsave;
- * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
- *     switching. Should the kernel boot with noxsaveopt, we support MPX
- *     with eager FPU switching at a higher cost.
- */
+/* Legacy code to initialize eager fpu mode. */
 static void __init fpu__init_system_ctx_switch(void)
 {
 	static bool on_boot_cpu __initdata = 1;
@@ -317,17 +240,6 @@ static void __init fpu__init_system_ctx_switch(void)
 	on_boot_cpu = 0;
 
 	WARN_ON_FPU(current->thread.fpu.fpstate_active);
-
-	if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
-		eagerfpu = ENABLE;
-
-	if (xfeatures_mask & XFEATURE_MASK_EAGER)
-		eagerfpu = ENABLE;
-
-	if (eagerfpu == ENABLE)
-		setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
-
-	printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy");
 }
 
 /*
@@ -336,11 +248,6 @@ static void __init fpu__init_system_ctx_switch(void)
  */
 static void __init fpu__init_parse_early_param(void)
 {
-	if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
-		eagerfpu = DISABLE;
-		fpu__clear_eager_fpu_features();
-	}
-
 	if (cmdline_find_option_bool(boot_command_line, "no387"))
 		setup_clear_cpu_cap(X86_FEATURE_FPU);
 
@@ -375,14 +282,6 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
 	 */
 	fpu__init_cpu();
 
-	/*
-	 * But don't leave CR0::TS set yet, as some of the FPU setup
-	 * methods depend on being able to execute FPU instructions
-	 * that will fault on a set TS, such as the FXSAVE in
-	 * fpu__init_system_mxcsr().
-	 */
-	clts();
-
 	fpu__init_system_generic();
 	fpu__init_system_xstate_size_legacy();
 	fpu__init_system_xstate();
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index c114b132d121..b188b16841e3 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -5,6 +5,7 @@
 #include <asm/fpu/signal.h>
 #include <asm/fpu/regset.h>
 #include <asm/fpu/xstate.h>
+#include <linux/sched/task_stack.h>
 
 /*
  * The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a184c210efba..83c23c230b4c 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -340,11 +340,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
 		}
 
 		fpu->fpstate_active = 1;
-		if (use_eager_fpu()) {
-			preempt_disable();
-			fpu__restore(fpu);
-			preempt_enable();
-		}
+		preempt_disable();
+		fpu__restore(fpu);
+		preempt_enable();
 
 		return err;
 	} else {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 095ef7ddd6ae..c24ac1efb12d 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -65,6 +65,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
 	setup_clear_cpu_cap(X86_FEATURE_AVX);
 	setup_clear_cpu_cap(X86_FEATURE_AVX2);
 	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
 	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
 	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
 	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
@@ -73,9 +74,11 @@ void fpu__xstate_clear_all_cpu_caps(void)
 	setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
 	setup_clear_cpu_cap(X86_FEATURE_MPX);
 	setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
 	setup_clear_cpu_cap(X86_FEATURE_PKU);
 	setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
 	setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ);
 }
 
 /*
@@ -703,8 +706,14 @@ void __init fpu__init_system_xstate(void)
 	WARN_ON_FPU(!on_boot_cpu);
 	on_boot_cpu = 0;
 
+	if (!boot_cpu_has(X86_FEATURE_FPU)) {
+		pr_info("x86/fpu: No FPU detected\n");
+		return;
+	}
+
 	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
-		pr_info("x86/fpu: Legacy x87 FPU detected.\n");
+		pr_info("x86/fpu: x87 FPU will use %s\n",
+			boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
 		return;
 	}
 
@@ -890,15 +899,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 	 */
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
 		return -EINVAL;
-	/*
-	 * For most XSAVE components, this would be an arduous task:
-	 * brining fpstate up to date with fpregs, updating fpstate,
-	 * then re-populating fpregs.  But, for components that are
-	 * never lazily managed, we can just access the fpregs
-	 * directly.  PKRU is never managed lazily, so we can just
-	 * manipulate it directly.  Make sure it stays that way.
-	 */
-	WARN_ON_ONCE(!use_eager_fpu());
 
 	/* Set the bits we need in PKRU:  */
 	if (init_val & PKEY_DISABLE_ACCESS)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8639bb2ae058..cbd73eb42170 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -29,6 +29,12 @@
 #include <asm/ftrace.h>
 #include <asm/nops.h>
 
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && \
+	!defined(CC_USING_FENTRY) && \
+	!defined(CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE)
+# error The following combination is not supported: ((compiler missing -mfentry) || (CONFIG_X86_32 and !CONFIG_DYNAMIC_FTRACE)) && CONFIG_FUNCTION_GRAPH_TRACER && CONFIG_CC_OPTIMIZE_FOR_SIZE
+#endif
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 int ftrace_arch_code_modify_prepare(void)
@@ -535,7 +541,7 @@ static void run_sync(void)
 {
 	int enable_irqs = irqs_disabled();
 
-	/* We may be called with interrupts disbled (on bootup). */
+	/* We may be called with interrupts disabled (on bootup). */
 	if (enable_irqs)
 		local_irq_enable();
 	on_each_cpu(do_sync_core, NULL, 1);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index f16c55bfc090..e5fb436a6548 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -49,3 +49,65 @@ asmlinkage __visible void __init i386_start_kernel(void)
 
 	start_kernel();
 }
+
+/*
+ * Initialize page tables.  This creates a PDE and a set of page
+ * tables, which are located immediately beyond __brk_base.  The variable
+ * _brk_end is set up to point to the first "safe" location.
+ * Mappings are created both at virtual address 0 (identity mapping)
+ * and PAGE_OFFSET for up to _end.
+ *
+ * In PAE mode initial_page_table is statically defined to contain
+ * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
+ * entries). The identity mapping is handled by pointing two PGD entries
+ * to the first kernel PMD. Note the upper half of each PMD or PTE are
+ * always zero at this stage.
+ */
+void __init mk_early_pgtbl_32(void)
+{
+#ifdef __pa
+#undef __pa
+#endif
+#define __pa(x)  ((unsigned long)(x) - PAGE_OFFSET)
+	pte_t pte, *ptep;
+	int i;
+	unsigned long *ptr;
+	/* Enough space to fit pagetables for the low memory linear map */
+	const unsigned long limit = __pa(_end) +
+		(PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT);
+#ifdef CONFIG_X86_PAE
+	pmd_t pl2, *pl2p = (pmd_t *)__pa(initial_pg_pmd);
+#define SET_PL2(pl2, val)    { (pl2).pmd = (val); }
+#else
+	pgd_t pl2, *pl2p = (pgd_t *)__pa(initial_page_table);
+#define SET_PL2(pl2, val)   { (pl2).pgd = (val); }
+#endif
+
+	ptep = (pte_t *)__pa(__brk_base);
+	pte.pte = PTE_IDENT_ATTR;
+
+	while ((pte.pte & PTE_PFN_MASK) < limit) {
+
+		SET_PL2(pl2, (unsigned long)ptep | PDE_IDENT_ATTR);
+		*pl2p = pl2;
+#ifndef CONFIG_X86_PAE
+		/* Kernel PDE entry */
+		*(pl2p +  ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2;
+#endif
+		for (i = 0; i < PTRS_PER_PTE; i++) {
+			*ptep = pte;
+			pte.pte += PAGE_SIZE;
+			ptep++;
+		}
+
+		pl2p++;
+	}
+
+	ptr = (unsigned long *)__pa(&max_pfn_mapped);
+	/* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */
+	*ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+
+	ptr = (unsigned long *)__pa(&_brk_end);
+	*ptr = (unsigned long)ptep + PAGE_OFFSET;
+}
+
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 54a2372f5dbb..b5785c197e53 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -4,6 +4,7 @@
  *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
  */
 
+#define DISABLE_BRANCH_PROFILING
 #include <linux/init.h>
 #include <linux/linkage.h>
 #include <linux/types.h>
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 2dabea46f039..1f85ee8f9439 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -24,6 +24,7 @@
 #include <asm/nops.h>
 #include <asm/bootparam.h>
 #include <asm/export.h>
+#include <asm/pgtable_32.h>
 
 /* Physical address */
 #define pa(X) ((X) - __PAGE_OFFSET)
@@ -41,40 +42,8 @@
 #define X86_CAPABILITY	new_cpu_data+CPUINFO_x86_capability
 #define X86_VENDOR_ID	new_cpu_data+CPUINFO_x86_vendor_id
 
-/*
- * This is how much memory in addition to the memory covered up to
- * and including _end we need mapped initially.
- * We need:
- *     (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
- *     (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
- *
- * Modulo rounding, each megabyte assigned here requires a kilobyte of
- * memory, which is currently unreclaimed.
- *
- * This should be a multiple of a page.
- *
- * KERNEL_IMAGE_SIZE should be greater than pa(_end)
- * and small than max_low_pfn, otherwise will waste some page table entries
- */
 
-#if PTRS_PER_PMD > 1
-#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
-#else
-#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
-#endif
-
-/*
- * Number of possible pages in the lowmem region.
- *
- * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
- * gas warning about overflowing shift count when gas has been compiled
- * with only a host target support using a 32-bit type for internal
- * representation.
- */
-LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
-
-/* Enough space to fit pagetables for the low memory linear map */
-MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
+#define SIZEOF_PTREGS 17*4
 
 /*
  * Worst-case size of the kernel mapping we need to make:
@@ -158,109 +127,34 @@ ENTRY(startup_32)
 	call load_ucode_bsp
 #endif
 
-/*
- * Initialize page tables.  This creates a PDE and a set of page
- * tables, which are located immediately beyond __brk_base.  The variable
- * _brk_end is set up to point to the first "safe" location.
- * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end.
- */
-#ifdef CONFIG_X86_PAE
-
-	/*
-	 * In PAE mode initial_page_table is statically defined to contain
-	 * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
-	 * entries). The identity mapping is handled by pointing two PGD entries
-	 * to the first kernel PMD.
-	 *
-	 * Note the upper half of each PMD or PTE are always zero at this stage.
-	 */
-
-#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
-
-	xorl %ebx,%ebx				/* %ebx is kept at zero */
-
-	movl $pa(__brk_base), %edi
-	movl $pa(initial_pg_pmd), %edx
-	movl $PTE_IDENT_ATTR, %eax
-10:
-	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PMD entry */
-	movl %ecx,(%edx)			/* Store PMD entry */
-						/* Upper half already zero */
-	addl $8,%edx
-	movl $512,%ecx
-11:
-	stosl
-	xchgl %eax,%ebx
-	stosl
-	xchgl %eax,%ebx
-	addl $0x1000,%eax
-	loop 11b
-
-	/*
-	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
-	 */
-	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
-	cmpl %ebp,%eax
-	jb 10b
-1:
-	addl $__PAGE_OFFSET, %edi
-	movl %edi, pa(_brk_end)
-	shrl $12, %eax
-	movl %eax, pa(max_pfn_mapped)
+	/* Create early pagetables. */
+	call  mk_early_pgtbl_32
 
 	/* Do early initialization of the fixmap area */
 	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+#ifdef  CONFIG_X86_PAE
+#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
 	movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
-#else	/* Not PAE */
-
-page_pde_offset = (__PAGE_OFFSET >> 20);
-
-	movl $pa(__brk_base), %edi
-	movl $pa(initial_page_table), %edx
-	movl $PTE_IDENT_ATTR, %eax
-10:
-	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PDE entry */
-	movl %ecx,(%edx)			/* Store identity PDE entry */
-	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
-	addl $4,%edx
-	movl $1024, %ecx
-11:
-	stosl
-	addl $0x1000,%eax
-	loop 11b
-	/*
-	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
-	 */
-	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
-	cmpl %ebp,%eax
-	jb 10b
-	addl $__PAGE_OFFSET, %edi
-	movl %edi, pa(_brk_end)
-	shrl $12, %eax
-	movl %eax, pa(max_pfn_mapped)
-
-	/* Do early initialization of the fixmap area */
-	movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+#else
 	movl %eax,pa(initial_page_table+0xffc)
 #endif
 
 #ifdef CONFIG_PARAVIRT
 	/* This is can only trip for a broken bootloader... */
 	cmpw $0x207, pa(boot_params + BP_version)
-	jb default_entry
+	jb .Ldefault_entry
 
 	/* Paravirt-compatible boot parameters.  Look to see what architecture
 		we're booting under. */
 	movl pa(boot_params + BP_hardware_subarch), %eax
 	cmpl $num_subarch_entries, %eax
-	jae bad_subarch
+	jae .Lbad_subarch
 
 	movl pa(subarch_entries)(,%eax,4), %eax
 	subl $__PAGE_OFFSET, %eax
 	jmp *%eax
 
-bad_subarch:
+.Lbad_subarch:
 WEAK(lguest_entry)
 WEAK(xen_entry)
 	/* Unknown implementation; there's really
@@ -270,14 +164,14 @@ WEAK(xen_entry)
 	__INITDATA
 
 subarch_entries:
-	.long default_entry		/* normal x86/PC */
+	.long .Ldefault_entry		/* normal x86/PC */
 	.long lguest_entry		/* lguest hypervisor */
 	.long xen_entry			/* Xen hypervisor */
-	.long default_entry		/* Moorestown MID */
+	.long .Ldefault_entry		/* Moorestown MID */
 num_subarch_entries = (. - subarch_entries) / 4
 .previous
 #else
-	jmp default_entry
+	jmp .Ldefault_entry
 #endif /* CONFIG_PARAVIRT */
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -289,7 +183,8 @@ num_subarch_entries = (. - subarch_entries) / 4
 ENTRY(start_cpu0)
 	movl initial_stack, %ecx
 	movl %ecx, %esp
-	jmp  *(initial_code)
+	call *(initial_code)
+1:	jmp 1b
 ENDPROC(start_cpu0)
 #endif
 
@@ -317,7 +212,7 @@ ENTRY(startup_32_smp)
 	call load_ucode_ap
 #endif
 
-default_entry:
+.Ldefault_entry:
 #define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
 			 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
 			 X86_CR0_PG)
@@ -347,7 +242,7 @@ default_entry:
 	pushfl
 	popl %eax			# get EFLAGS
 	testl $X86_EFLAGS_ID,%eax	# did EFLAGS.ID remained set?
-	jz enable_paging		# hw disallowed setting of ID bit
+	jz .Lenable_paging		# hw disallowed setting of ID bit
 					# which means no CPUID and no CR4
 
 	xorl %eax,%eax
@@ -357,13 +252,13 @@ default_entry:
 	movl $1,%eax
 	cpuid
 	andl $~1,%edx			# Ignore CPUID.FPU
-	jz enable_paging		# No flags or only CPUID.FPU = no CR4
+	jz .Lenable_paging		# No flags or only CPUID.FPU = no CR4
 
 	movl pa(mmu_cr4_features),%eax
 	movl %eax,%cr4
 
 	testb $X86_CR4_PAE, %al		# check if PAE is enabled
-	jz enable_paging
+	jz .Lenable_paging
 
 	/* Check if extended functions are implemented */
 	movl $0x80000000, %eax
@@ -371,7 +266,7 @@ default_entry:
 	/* Value must be in the range 0x80000001 to 0x8000ffff */
 	subl $0x80000001, %eax
 	cmpl $(0x8000ffff-0x80000001), %eax
-	ja enable_paging
+	ja .Lenable_paging
 
 	/* Clear bogus XD_DISABLE bits */
 	call verify_cpu
@@ -380,7 +275,7 @@ default_entry:
 	cpuid
 	/* Execute Disable bit supported? */
 	btl $(X86_FEATURE_NX & 31), %edx
-	jnc enable_paging
+	jnc .Lenable_paging
 
 	/* Setup EFER (Extended Feature Enable Register) */
 	movl $MSR_EFER, %ecx
@@ -390,7 +285,7 @@ default_entry:
 	/* Make changes effective */
 	wrmsr
 
-enable_paging:
+.Lenable_paging:
 
 /*
  * Enable paging
@@ -419,7 +314,7 @@ enable_paging:
  */
 	movb $4,X86			# at least 486
 	cmpl $-1,X86_CPUID
-	je is486
+	je .Lis486
 
 	/* get vendor info */
 	xorl %eax,%eax			# call CPUID with 0 -> return vendor ID
@@ -430,7 +325,7 @@ enable_paging:
 	movl %ecx,X86_VENDOR_ID+8	# last 4 chars
 
 	orl %eax,%eax			# do we have processor info as well?
-	je is486
+	je .Lis486
 
 	movl $1,%eax		# Use the CPUID instruction to get CPU type
 	cpuid
@@ -444,7 +339,7 @@ enable_paging:
 	movb %cl,X86_MASK
 	movl %edx,X86_CAPABILITY
 
-is486:
+.Lis486:
 	movl $0x50022,%ecx	# set AM, WP, NE and MP
 	movl %cr0,%eax
 	andl $0x80000011,%eax	# Save PG,PE,ET
@@ -470,8 +365,9 @@ is486:
 	xorl %eax,%eax			# Clear LDT
 	lldt %ax
 
-	pushl $0		# fake return address for unwinder
-	jmp *(initial_code)
+	call *(initial_code)
+1:	jmp 1b
+ENDPROC(startup_32_smp)
 
 #include "verify_cpu.S"
 
@@ -662,6 +558,7 @@ ENTRY(setup_once_ref)
 __PAGE_ALIGNED_BSS
 	.align PAGE_SIZE
 #ifdef CONFIG_X86_PAE
+.globl initial_pg_pmd
 initial_pg_pmd:
 	.fill 1024*KPMDS,4,0
 #else
@@ -709,7 +606,12 @@ ENTRY(initial_page_table)
 .data
 .balign 4
 ENTRY(initial_stack)
-	.long init_thread_union+THREAD_SIZE
+	/*
+	 * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+	 * unwinder reliably detect the end of the stack.
+	 */
+	.long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \
+	      TOP_OF_KERNEL_STACK_PADDING;
 
 __INITRODATA
 int_msg:
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b4421cc191b0..b467b14b03eb 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -66,13 +66,8 @@ startup_64:
 	 * tables and then reload them.
 	 */
 
-	/*
-	 * Setup stack for verify_cpu(). "-8" because initial_stack is defined
-	 * this way, see below. Our best guess is a NULL ptr for stack
-	 * termination heuristics and we don't want to break anything which
-	 * might depend on it (kgdb, ...).
-	 */
-	leaq	(__end_init_task - 8)(%rip), %rsp
+	/* Set up the stack for verify_cpu(), similar to initial_stack below */
+	leaq	(__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
 
 	/* Sanitize CPU configuration */
 	call verify_cpu
@@ -117,20 +112,20 @@ startup_64:
 	movq	%rdi, %rax
 	shrq	$PGDIR_SHIFT, %rax
 
-	leaq	(4096 + _KERNPG_TABLE)(%rbx), %rdx
+	leaq	(PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
 	movq	%rdx, 0(%rbx,%rax,8)
 	movq	%rdx, 8(%rbx,%rax,8)
 
-	addq	$4096, %rdx
+	addq	$PAGE_SIZE, %rdx
 	movq	%rdi, %rax
 	shrq	$PUD_SHIFT, %rax
 	andl	$(PTRS_PER_PUD-1), %eax
-	movq	%rdx, 4096(%rbx,%rax,8)
+	movq	%rdx, PAGE_SIZE(%rbx,%rax,8)
 	incl	%eax
 	andl	$(PTRS_PER_PUD-1), %eax
-	movq	%rdx, 4096(%rbx,%rax,8)
+	movq	%rdx, PAGE_SIZE(%rbx,%rax,8)
 
-	addq	$8192, %rbx
+	addq	$PAGE_SIZE * 2, %rbx
 	movq	%rdi, %rax
 	shrq	$PMD_SHIFT, %rdi
 	addq	$(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
@@ -147,6 +142,9 @@ startup_64:
 	decl	%ecx
 	jnz	1b
 
+	test %rbp, %rbp
+	jz .Lskip_fixup
+
 	/*
 	 * Fixup the kernel text+data virtual addresses. Note that
 	 * we might write invalid pmds, when the kernel is relocated
@@ -154,9 +152,9 @@ startup_64:
 	 * beyond _end.
 	 */
 	leaq	level2_kernel_pgt(%rip), %rdi
-	leaq	4096(%rdi), %r8
+	leaq	PAGE_SIZE(%rdi), %r8
 	/* See if it is a valid page table entry */
-1:	testb	$1, 0(%rdi)
+1:	testb	$_PAGE_PRESENT, 0(%rdi)
 	jz	2f
 	addq	%rbp, 0(%rdi)
 	/* Go to the next page */
@@ -167,6 +165,7 @@ startup_64:
 	/* Fixup phys_base */
 	addq	%rbp, phys_base(%rip)
 
+.Lskip_fixup:
 	movq	$(early_level4_pgt - __START_KERNEL_map), %rax
 	jmp 1f
 ENTRY(secondary_startup_64)
@@ -265,13 +264,17 @@ ENTRY(secondary_startup_64)
 	movl	$MSR_GS_BASE,%ecx
 	movl	initial_gs(%rip),%eax
 	movl	initial_gs+4(%rip),%edx
-	wrmsr	
+	wrmsr
 
 	/* rsi is pointer to real mode structure with interesting info.
 	   pass it to C */
 	movq	%rsi, %rdi
-	
-	/* Finally jump to run C code and to be on real kernel address
+	jmp	start_cpu
+ENDPROC(secondary_startup_64)
+
+ENTRY(start_cpu)
+	/*
+	 * Jump to run C code and to be on a real kernel address.
 	 * Since we are running on identity-mapped space we have to jump
 	 * to the full 64bit address, this is only possible as indirect
 	 * jump.  In addition we need to ensure %cs is set so we make this
@@ -295,12 +298,14 @@ ENTRY(secondary_startup_64)
 	 *	REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
 	 *		address given in m16:64.
 	 */
-	movq	initial_code(%rip),%rax
-	pushq	$0		# fake return address to stop unwinder
+	pushq	$.Lafter_lret	# put return address on stack for unwinder
+	xorq	%rbp, %rbp	# clear frame pointer
+	movq	initial_code(%rip), %rax
 	pushq	$__KERNEL_CS	# set correct cs
 	pushq	%rax		# target address in negative space
 	lretq
-ENDPROC(secondary_startup_64)
+.Lafter_lret:
+ENDPROC(start_cpu)
 
 #include "verify_cpu.S"
 
@@ -308,15 +313,11 @@ ENDPROC(secondary_startup_64)
 /*
  * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
  * up already except stack. We just set up stack here. Then call
- * start_secondary().
+ * start_secondary() via start_cpu().
  */
 ENTRY(start_cpu0)
-	movq initial_stack(%rip),%rsp
-	movq	initial_code(%rip),%rax
-	pushq	$0		# fake return address to stop unwinder
-	pushq	$__KERNEL_CS	# set correct cs
-	pushq	%rax		# target address in negative space
-	lretq
+	movq	initial_stack(%rip), %rsp
+	jmp	start_cpu
 ENDPROC(start_cpu0)
 #endif
 
@@ -328,7 +329,11 @@ ENDPROC(start_cpu0)
 	GLOBAL(initial_gs)
 	.quad	INIT_PER_CPU_VAR(irq_stack_union)
 	GLOBAL(initial_stack)
-	.quad  init_thread_union+THREAD_SIZE-8
+	/*
+	 * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+	 * unwinder reliably detect the end of the stack.
+	 */
+	.quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
 	__FINITDATA
 
 bad_address:
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 274fab99169d..89ff7af2de50 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -352,8 +352,9 @@ static int hpet_resume(struct clock_event_device *evt, int timer)
 	} else {
 		struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
 
+		irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq));
 		irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
-		disable_irq(hdev->irq);
+		disable_hardirq(hdev->irq);
 		irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
 		enable_irq(hdev->irq);
 	}
@@ -791,7 +792,7 @@ static union hpet_lock hpet __cacheline_aligned = {
 	{ .lock = __ARCH_SPIN_LOCK_UNLOCKED, },
 };
 
-static cycle_t read_hpet(struct clocksource *cs)
+static u64 read_hpet(struct clocksource *cs)
 {
 	unsigned long flags;
 	union hpet_lock old, new;
@@ -802,7 +803,7 @@ static cycle_t read_hpet(struct clocksource *cs)
 	 * Read HPET directly if in NMI.
 	 */
 	if (in_nmi())
-		return (cycle_t)hpet_readl(HPET_COUNTER);
+		return (u64)hpet_readl(HPET_COUNTER);
 
 	/*
 	 * Read the current state of the lock and HPET value atomically.
@@ -821,7 +822,7 @@ static cycle_t read_hpet(struct clocksource *cs)
 		WRITE_ONCE(hpet.value, new.value);
 		arch_spin_unlock(&hpet.lock);
 		local_irq_restore(flags);
-		return (cycle_t)new.value;
+		return (u64)new.value;
 	}
 	local_irq_restore(flags);
 
@@ -843,15 +844,15 @@ contended:
 		new.lockval = READ_ONCE(hpet.lockval);
 	} while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
 
-	return (cycle_t)new.value;
+	return (u64)new.value;
 }
 #else
 /*
  * For UP or 32-bit.
  */
-static cycle_t read_hpet(struct clocksource *cs)
+static u64 read_hpet(struct clocksource *cs)
 {
-	return (cycle_t)hpet_readl(HPET_COUNTER);
+	return (u64)hpet_readl(HPET_COUNTER);
 }
 #endif
 
@@ -867,7 +868,7 @@ static struct clocksource clocksource_hpet = {
 static int hpet_clocksource_register(void)
 {
 	u64 start, now;
-	cycle_t t1;
+	u64 t1;
 
 	/* Start the counter */
 	hpet_restart_counter();
@@ -1051,11 +1052,11 @@ static __init int hpet_late_init(void)
 		return 0;
 
 	/* This notifier should be called after workqueue is ready */
-	ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "AP_X86_HPET_ONLINE",
+	ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online",
 				hpet_cpuhp_online, NULL);
 	if (ret)
 		return ret;
-	ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "X86_HPET_DEAD", NULL,
+	ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "x86/hpet:dead", NULL,
 				hpet_cpuhp_dead);
 	if (ret)
 		goto err_cpuhp;
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 589b3193f102..9c3cf0944bce 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/capability.h>
 #include <linux/errno.h>
@@ -16,6 +17,7 @@
 #include <linux/syscalls.h>
 #include <linux/bitmap.h>
 #include <asm/syscalls.h>
+#include <asm/desc.h>
 
 /*
  * this changes the io permissions bitmap in the current task.
@@ -45,6 +47,16 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 		memset(bitmap, 0xff, IO_BITMAP_BYTES);
 		t->io_bitmap_ptr = bitmap;
 		set_thread_flag(TIF_IO_BITMAP);
+
+		/*
+		 * Now that we have an IO bitmap, we need our TSS limit to be
+		 * correct.  It's fine if we are preempted after doing this:
+		 * with TIF_IO_BITMAP set, context switches will keep our TSS
+		 * limit correct.
+		 */
+		preempt_disable();
+		refresh_tss_limit();
+		preempt_enable();
 	}
 
 	/*
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9f669fdd2010..4d8183b5f113 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -14,7 +14,6 @@
 #include <asm/apic.h>
 #include <asm/io_apic.h>
 #include <asm/irq.h>
-#include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/hw_irq.h>
 #include <asm/desc.h>
@@ -265,7 +264,7 @@ void __smp_x86_platform_ipi(void)
 		x86_platform_ipi_callback();
 }
 
-__visible void smp_x86_platform_ipi(struct pt_regs *regs)
+__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
@@ -316,7 +315,7 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
 }
 #endif
 
-__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 9ebd0b0e73d9..3be74fbdeff2 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -15,8 +15,8 @@
 #include <linux/ftrace.h>
 #include <linux/uaccess.h>
 #include <linux/smp.h>
+#include <linux/sched/task_stack.h>
 #include <asm/io_apic.h>
-#include <asm/idle.h>
 #include <asm/apic.h>
 
 int sysctl_panic_on_stackoverflow;
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 3512ba607361..275487872be2 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -9,6 +9,7 @@
 #include <linux/hardirq.h>
 #include <asm/apic.h>
 #include <asm/trace/irq_vectors.h>
+#include <linux/interrupt.h>
 
 static inline void __smp_irq_work_interrupt(void)
 {
@@ -16,14 +17,14 @@ static inline void __smp_irq_work_interrupt(void)
 	irq_work_run();
 }
 
-__visible void smp_irq_work_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
 {
 	ipi_entering_ack_irq();
 	__smp_irq_work_interrupt();
 	exiting_irq();
 }
 
-__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs)
 {
 	ipi_entering_ack_irq();
 	trace_irq_work_entry(IRQ_WORK_VECTOR);
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
new file mode 100644
index 000000000000..f73f475d0573
--- /dev/null
+++ b/arch/x86/kernel/itmt.c
@@ -0,0 +1,213 @@
+/*
+ * itmt.c: Support Intel Turbo Boost Max Technology 3.0
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT),
+ * the maximum turbo frequencies of some cores in a CPU package may be
+ * higher than for the other cores in the same package.  In that case,
+ * better performance can be achieved by making the scheduler prefer
+ * to run tasks on the CPUs with higher max turbo frequencies.
+ *
+ * This file provides functions and data structures for enabling the
+ * scheduler to favor scheduling on cores can be boosted to a higher
+ * frequency under ITMT.
+ */
+
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sysctl.h>
+#include <linux/nodemask.h>
+
+static DEFINE_MUTEX(itmt_update_mutex);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+
+/* Boolean to track if system has ITMT capabilities */
+static bool __read_mostly sched_itmt_capable;
+
+/*
+ * Boolean to control whether we want to move processes to cpu capable
+ * of higher turbo frequency for cpus supporting Intel Turbo Boost Max
+ * Technology 3.0.
+ *
+ * It can be set via /proc/sys/kernel/sched_itmt_enabled
+ */
+unsigned int __read_mostly sysctl_sched_itmt_enabled;
+
+static int sched_itmt_update_handler(struct ctl_table *table, int write,
+				     void __user *buffer, size_t *lenp,
+				     loff_t *ppos)
+{
+	unsigned int old_sysctl;
+	int ret;
+
+	mutex_lock(&itmt_update_mutex);
+
+	if (!sched_itmt_capable) {
+		mutex_unlock(&itmt_update_mutex);
+		return -EINVAL;
+	}
+
+	old_sysctl = sysctl_sched_itmt_enabled;
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) {
+		x86_topology_update = true;
+		rebuild_sched_domains();
+	}
+
+	mutex_unlock(&itmt_update_mutex);
+
+	return ret;
+}
+
+static unsigned int zero;
+static unsigned int one = 1;
+static struct ctl_table itmt_kern_table[] = {
+	{
+		.procname	= "sched_itmt_enabled",
+		.data		= &sysctl_sched_itmt_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_itmt_update_handler,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{}
+};
+
+static struct ctl_table itmt_root_table[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= itmt_kern_table,
+	},
+	{}
+};
+
+static struct ctl_table_header *itmt_sysctl_header;
+
+/**
+ * sched_set_itmt_support() - Indicate platform supports ITMT
+ *
+ * This function is used by the OS to indicate to scheduler that the platform
+ * is capable of supporting the ITMT feature.
+ *
+ * The current scheme has the pstate driver detects if the system
+ * is ITMT capable and call sched_set_itmt_support.
+ *
+ * This must be done only after sched_set_itmt_core_prio
+ * has been called to set the cpus' priorities.
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ *
+ * Return: 0 on success
+ */
+int sched_set_itmt_support(void)
+{
+	mutex_lock(&itmt_update_mutex);
+
+	if (sched_itmt_capable) {
+		mutex_unlock(&itmt_update_mutex);
+		return 0;
+	}
+
+	itmt_sysctl_header = register_sysctl_table(itmt_root_table);
+	if (!itmt_sysctl_header) {
+		mutex_unlock(&itmt_update_mutex);
+		return -ENOMEM;
+	}
+
+	sched_itmt_capable = true;
+
+	sysctl_sched_itmt_enabled = 1;
+
+	x86_topology_update = true;
+	rebuild_sched_domains();
+
+	mutex_unlock(&itmt_update_mutex);
+
+	return 0;
+}
+
+/**
+ * sched_clear_itmt_support() - Revoke platform's support of ITMT
+ *
+ * This function is used by the OS to indicate that it has
+ * revoked the platform's support of ITMT feature.
+ *
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ */
+void sched_clear_itmt_support(void)
+{
+	mutex_lock(&itmt_update_mutex);
+
+	if (!sched_itmt_capable) {
+		mutex_unlock(&itmt_update_mutex);
+		return;
+	}
+	sched_itmt_capable = false;
+
+	if (itmt_sysctl_header) {
+		unregister_sysctl_table(itmt_sysctl_header);
+		itmt_sysctl_header = NULL;
+	}
+
+	if (sysctl_sched_itmt_enabled) {
+		/* disable sched_itmt if we are no longer ITMT capable */
+		sysctl_sched_itmt_enabled = 0;
+		x86_topology_update = true;
+		rebuild_sched_domains();
+	}
+
+	mutex_unlock(&itmt_update_mutex);
+}
+
+int arch_asym_cpu_priority(int cpu)
+{
+	return per_cpu(sched_core_priority, cpu);
+}
+
+/**
+ * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
+ * @prio:	Priority of cpu core
+ * @core_cpu:	The cpu number associated with the core
+ *
+ * The pstate driver will find out the max boost frequency
+ * and call this function to set a priority proportional
+ * to the max boost frequency. CPU with higher boost
+ * frequency will receive higher priority.
+ *
+ * No need to rebuild sched domain after updating
+ * the CPU priorities. The sched domains have no
+ * dependency on CPU priorities.
+ */
+void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+	int cpu, i = 1;
+
+	for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
+		int smt_prio;
+
+		/*
+		 * Ensure that the siblings are moved to the end
+		 * of the priority chain and only used when
+		 * all other high priority cpus are out of capacity.
+		 */
+		smt_prio = prio * smp_num_siblings / i;
+		per_cpu(sched_core_priority, cpu) = smt_prio;
+		i++;
+	}
+}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index fc25f698d792..c37bd0f39c70 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -32,8 +32,7 @@ static void bug_at(unsigned char *ip, int line)
 	 * Something went wrong. Crash the box, as something could be
 	 * corrupting the kernel.
 	 */
-	pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n",
-	       ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line);
+	pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line);
 	BUG();
 }
 
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index bdb83e431d89..38b64587b31b 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -167,7 +167,7 @@ static int __init boot_params_kdebugfs_init(void)
 	struct dentry *dbp, *version, *data;
 	int error = -ENOMEM;
 
-	dbp = debugfs_create_dir("boot_params", NULL);
+	dbp = debugfs_create_dir("boot_params", arch_debugfs_dir);
 	if (!dbp)
 		return -ENOMEM;
 
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 3407b148c240..d0a814a9d96a 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -331,17 +331,17 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 
 	struct setup_header *header;
 	int setup_sects, kern16_size, ret = 0;
-	unsigned long setup_header_size, params_cmdline_sz, params_misc_sz;
+	unsigned long setup_header_size, params_cmdline_sz;
 	struct boot_params *params;
 	unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
 	unsigned long purgatory_load_addr;
-	unsigned long kernel_bufsz, kernel_memsz, kernel_align;
-	char *kernel_buf;
 	struct bzimage64_data *ldata;
 	struct kexec_entry64_regs regs64;
 	void *stack;
 	unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
 	unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+	struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX,
+				  .top_down = true };
 
 	header = (struct setup_header *)(kernel + setup_hdr_offset);
 	setup_sects = header->setup_sects;
@@ -402,11 +402,11 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 	params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
 				MAX_ELFCOREHDR_STR_LEN;
 	params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
-	params_misc_sz = params_cmdline_sz + efi_map_sz +
+	kbuf.bufsz = params_cmdline_sz + efi_map_sz +
 				sizeof(struct setup_data) +
 				sizeof(struct efi_setup_data);
 
-	params = kzalloc(params_misc_sz, GFP_KERNEL);
+	params = kzalloc(kbuf.bufsz, GFP_KERNEL);
 	if (!params)
 		return ERR_PTR(-ENOMEM);
 	efi_map_offset = params_cmdline_sz;
@@ -418,37 +418,41 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 	/* Is there a limit on setup header size? */
 	memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
 
-	ret = kexec_add_buffer(image, (char *)params, params_misc_sz,
-			       params_misc_sz, 16, MIN_BOOTPARAM_ADDR,
-			       ULONG_MAX, 1, &bootparam_load_addr);
+	kbuf.buffer = params;
+	kbuf.memsz = kbuf.bufsz;
+	kbuf.buf_align = 16;
+	kbuf.buf_min = MIN_BOOTPARAM_ADDR;
+	ret = kexec_add_buffer(&kbuf);
 	if (ret)
 		goto out_free_params;
+	bootparam_load_addr = kbuf.mem;
 	pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-		 bootparam_load_addr, params_misc_sz, params_misc_sz);
+		 bootparam_load_addr, kbuf.bufsz, kbuf.bufsz);
 
 	/* Load kernel */
-	kernel_buf = kernel + kern16_size;
-	kernel_bufsz =  kernel_len - kern16_size;
-	kernel_memsz = PAGE_ALIGN(header->init_size);
-	kernel_align = header->kernel_alignment;
-
-	ret = kexec_add_buffer(image, kernel_buf,
-			       kernel_bufsz, kernel_memsz, kernel_align,
-			       MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1,
-			       &kernel_load_addr);
+	kbuf.buffer = kernel + kern16_size;
+	kbuf.bufsz =  kernel_len - kern16_size;
+	kbuf.memsz = PAGE_ALIGN(header->init_size);
+	kbuf.buf_align = header->kernel_alignment;
+	kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+	ret = kexec_add_buffer(&kbuf);
 	if (ret)
 		goto out_free_params;
+	kernel_load_addr = kbuf.mem;
 
 	pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-		 kernel_load_addr, kernel_memsz, kernel_memsz);
+		 kernel_load_addr, kbuf.bufsz, kbuf.memsz);
 
 	/* Load initrd high */
 	if (initrd) {
-		ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len,
-				       PAGE_SIZE, MIN_INITRD_LOAD_ADDR,
-				       ULONG_MAX, 1, &initrd_load_addr);
+		kbuf.buffer = initrd;
+		kbuf.bufsz = kbuf.memsz = initrd_len;
+		kbuf.buf_align = PAGE_SIZE;
+		kbuf.buf_min = MIN_INITRD_LOAD_ADDR;
+		ret = kexec_add_buffer(&kbuf);
 		if (ret)
 			goto out_free_params;
+		initrd_load_addr = kbuf.mem;
 
 		pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
 				initrd_load_addr, initrd_len, initrd_len);
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index c6ee63f927ab..d688826e5736 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -67,7 +67,7 @@
 #endif
 
 /* Ensure if the instruction can be boostable */
-extern int can_boost(kprobe_opcode_t *instruction);
+extern int can_boost(kprobe_opcode_t *instruction, void *addr);
 /* Recover instruction if given address is probed */
 extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
 					 unsigned long addr);
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index d9d8d16b69db..993fa4fe4f68 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -45,6 +45,7 @@
 #include <linux/slab.h>
 #include <linux/hardirq.h>
 #include <linux/preempt.h>
+#include <linux/sched/debug.h>
 #include <linux/extable.h>
 #include <linux/kdebug.h>
 #include <linux/kallsyms.h>
@@ -56,7 +57,7 @@
 #include <asm/cacheflush.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/alternative.h>
 #include <asm/insn.h>
 #include <asm/debugreg.h>
@@ -166,12 +167,12 @@ NOKPROBE_SYMBOL(skip_prefixes);
  * Returns non-zero if opcode is boostable.
  * RIP relative instructions are adjusted at copying time in 64 bits mode
  */
-int can_boost(kprobe_opcode_t *opcodes)
+int can_boost(kprobe_opcode_t *opcodes, void *addr)
 {
 	kprobe_opcode_t opcode;
 	kprobe_opcode_t *orig_opcodes = opcodes;
 
-	if (search_exception_tables((unsigned long)opcodes))
+	if (search_exception_tables((unsigned long)addr))
 		return 0;	/* Page fault may occur on this address. */
 
 retry:
@@ -416,7 +417,7 @@ static int arch_copy_kprobe(struct kprobe *p)
 	 * __copy_instruction can modify the displacement of the instruction,
 	 * but it doesn't affect boostable check.
 	 */
-	if (can_boost(p->ainsn.insn))
+	if (can_boost(p->ainsn.insn, p->addr))
 		p->ainsn.boostable = 0;
 	else
 		p->ainsn.boostable = -1;
@@ -745,7 +746,7 @@ __visible __used void *trampoline_handler(struct pt_regs *regs)
 	 *	 will be the real return address, and all the rest will
 	 *	 point to kretprobe_trampoline.
 	 */
-	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+	hlist_for_each_entry(ri, head, hlist) {
 		if (ri->task != current)
 			/* another task is sharing our hash bucket */
 			continue;
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 3bb4c5f021f6..3e7c6e5a08ff 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -33,7 +33,7 @@
 #include <asm/cacheflush.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/alternative.h>
 #include <asm/insn.h>
 #include <asm/debugreg.h>
@@ -178,7 +178,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src)
 
 	while (len < RELATIVEJUMP_SIZE) {
 		ret = __copy_instruction(dest + len, src + len);
-		if (!ret || !can_boost(dest + len))
+		if (!ret || !can_boost(dest + len, src + len))
 			return -EINVAL;
 		len += ret;
 	}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index edbbfc854e39..14f65a5f938e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -42,7 +42,6 @@
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/tlbflush.h>
-#include <asm/idle.h>
 #include <asm/apic.h>
 #include <asm/apicdef.h>
 #include <asm/hypervisor.h>
@@ -267,13 +266,11 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		/* page is swapped out by the host. */
 		prev_state = exception_enter();
-		exit_idle();
 		kvm_async_pf_task_wait((u32)read_cr2());
 		exception_exit(prev_state);
 		break;
 	case KVM_PV_REASON_PAGE_READY:
 		rcu_irq_enter();
-		exit_idle();
 		kvm_async_pf_task_wake((u32)read_cr2());
 		rcu_irq_exit();
 		break;
@@ -308,7 +305,7 @@ static void kvm_register_steal_time(void)
 
 static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
 
-static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
+static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
 {
 	/**
 	 * This relies on __test_and_clear_bit to modify the memory
@@ -319,7 +316,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
 	 */
 	if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
 		return;
-	apic_write(APIC_EOI, APIC_EOI_ACK);
+	apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
 }
 
 static void kvm_guest_cpu_init(void)
@@ -592,6 +589,38 @@ out:
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_X86_32
+__visible bool __kvm_vcpu_is_preempted(long cpu)
+{
+	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
+
+	return !!src->preempted;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
+#else
+
+#include <asm/asm-offsets.h>
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+
+/*
+ * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
+ * restoring to/from the stack.
+ */
+asm(
+".pushsection .text;"
+".global __raw_callee_save___kvm_vcpu_is_preempted;"
+".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+"__raw_callee_save___kvm_vcpu_is_preempted:"
+"movq	__per_cpu_offset(,%rdi,8), %rax;"
+"cmpb	$0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+"setne	%al;"
+"ret;"
+".popsection");
+
+#endif
+
 /*
  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
  */
@@ -608,20 +637,11 @@ void __init kvm_spinlock_init(void)
 	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
 	pv_lock_ops.wait = kvm_wait;
 	pv_lock_ops.kick = kvm_kick_cpu;
-}
 
-static __init int kvm_spinlock_init_jump(void)
-{
-	if (!kvm_para_available())
-		return 0;
-	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
-		return 0;
-
-	static_key_slow_inc(&paravirt_ticketlocks_enabled);
-	printk(KERN_INFO "KVM setup paravirtual spinlock\n");
-
-	return 0;
+	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+		pv_lock_ops.vcpu_is_preempted =
+			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
+	}
 }
-early_initcall(kvm_spinlock_init_jump);
 
 #endif	/* CONFIG_PARAVIRT_SPINLOCKS */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 60b9949f1e65..d88967659098 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -25,14 +25,16 @@
 #include <linux/hardirq.h>
 #include <linux/memblock.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
+#include <asm/kvmclock.h>
 
 static int kvmclock __ro_after_init = 1;
 static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
 static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
-static cycle_t kvm_sched_clock_offset;
+static u64 kvm_sched_clock_offset;
 
 static int parse_no_kvmclock(char *arg)
 {
@@ -49,6 +51,7 @@ struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
 {
 	return hv_clock;
 }
+EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va);
 
 /*
  * The wallclock is the time of day when we booted. Since then, some time may
@@ -79,10 +82,10 @@ static int kvm_set_wallclock(const struct timespec *now)
 	return -1;
 }
 
-static cycle_t kvm_clock_read(void)
+static u64 kvm_clock_read(void)
 {
 	struct pvclock_vcpu_time_info *src;
-	cycle_t ret;
+	u64 ret;
 	int cpu;
 
 	preempt_disable_notrace();
@@ -93,12 +96,12 @@ static cycle_t kvm_clock_read(void)
 	return ret;
 }
 
-static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
+static u64 kvm_clock_get_cycles(struct clocksource *cs)
 {
 	return kvm_clock_read();
 }
 
-static cycle_t kvm_sched_clock_read(void)
+static u64 kvm_sched_clock_read(void)
 {
 	return kvm_clock_read() - kvm_sched_clock_offset;
 }
@@ -107,12 +110,12 @@ static inline void kvm_sched_clock_init(bool stable)
 {
 	if (!stable) {
 		pv_time_ops.sched_clock = kvm_clock_read;
+		clear_sched_clock_stable();
 		return;
 	}
 
 	kvm_sched_clock_offset = kvm_clock_read();
 	pv_time_ops.sched_clock = kvm_sched_clock_read;
-	set_sched_clock_stable();
 
 	printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
 			kvm_sched_clock_offset);
@@ -174,13 +177,14 @@ bool kvm_check_and_clear_guest_paused(void)
 	return ret;
 }
 
-static struct clocksource kvm_clock = {
+struct clocksource kvm_clock = {
 	.name = "kvm-clock",
 	.read = kvm_clock_get_cycles,
 	.rating = 400,
 	.mask = CLOCKSOURCE_MASK(64),
 	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
+EXPORT_SYMBOL_GPL(kvm_clock);
 
 int kvm_register_clock(char *txt)
 {
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 6707039b9032..d4a15831ac58 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -34,10 +34,10 @@ static void flush_ldt(void *current_mm)
 }
 
 /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
-static struct ldt_struct *alloc_ldt_struct(int size)
+static struct ldt_struct *alloc_ldt_struct(unsigned int size)
 {
 	struct ldt_struct *new_ldt;
-	int alloc_size;
+	unsigned int alloc_size;
 
 	if (size > LDT_ENTRIES)
 		return NULL;
@@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
 
 	paravirt_free_ldt(ldt->entries, ldt->size);
 	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
-		vfree(ldt->entries);
+		vfree_atomic(ldt->entries);
 	else
 		free_page((unsigned long)ldt->entries);
 	kfree(ldt);
@@ -207,11 +207,11 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount)
 static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 {
 	struct mm_struct *mm = current->mm;
+	struct ldt_struct *new_ldt, *old_ldt;
+	unsigned int oldsize, newsize;
+	struct user_desc ldt_info;
 	struct desc_struct ldt;
 	int error;
-	struct user_desc ldt_info;
-	int oldsize, newsize;
-	struct ldt_struct *new_ldt, *old_ldt;
 
 	error = -EINVAL;
 	if (bytecount != sizeof(ldt_info))
@@ -249,7 +249,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 
 	old_ldt = mm->context.ldt;
 	oldsize = old_ldt ? old_ldt->size : 0;
-	newsize = max((int)(ldt_info.entry_number + 1), oldsize);
+	newsize = max(ldt_info.entry_number + 1, oldsize);
 
 	error = -ENOMEM;
 	new_ldt = alloc_ldt_struct(newsize);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 8c1f218926d7..857cdbd02867 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -194,19 +194,22 @@ static int arch_update_purgatory(struct kimage *image)
 
 	/* Setup copying of backup region */
 	if (image->type == KEXEC_TYPE_CRASH) {
-		ret = kexec_purgatory_get_set_symbol(image, "backup_dest",
+		ret = kexec_purgatory_get_set_symbol(image,
+				"purgatory_backup_dest",
 				&image->arch.backup_load_addr,
 				sizeof(image->arch.backup_load_addr), 0);
 		if (ret)
 			return ret;
 
-		ret = kexec_purgatory_get_set_symbol(image, "backup_src",
+		ret = kexec_purgatory_get_set_symbol(image,
+				"purgatory_backup_src",
 				&image->arch.backup_src_start,
 				sizeof(image->arch.backup_src_start), 0);
 		if (ret)
 			return ret;
 
-		ret = kexec_purgatory_get_set_symbol(image, "backup_sz",
+		ret = kexec_purgatory_get_set_symbol(image,
+				"purgatory_backup_sz",
 				&image->arch.backup_src_sz,
 				sizeof(image->arch.backup_src_sz), 0);
 		if (ret)
@@ -328,7 +331,7 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
-	VMCOREINFO_SYMBOL(phys_base);
+	VMCOREINFO_NUMBER(phys_base);
 	VMCOREINFO_SYMBOL(init_level4_pgt);
 
 #ifdef CONFIG_NUMA
@@ -337,9 +340,7 @@ void arch_crash_save_vmcoreinfo(void)
 #endif
 	vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
 			      kaslr_offset());
-	VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET);
-	VMCOREINFO_VMALLOC_START(VMALLOC_START);
-	VMCOREINFO_VMEMMAP_START(VMEMMAP_START);
+	VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
 }
 
 /* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7f3550acde1b..ef688804f80d 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -44,6 +44,7 @@
 #include <asm/msr.h>
 
 static struct class *msr_class;
+static enum cpuhp_state cpuhp_msr_state;
 
 static ssize_t msr_read(struct file *file, char __user *buf,
 			size_t count, loff_t *ppos)
@@ -180,7 +181,7 @@ static const struct file_operations msr_fops = {
 	.compat_ioctl = msr_ioctl,
 };
 
-static int msr_device_create(int cpu)
+static int msr_device_create(unsigned int cpu)
 {
 	struct device *dev;
 
@@ -189,34 +190,12 @@ static int msr_device_create(int cpu)
 	return PTR_ERR_OR_ZERO(dev);
 }
 
-static void msr_device_destroy(int cpu)
+static int msr_device_destroy(unsigned int cpu)
 {
 	device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
+	return 0;
 }
 
-static int msr_class_cpu_callback(struct notifier_block *nfb,
-				  unsigned long action, void *hcpu)
-{
-	unsigned int cpu = (unsigned long)hcpu;
-	int err = 0;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-		err = msr_device_create(cpu);
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-		msr_device_destroy(cpu);
-		break;
-	}
-	return notifier_from_errno(err);
-}
-
-static struct notifier_block __refdata msr_class_cpu_notifier = {
-	.notifier_call = msr_class_cpu_callback,
-};
-
 static char *msr_devnode(struct device *dev, umode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
@@ -224,13 +203,11 @@ static char *msr_devnode(struct device *dev, umode_t *mode)
 
 static int __init msr_init(void)
 {
-	int i, err = 0;
-	i = 0;
+	int err;
 
 	if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
 		pr_err("unable to get major %d for msr\n", MSR_MAJOR);
-		err = -EBUSY;
-		goto out;
+		return -EBUSY;
 	}
 	msr_class = class_create(THIS_MODULE, "msr");
 	if (IS_ERR(msr_class)) {
@@ -239,44 +216,27 @@ static int __init msr_init(void)
 	}
 	msr_class->devnode = msr_devnode;
 
-	cpu_notifier_register_begin();
-	for_each_online_cpu(i) {
-		err = msr_device_create(i);
-		if (err != 0)
-			goto out_class;
-	}
-	__register_hotcpu_notifier(&msr_class_cpu_notifier);
-	cpu_notifier_register_done();
-
-	err = 0;
-	goto out;
+	err  = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online",
+				 msr_device_create, msr_device_destroy);
+	if (err < 0)
+		goto out_class;
+	cpuhp_msr_state = err;
+	return 0;
 
 out_class:
-	i = 0;
-	for_each_online_cpu(i)
-		msr_device_destroy(i);
-	cpu_notifier_register_done();
 	class_destroy(msr_class);
 out_chrdev:
 	__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
-out:
 	return err;
 }
+module_init(msr_init);
 
 static void __exit msr_exit(void)
 {
-	int cpu = 0;
-
-	cpu_notifier_register_begin();
-	for_each_online_cpu(cpu)
-		msr_device_destroy(cpu);
+	cpuhp_remove_state(cpuhp_msr_state);
 	class_destroy(msr_class);
 	__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
-	__unregister_hotcpu_notifier(&msr_class_cpu_notifier);
-	cpu_notifier_register_done();
 }
-
-module_init(msr_init);
 module_exit(msr_exit)
 
 MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index bfe4d6c96fbd..a723ae9440ab 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -13,6 +13,7 @@
 #include <linux/spinlock.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
+#include <linux/sched/debug.h>
 #include <linux/nmi.h>
 #include <linux/debugfs.h>
 #include <linux/delay.h>
@@ -20,6 +21,7 @@
 #include <linux/ratelimit.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/sched/clock.h>
 
 #if defined(CONFIG_EDAC)
 #include <linux/edac.h>
@@ -164,11 +166,9 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
 	spin_lock_irqsave(&desc->lock, flags);
 
 	/*
-	 * most handlers of type NMI_UNKNOWN never return because
-	 * they just assume the NMI is theirs.  Just a sanity check
-	 * to manage expectations
+	 * Indicate if there are multiple registrations on the
+	 * internal NMI handler call chains (SERR and IO_CHECK).
 	 */
-	WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
 	WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
 	WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
 
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 2c55a003b793..8f2d1c9d43a8 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -12,7 +12,6 @@ __visible void __native_queued_spin_unlock(struct qspinlock *lock)
 {
 	native_queued_spin_unlock(lock);
 }
-
 PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
 
 bool pv_is_native_spin_unlock(void)
@@ -21,15 +20,25 @@ bool pv_is_native_spin_unlock(void)
 		__raw_callee_save___native_queued_spin_unlock;
 }
 
+__visible bool __native_vcpu_is_preempted(long cpu)
+{
+	return false;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
+
+bool pv_is_native_vcpu_is_preempted(void)
+{
+	return pv_lock_ops.vcpu_is_preempted.func ==
+		__raw_callee_save___native_vcpu_is_preempted;
+}
+
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
 	.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
 	.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
 	.wait = paravirt_nop,
 	.kick = paravirt_nop,
+	.vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted),
 #endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
-
-struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE;
-EXPORT_SYMBOL(paravirt_ticketlocks_enabled);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bbf3d5933eaa..4797e87b0fb6 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -328,7 +328,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
 	.cpuid = native_cpuid,
 	.get_debugreg = native_get_debugreg,
 	.set_debugreg = native_set_debugreg,
-	.clts = native_clts,
 	.read_cr0 = native_read_cr0,
 	.write_cr0 = native_write_cr0,
 	.read_cr4 = native_read_cr4,
@@ -426,6 +425,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 	.pmd_clear = native_pmd_clear,
 #endif
 	.set_pud = native_set_pud,
+	.set_pud_at = native_set_pud_at,
 
 	.pmd_val = PTE_IDENT,
 	.make_pmd = PTE_IDENT,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 920c6ae08592..553acbbb4d32 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -8,10 +8,10 @@ DEF_NATIVE(pv_cpu_ops, iret, "iret");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
 
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
 #endif
 
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
@@ -27,6 +27,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
 }
 
 extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
 
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
@@ -48,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_mmu_ops, read_cr2);
 		PATCH_SITE(pv_mmu_ops, read_cr3);
 		PATCH_SITE(pv_mmu_ops, write_cr3);
-		PATCH_SITE(pv_cpu_ops, clts);
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
 		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
 			if (pv_is_native_spin_unlock()) {
@@ -56,9 +56,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 				end   = end_pv_lock_ops_queued_spin_unlock;
 				goto patch_site;
 			}
+			goto patch_default;
+
+		case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+			if (pv_is_native_vcpu_is_preempted()) {
+				start = start_pv_lock_ops_vcpu_is_preempted;
+				end   = end_pv_lock_ops_vcpu_is_preempted;
+				goto patch_site;
+			}
+			goto patch_default;
 #endif
 
 	default:
+patch_default: __maybe_unused
 		ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
 		break;
 
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index bb3840cedb4f..11aaf1eaa0e4 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
 DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
 
 DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -21,6 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax");
 
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax");
 #endif
 
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
@@ -36,6 +36,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
 }
 
 extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
 
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
@@ -58,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_mmu_ops, read_cr2);
 		PATCH_SITE(pv_mmu_ops, read_cr3);
 		PATCH_SITE(pv_mmu_ops, write_cr3);
-		PATCH_SITE(pv_cpu_ops, clts);
 		PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 		PATCH_SITE(pv_cpu_ops, wbinvd);
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
@@ -68,9 +68,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 				end   = end_pv_lock_ops_queued_spin_unlock;
 				goto patch_site;
 			}
+			goto patch_default;
+
+		case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+			if (pv_is_native_vcpu_is_preempted()) {
+				start = start_pv_lock_ops_vcpu_is_preempted;
+				end   = end_pv_lock_ops_vcpu_is_preempted;
+				goto patch_site;
+			}
+			goto patch_default;
 #endif
 
 	default:
+patch_default: __maybe_unused
 		ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
 		break;
 
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 5d400ba1349d..0c150c06fa5a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -296,7 +296,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 
 	/* were we called with bad_dma_address? */
 	badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
-	if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
+	if (unlikely(dma_addr < badend)) {
 		WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
 		       "address 0x%Lx\n", dma_addr);
 		return;
@@ -478,7 +478,7 @@ static void calgary_free_coherent(struct device *dev, size_t size,
 	free_pages((unsigned long)vaddr, get_order(size));
 }
 
-static struct dma_map_ops calgary_dma_ops = {
+static const struct dma_map_ops calgary_dma_ops = {
 	.alloc = calgary_alloc_coherent,
 	.free = calgary_free_coherent,
 	.map_sg = calgary_map_sg,
@@ -1177,7 +1177,7 @@ static int __init calgary_init(void)
 		tbl = find_iommu_table(&dev->dev);
 
 		if (translation_enabled(tbl))
-			dev->dev.archdata.dma_ops = &calgary_dma_ops;
+			dev->dev.dma_ops = &calgary_dma_ops;
 	}
 
 	return ret;
@@ -1201,7 +1201,7 @@ error:
 		calgary_disable_translation(dev);
 		calgary_free_bus(dev);
 		pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
-		dev->dev.archdata.dma_ops = NULL;
+		dev->dev.dma_ops = NULL;
 	} while (1);
 
 	return ret;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d30c37750765..3a216ec869cd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -17,7 +17,7 @@
 
 static int forbid_dac __read_mostly;
 
-struct dma_map_ops *dma_ops = &nommu_dma_ops;
+const struct dma_map_ops *dma_ops = &nommu_dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static int iommu_sac_force __read_mostly;
@@ -91,7 +91,8 @@ again:
 	page = NULL;
 	/* CMA can be used only in the context which permits sleeping */
 	if (gfpflags_allow_blocking(flag)) {
-		page = dma_alloc_from_contiguous(dev, count, get_order(size));
+		page = dma_alloc_from_contiguous(dev, count, get_order(size),
+						 flag);
 		if (page && page_to_phys(page) + size > dma_mask) {
 			dma_release_from_contiguous(dev, page, count);
 			page = NULL;
@@ -214,7 +215,7 @@ early_param("iommu", iommu_setup);
 
 int dma_supported(struct device *dev, u64 mask)
 {
-	struct dma_map_ops *ops = get_dma_ops(dev);
+	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 #ifdef CONFIG_PCI
 	if (mask > 0xffffffff && forbid_dac > 0) {
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 00e71ce396a8..a88952ef371c 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -88,7 +88,7 @@ static void nommu_sync_sg_for_device(struct device *dev,
 	flush_write_buffers();
 }
 
-struct dma_map_ops nommu_dma_ops = {
+const struct dma_map_ops nommu_dma_ops = {
 	.alloc			= dma_generic_alloc_coherent,
 	.free			= dma_generic_free_coherent,
 	.map_sg			= nommu_map_sg,
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index b47edb8f5256..1e23577e17cf 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -45,7 +45,7 @@ void x86_swiotlb_free_coherent(struct device *dev, size_t size,
 		dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
 }
 
-static struct dma_map_ops swiotlb_dma_ops = {
+static const struct dma_map_ops swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 	.alloc = x86_swiotlb_alloc_coherent,
 	.free = x86_swiotlb_free_coherent,
@@ -68,12 +68,10 @@ static struct dma_map_ops swiotlb_dma_ops = {
  */
 int __init pci_swiotlb_detect_override(void)
 {
-	int use_swiotlb = swiotlb | swiotlb_force;
-
-	if (swiotlb_force)
+	if (swiotlb_force == SWIOTLB_FORCE)
 		swiotlb = 1;
 
-	return use_swiotlb;
+	return swiotlb;
 }
 IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
 		  pci_xen_swiotlb_detect,
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index da8cb987b973..587d887f7f17 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -1,6 +1,7 @@
 #include <linux/errno.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/perf_event.h>
 #include <linux/bug.h>
 #include <linux/stddef.h>
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index 24a50301f150..91271122f0df 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -6,6 +6,7 @@
 
 void __init x86_early_init_platform_quirks(void)
 {
+	x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT;
 	x86_platform.legacy.rtc = 1;
 	x86_platform.legacy.reserve_bios_regions = 0;
 	x86_platform.legacy.devices.pnpbios = 1;
@@ -16,10 +17,14 @@ void __init x86_early_init_platform_quirks(void)
 		break;
 	case X86_SUBARCH_XEN:
 	case X86_SUBARCH_LGUEST:
+		x86_platform.legacy.devices.pnpbios = 0;
+		x86_platform.legacy.rtc = 0;
+		break;
 	case X86_SUBARCH_INTEL_MID:
 	case X86_SUBARCH_CE4100:
 		x86_platform.legacy.devices.pnpbios = 0;
 		x86_platform.legacy.rtc = 0;
+		x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
 		break;
 	}
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0888a879120f..f67591561711 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,6 +7,10 @@
 #include <linux/prctl.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/sched/idle.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/pm.h>
@@ -23,8 +27,7 @@
 #include <asm/cpu.h>
 #include <asm/apic.h>
 #include <asm/syscalls.h>
-#include <asm/idle.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/mwait.h>
 #include <asm/fpu/internal.h>
 #include <asm/debugreg.h>
@@ -33,6 +36,7 @@
 #include <asm/mce.h>
 #include <asm/vm86.h>
 #include <asm/switch_to.h>
+#include <asm/desc.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -65,22 +69,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
 };
 EXPORT_PER_CPU_SYMBOL(cpu_tss);
 
-#ifdef CONFIG_X86_64
-static DEFINE_PER_CPU(unsigned char, is_idle);
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
-	atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
-	atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
-#endif
+DEFINE_PER_CPU(bool, __tss_limit_invalid);
+EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
 
 /*
  * this gets called so that we can store lazy state into memory and copy the
@@ -227,6 +217,12 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		 */
 		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
 		       max(prev->io_bitmap_max, next->io_bitmap_max));
+
+		/*
+		 * Make sure that the TSS limit is correct for the CPU
+		 * to notice the IO bitmap.
+		 */
+		refresh_tss_limit();
 	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
 		/*
 		 * Clear any possible leftover bits:
@@ -251,39 +247,10 @@ static inline void play_dead(void)
 }
 #endif
 
-#ifdef CONFIG_X86_64
-void enter_idle(void)
-{
-	this_cpu_write(is_idle, 1);
-	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
-}
-
-static void __exit_idle(void)
-{
-	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
-		return;
-	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
-}
-
-/* Called from interrupts to signify idle end */
-void exit_idle(void)
-{
-	/* idle loop has pid 0 */
-	if (current->pid)
-		return;
-	__exit_idle();
-}
-#endif
-
 void arch_cpu_idle_enter(void)
 {
+	tsc_verify_tsc_adjust(false);
 	local_touch_nmi();
-	enter_idle();
-}
-
-void arch_cpu_idle_exit(void)
-{
-	__exit_idle();
 }
 
 void arch_cpu_idle_dead(void)
@@ -336,59 +303,33 @@ void stop_this_cpu(void *dummy)
 		halt();
 }
 
-bool amd_e400_c1e_detected;
-EXPORT_SYMBOL(amd_e400_c1e_detected);
-
-static cpumask_var_t amd_e400_c1e_mask;
-
-void amd_e400_remove_cpu(int cpu)
-{
-	if (amd_e400_c1e_mask != NULL)
-		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
-}
-
 /*
- * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
- * pending message MSR. If we detect C1E, then we handle it the same
- * way as C3 power states (local apic timer and TSC stop)
+ * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
+ * states (local apic timer and TSC stop).
  */
 static void amd_e400_idle(void)
 {
-	if (!amd_e400_c1e_detected) {
-		u32 lo, hi;
-
-		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
-
-		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
-			amd_e400_c1e_detected = true;
-			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
-				mark_tsc_unstable("TSC halt in AMD C1E");
-			pr_info("System has AMD C1E enabled\n");
-		}
+	/*
+	 * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
+	 * gets set after static_cpu_has() places have been converted via
+	 * alternatives.
+	 */
+	if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+		default_idle();
+		return;
 	}
 
-	if (amd_e400_c1e_detected) {
-		int cpu = smp_processor_id();
-
-		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
-			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
-			/* Force broadcast so ACPI can not interfere. */
-			tick_broadcast_force();
-			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
-		}
-		tick_broadcast_enter();
+	tick_broadcast_enter();
 
-		default_idle();
+	default_idle();
 
-		/*
-		 * The switch back from broadcast mode needs to be
-		 * called with interrupts disabled.
-		 */
-		local_irq_disable();
-		tick_broadcast_exit();
-		local_irq_enable();
-	} else
-		default_idle();
+	/*
+	 * The switch back from broadcast mode needs to be called with
+	 * interrupts disabled.
+	 */
+	local_irq_disable();
+	tick_broadcast_exit();
+	local_irq_enable();
 }
 
 /*
@@ -448,8 +389,7 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
 	if (x86_idle || boot_option_idle_override == IDLE_POLL)
 		return;
 
-	if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
-		/* E400: APIC timer interrupt does not wake up CPU from C1e */
+	if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
 		pr_info("using AMD E400 aware idle routine\n");
 		x86_idle = amd_e400_idle;
 	} else if (prefer_mwait_c1_over_halt(c)) {
@@ -459,11 +399,37 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
 		x86_idle = default_idle;
 }
 
-void __init init_amd_e400_c1e_mask(void)
+void amd_e400_c1e_apic_setup(void)
+{
+	if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+		pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
+		local_irq_disable();
+		tick_broadcast_force();
+		local_irq_enable();
+	}
+}
+
+void __init arch_post_acpi_subsys_init(void)
 {
-	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
-	if (x86_idle == amd_e400_idle)
-		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
+	u32 lo, hi;
+
+	if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
+		return;
+
+	/*
+	 * AMD E400 detection needs to happen after ACPI has been enabled. If
+	 * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in
+	 * MSR_K8_INT_PENDING_MSG.
+	 */
+	rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+	if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
+		return;
+
+	boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);
+
+	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+		mark_tsc_unstable("TSC halt in AMD C1E");
+	pr_info("System has AMD C1E enabled\n");
 }
 
 static int __init idle_setup(char *str)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bd7be8efdc4c..4c818f8bc135 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -12,6 +12,8 @@
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -49,11 +51,11 @@
 
 #include <asm/tlbflush.h>
 #include <asm/cpu.h>
-#include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
 #include <asm/switch_to.h>
 #include <asm/vm86.h>
+#include <asm/intel_rdt.h>
 
 void __show_regs(struct pt_regs *regs, int all)
 {
@@ -72,10 +74,9 @@ void __show_regs(struct pt_regs *regs, int all)
 		savesegment(gs, gs);
 	}
 
-	printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
-			(u16)regs->cs, regs->ip, regs->flags,
-			smp_processor_id());
-	print_symbol("EIP is at %s\n", regs->ip);
+	printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
+	printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
+		smp_processor_id());
 
 	printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
 		regs->ax, regs->bx, regs->cx, regs->dx);
@@ -232,11 +233,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
-	fpu_switch_t fpu_switch;
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
-	fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
+	switch_fpu_prepare(prev_fpu, cpu);
 
 	/*
 	 * Save away %gs. No need to save %fs, as it was saved on the
@@ -295,9 +295,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (prev->gs | next->gs)
 		lazy_load_gs(next->gs);
 
-	switch_fpu_finish(next_fpu, fpu_switch);
+	switch_fpu_finish(next_fpu, cpu);
 
 	this_cpu_write(current_task, next_p);
 
+	/* Load the Intel cache allocation PQR MSR. */
+	intel_rdt_sched_in();
+
 	return prev_p;
 }
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3760b3c1ca0..d6b784a5520d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -17,6 +17,8 @@
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <linux/fs.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -44,12 +46,12 @@
 #include <asm/desc.h>
 #include <asm/proto.h>
 #include <asm/ia32.h>
-#include <asm/idle.h>
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
 #include <asm/switch_to.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/vdso.h>
+#include <asm/intel_rdt.h>
 
 __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
 
@@ -61,10 +63,15 @@ void __show_regs(struct pt_regs *regs, int all)
 	unsigned int fsindex, gsindex;
 	unsigned int ds, cs, es;
 
-	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
-	printk_address(regs->ip);
-	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
-			regs->sp, regs->flags);
+	printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
+		(void *)regs->ip);
+	printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
+		regs->sp, regs->flags);
+	if (regs->orig_ax != -1)
+		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
+	else
+		pr_cont("\n");
+
 	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
 	       regs->ax, regs->bx, regs->cx);
 	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
@@ -265,9 +272,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
 	unsigned prev_fsindex, prev_gsindex;
-	fpu_switch_t fpu_switch;
 
-	fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
+	switch_fpu_prepare(prev_fpu, cpu);
 
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
@@ -417,7 +423,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		prev->gsbase = 0;
 	prev->gsindex = prev_gsindex;
 
-	switch_fpu_finish(next_fpu, fpu_switch);
+	switch_fpu_finish(next_fpu, cpu);
 
 	/*
 	 * Switch the PDA and FPU contexts.
@@ -473,6 +479,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 			loadsegment(ss, __KERNEL_DS);
 	}
 
+	/* Load the Intel cache allocation PQR MSR. */
+	intel_rdt_sched_in();
+
 	return prev_p;
 }
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0e63c0267f99..2364b23ea3e5 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -6,6 +6,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/errno.h>
@@ -24,7 +25,7 @@
 #include <linux/export.h>
 #include <linux/context_tracking.h>
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/fpu/internal.h>
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 5b2cc889ce34..5c3f6d6a5078 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -21,6 +21,8 @@
 #include <linux/sched.h>
 #include <linux/gfp.h>
 #include <linux/bootmem.h>
+#include <linux/nmi.h>
+
 #include <asm/fixmap.h>
 #include <asm/pvclock.h>
 
@@ -71,10 +73,10 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
 	return flags & valid_flags;
 }
 
-cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
 	unsigned version;
-	cycle_t ret;
+	u64 ret;
 	u64 last;
 	u8 flags;
 
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e244c19a2451..067f9813fd2c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -223,6 +223,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
 		},
 	},
+	{	/* Handle problems with rebooting on ASUS EeeBook X205TA */
+		.callback = set_acpi_reboot,
+		.ident = "ASUS EeeBook X205TA",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"),
+		},
+	},
+	{	/* Handle problems with rebooting on ASUS EeeBook X205TAW */
+		.callback = set_acpi_reboot,
+		.ident = "ASUS EeeBook X205TAW",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"),
+		},
+	},
 
 	/* Certec */
 	{       /* Handle problems with rebooting on Certec BPC600 */
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 79c6311cd912..5b21cb7d84d6 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -64,6 +64,15 @@ void mach_get_cmos_time(struct timespec *now)
 	unsigned int status, year, mon, day, hour, min, sec, century = 0;
 	unsigned long flags;
 
+	/*
+	 * If pm_trace abused the RTC as storage, set the timespec to 0,
+	 * which tells the caller that this RTC value is unusable.
+	 */
+	if (!pm_trace_rtc_valid()) {
+		now->tv_sec = now->tv_nsec = 0;
+		return;
+	}
+
 	spin_lock_irqsave(&rtc_lock, flags);
 
 	/*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9c337b0e8ba7..4bf0c8926a1c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -575,7 +575,9 @@ static void __init reserve_crashkernel(void)
 	/* 0 means: find the address automatically */
 	if (crash_base <= 0) {
 		/*
-		 *  kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
+		 * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
+		 * as old kexec-tools loads bzImage below that, unless
+		 * "crashkernel=size[KMG],high" is specified.
 		 */
 		crash_base = memblock_find_in_range(CRASH_ALIGN,
 						    high ? CRASH_ADDR_HIGH_MAX
@@ -985,6 +987,30 @@ void __init setup_arch(char **cmdline_p)
 
 	parse_early_param();
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/*
+	 * Memory used by the kernel cannot be hot-removed because Linux
+	 * cannot migrate the kernel pages. When memory hotplug is
+	 * enabled, we should prevent memblock from allocating memory
+	 * for the kernel.
+	 *
+	 * ACPI SRAT records all hotpluggable memory ranges. But before
+	 * SRAT is parsed, we don't know about it.
+	 *
+	 * The kernel image is loaded into memory at very early time. We
+	 * cannot prevent this anyway. So on NUMA system, we set any
+	 * node the kernel resides in as un-hotpluggable.
+	 *
+	 * Since on modern servers, one node could have double-digit
+	 * gigabytes memory, we can assume the memory around the kernel
+	 * image is also un-hotpluggable. So before SRAT is parsed, just
+	 * allocate memory near the kernel image to try the best to keep
+	 * the kernel away from hotpluggable memory.
+	 */
+	if (movable_node_is_enabled())
+		memblock_set_bottom_up(true);
+#endif
+
 	x86_report_nx();
 
 	/* after early param, so could get panic from serial */
@@ -1152,6 +1178,20 @@ void __init setup_arch(char **cmdline_p)
 	/* Allocate bigger log buffer */
 	setup_log_buf(1);
 
+	if (efi_enabled(EFI_BOOT)) {
+		switch (boot_params.secure_boot) {
+		case efi_secureboot_mode_disabled:
+			pr_info("Secure boot disabled\n");
+			break;
+		case efi_secureboot_mode_enabled:
+			pr_info("Secure boot enabled\n");
+			break;
+		default:
+			pr_info("Secure boot could not be determined\n");
+			break;
+		}
+	}
+
 	reserve_initrd();
 
 	acpi_table_upgrade();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2bbd27f89802..9820d6d977c6 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,7 +1,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/percpu.h>
@@ -12,6 +12,7 @@
 #include <linux/pfn.h>
 #include <asm/sections.h>
 #include <asm/processor.h>
+#include <asm/desc.h>
 #include <asm/setup.h>
 #include <asm/mpspec.h>
 #include <asm/apicdef.h>
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 763af1d0de64..cc30a74e4adb 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -10,6 +10,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/kernel.h>
@@ -845,7 +846,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 		       task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
 		       me->comm, me->pid, where, frame,
 		       regs->ip, regs->sp, regs->orig_ax);
-		print_vma_addr(" in ", regs->ip);
+		print_vma_addr(KERN_CONT " in ", regs->ip);
 		pr_cont("\n");
 	}
 
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index ec1f756f9dc9..71beb28600d4 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -151,8 +151,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
 
 				if (from->si_signo == SIGSEGV) {
 					if (from->si_code == SEGV_BNDERR) {
-						compat_uptr_t lower = (unsigned long)&to->si_lower;
-						compat_uptr_t upper = (unsigned long)&to->si_upper;
+						compat_uptr_t lower = (unsigned long)from->si_lower;
+						compat_uptr_t upper = (unsigned long)from->si_upper;
 						put_user_ex(lower, &to->si_lower);
 						put_user_ex(upper, &to->si_upper);
 					}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index c00cb64bc0a1..d3c66a15bbde 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -259,18 +259,16 @@ static inline void __smp_reschedule_interrupt(void)
 	scheduler_ipi();
 }
 
-__visible void smp_reschedule_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
 {
-	irq_enter();
 	ack_APIC_irq();
 	__smp_reschedule_interrupt();
-	irq_exit();
 	/*
 	 * KVM uses this interrupt to force a cpu out of guest mode
 	 */
 }
 
-__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs)
 {
 	/*
 	 * Need to call irq_enter() before calling the trace point.
@@ -294,14 +292,15 @@ static inline void __smp_call_function_interrupt(void)
 	inc_irq_stat(irq_call_count);
 }
 
-__visible void smp_call_function_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
 {
 	ipi_entering_ack_irq();
 	__smp_call_function_interrupt();
 	exiting_irq();
 }
 
-__visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_trace_call_function_interrupt(struct pt_regs *regs)
 {
 	ipi_entering_ack_irq();
 	trace_call_function_entry(CALL_FUNCTION_VECTOR);
@@ -316,14 +315,16 @@ static inline void __smp_call_function_single_interrupt(void)
 	inc_irq_stat(irq_call_count);
 }
 
-__visible void smp_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_call_function_single_interrupt(struct pt_regs *regs)
 {
 	ipi_entering_ack_irq();
 	__smp_call_function_single_interrupt();
 	exiting_irq();
 }
 
-__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_trace_call_function_single_interrupt(struct pt_regs *regs)
 {
 	ipi_entering_ack_irq();
 	trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 42f5eb7b4f6c..bd1f1ad35284 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -45,6 +45,9 @@
 #include <linux/smp.h>
 #include <linux/export.h>
 #include <linux/sched.h>
+#include <linux/sched/topology.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/percpu.h>
 #include <linux/bootmem.h>
 #include <linux/err.h>
@@ -58,7 +61,6 @@
 #include <asm/desc.h>
 #include <asm/nmi.h>
 #include <asm/irq.h>
-#include <asm/idle.h>
 #include <asm/realmode.h>
 #include <asm/cpu.h>
 #include <asm/numa.h>
@@ -104,11 +106,21 @@ static unsigned int max_physical_pkg_id __read_mostly;
 unsigned int __max_logical_packages __read_mostly;
 EXPORT_SYMBOL(__max_logical_packages);
 static unsigned int logical_packages __read_mostly;
-static bool logical_packages_frozen __read_mostly;
 
 /* Maximum number of SMT threads on any online core */
 int __max_smt_threads __read_mostly;
 
+/* Flag to indicate if a complete sched domain rebuild is required */
+bool x86_topology_update;
+
+int arch_update_cpu_topology(void)
+{
+	int retval = x86_topology_update;
+
+	x86_topology_update = false;
+	return retval;
+}
+
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
 	unsigned long flags;
@@ -263,9 +275,14 @@ static void notrace start_secondary(void *unused)
 	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 
-int topology_update_package_map(unsigned int apicid, unsigned int cpu)
+/**
+ * topology_update_package_map - Update the physical to logical package map
+ * @pkg:	The physical package id as retrieved via CPUID
+ * @cpu:	The cpu for which this is updated
+ */
+int topology_update_package_map(unsigned int pkg, unsigned int cpu)
 {
-	unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits;
+	unsigned int new;
 
 	/* Called from early boot ? */
 	if (!physical_package_map)
@@ -278,16 +295,17 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu)
 	if (test_and_set_bit(pkg, physical_package_map))
 		goto found;
 
-	if (logical_packages_frozen) {
-		physical_to_logical_pkg[pkg] = -1;
-		pr_warn("APIC(%x) Package %u exceeds logical package max\n",
-			apicid, pkg);
+	if (logical_packages >= __max_logical_packages) {
+		pr_warn("Package %u of CPU %u exceeds BIOS package data %u.\n",
+			logical_packages, cpu, __max_logical_packages);
 		return -ENOSPC;
 	}
 
 	new = logical_packages++;
-	pr_info("APIC(%x) Converting physical %u to logical package %u\n",
-		apicid, pkg, new);
+	if (new != pkg) {
+		pr_info("CPU %u Converting physical %u to logical package %u\n",
+			cpu, pkg, new);
+	}
 	physical_to_logical_pkg[pkg] = new;
 
 found:
@@ -308,9 +326,9 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg)
 }
 EXPORT_SYMBOL(topology_phys_to_logical_pkg);
 
-static void __init smp_init_package_map(void)
+static void __init smp_init_package_map(struct cpuinfo_x86 *c, unsigned int cpu)
 {
-	unsigned int ncpus, cpu;
+	unsigned int ncpus;
 	size_t size;
 
 	/*
@@ -355,27 +373,9 @@ static void __init smp_init_package_map(void)
 	size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long);
 	physical_package_map = kzalloc(size, GFP_KERNEL);
 
-	for_each_present_cpu(cpu) {
-		unsigned int apicid = apic->cpu_present_to_apicid(cpu);
-
-		if (apicid == BAD_APICID || !apic->apic_id_valid(apicid))
-			continue;
-		if (!topology_update_package_map(apicid, cpu))
-			continue;
-		pr_warn("CPU %u APICId %x disabled\n", cpu, apicid);
-		per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID;
-		set_cpu_possible(cpu, false);
-		set_cpu_present(cpu, false);
-	}
-
-	if (logical_packages > __max_logical_packages) {
-		pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n",
-			logical_packages, __max_logical_packages);
-		logical_packages_frozen = true;
-		__max_logical_packages  = logical_packages;
-	}
-
 	pr_info("Max logical packages: %u\n", __max_logical_packages);
+
+	topology_update_package_map(c->phys_proc_id, cpu);
 }
 
 void __init smp_store_boot_cpu_info(void)
@@ -385,7 +385,7 @@ void __init smp_store_boot_cpu_info(void)
 
 	*c = boot_cpu_data;
 	c->cpu_index = id;
-	smp_init_package_map();
+	smp_init_package_map(c, id);
 }
 
 /*
@@ -436,9 +436,15 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 		int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 
 		if (c->phys_proc_id == o->phys_proc_id &&
-		    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
-		    c->cpu_core_id == o->cpu_core_id)
-			return topology_sane(c, o, "smt");
+		    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
+			if (c->cpu_core_id == o->cpu_core_id)
+				return topology_sane(c, o, "smt");
+
+			if ((c->cu_id != 0xff) &&
+			    (o->cu_id != 0xff) &&
+			    (c->cu_id == o->cu_id))
+				return topology_sane(c, o, "smt");
+		}
 
 	} else if (c->phys_proc_id == o->phys_proc_id &&
 		   c->cpu_core_id == o->cpu_core_id) {
@@ -471,22 +477,42 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	return false;
 }
 
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+static inline int x86_sched_itmt_flags(void)
+{
+	return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
+}
+
+#ifdef CONFIG_SCHED_MC
+static int x86_core_flags(void)
+{
+	return cpu_core_flags() | x86_sched_itmt_flags();
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+static int x86_smt_flags(void)
+{
+	return cpu_smt_flags() | x86_sched_itmt_flags();
+}
+#endif
+#endif
+
 static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
 #endif
 #ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 #endif
 	{ NULL, },
 };
 
 static struct sched_domain_topology_level x86_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
 #endif
 #ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 #endif
 	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 	{ NULL, },
@@ -821,14 +847,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	return (send_status | accept_status);
 }
 
-void smp_announce(void)
-{
-	int num_nodes = num_online_nodes();
-
-	printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n",
-	       num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus());
-}
-
 /* reduce the number of lines printed when booting a large cpu count system */
 static void announce_cpu(int cpu, int apicid)
 {
@@ -964,9 +982,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 	int cpu0_nmi_registered = 0;
 	unsigned long timeout;
 
-	idle->thread.sp = (unsigned long) (((struct pt_regs *)
-			  (THREAD_SIZE +  task_stack_page(idle))) - 1);
-
+	idle->thread.sp = (unsigned long)task_pt_regs(idle);
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 	initial_code = (unsigned long)start_secondary;
 	initial_stack  = idle->thread.sp;
@@ -1111,7 +1127,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 		return err;
 
 	/* the FPU context is blank, nobody can own it */
-	__cpu_disable_lazy_restore(cpu);
+	per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
 
 	common_cpu_up(cpu, tidle);
 
@@ -1331,11 +1347,10 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	default_setup_apic_routing();
 	cpu0_logical_apicid = apic_bsp_setup(false);
 
-	pr_info("CPU%d: ", 0);
+	pr_info("CPU0: ");
 	print_cpu_info(&cpu_data(0));
 
-	if (is_uv_system())
-		uv_system_init();
+	uv_system_init();
 
 	set_mtrr_aps_delayed_init();
 
@@ -1456,15 +1471,15 @@ __init void prefill_possible_map(void)
 		possible = i;
 	}
 
+	nr_cpu_ids = possible;
+
 	pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
 		possible, max_t(int, possible - num_processors, 0));
 
+	reset_cpu_possible_mask();
+
 	for (i = 0; i < possible; i++)
 		set_cpu_possible(i, true);
-	for (; i < NR_CPUS; i++)
-		set_cpu_possible(i, false);
-
-	nr_cpu_ids = possible;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1575,7 +1590,6 @@ void play_dead_common(void)
 {
 	idle_task_exit();
 	reset_lazy_tlbstate();
-	amd_e400_remove_cpu(raw_smp_processor_id());
 
 	/* Ack it */
 	(void)cpu_report_death();
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 0653788026e2..8e2b79b88e51 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -4,6 +4,8 @@
  *  Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  */
 #include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
 #include <linux/stacktrace.h>
 #include <linux/export.h>
 #include <linux/uaccess.h>
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index a23ce84a3f6c..f07f83b3611b 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -2,6 +2,7 @@
  * x86 single-step support code, common to 32-bit and 64-bit.
  */
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/mm.h>
 #include <linux/ptrace.h>
 #include <asm/desc.h>
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index a55ed63b9f91..50215a4b9347 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -1,5 +1,6 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
 #include <linux/syscalls.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 8402907825b0..b868fa1b812b 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -408,7 +408,7 @@ static __init int tboot_late_init(void)
 	tboot_create_trampoline();
 
 	atomic_set(&ap_wfs_count, 0);
-	cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "AP_X86_TBOOT_DYING", NULL,
+	cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "x86/tboot:dying", NULL,
 			  tboot_dying_cpu);
 #ifdef CONFIG_DEBUG_FS
 	debugfs_create_file("tboot_log", S_IRUSR,
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
deleted file mode 100644
index 27538f183c3b..000000000000
--- a/arch/x86/kernel/test_nx.c
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * test_nx.c: functional test for NX functionality
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <linux/module.h>
-#include <linux/sort.h>
-#include <linux/slab.h>
-
-#include <asm/uaccess.h>
-#include <asm/asm.h>
-
-extern int rodata_test_data;
-
-/*
- * This file checks 4 things:
- * 1) Check if the stack is not executable
- * 2) Check if kmalloc memory is not executable
- * 3) Check if the .rodata section is not executable
- * 4) Check if the .data section of a module is not executable
- *
- * To do this, the test code tries to execute memory in stack/kmalloc/etc,
- * and then checks if the expected trap happens.
- *
- * Sadly, this implies having a dynamic exception handling table entry.
- * ... which can be done (and will make Rusty cry)... but it can only
- * be done in a stand-alone module with only 1 entry total.
- * (otherwise we'd have to sort and that's just too messy)
- */
-
-
-
-/*
- * We want to set up an exception handling point on our stack,
- * which means a variable value. This function is rather dirty
- * and walks the exception table of the module, looking for a magic
- * marker and replaces it with a specific function.
- */
-static void fudze_exception_table(void *marker, void *new)
-{
-	struct module *mod = THIS_MODULE;
-	struct exception_table_entry *extable;
-
-	/*
-	 * Note: This module has only 1 exception table entry,
-	 * so searching and sorting is not needed. If that changes,
-	 * this would be the place to search and re-sort the exception
-	 * table.
-	 */
-	if (mod->num_exentries > 1) {
-		printk(KERN_ERR "test_nx: too many exception table entries!\n");
-		printk(KERN_ERR "test_nx: test results are not reliable.\n");
-		return;
-	}
-	extable = (struct exception_table_entry *)mod->extable;
-	extable[0].insn = (unsigned long)new;
-}
-
-
-/*
- * exception tables get their symbols translated so we need
- * to use a fake function to put in there, which we can then
- * replace at runtime.
- */
-void foo_label(void);
-
-/*
- * returns 0 for not-executable, negative for executable
- *
- * Note: we cannot allow this function to be inlined, because
- * that would give us more than 1 exception table entry.
- * This in turn would break the assumptions above.
- */
-static noinline int test_address(void *address)
-{
-	unsigned long result;
-
-	/* Set up an exception table entry for our address */
-	fudze_exception_table(&foo_label, address);
-	result = 1;
-	asm volatile(
-		"foo_label:\n"
-		"0:	call *%[fake_code]\n"
-		"1:\n"
-		".section .fixup,\"ax\"\n"
-		"2:	mov %[zero], %[rslt]\n"
-		"	ret\n"
-		".previous\n"
-		_ASM_EXTABLE(0b,2b)
-		: [rslt] "=r" (result)
-		: [fake_code] "r" (address), [zero] "r" (0UL), "0" (result)
-	);
-	/* change the exception table back for the next round */
-	fudze_exception_table(address, &foo_label);
-
-	if (result)
-		return -ENODEV;
-	return 0;
-}
-
-static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */
-
-static int test_NX(void)
-{
-	int ret = 0;
-	/* 0xC3 is the opcode for "ret" */
-	char stackcode[] = {0xC3, 0x90, 0 };
-	char *heap;
-
-	test_data = 0xC3;
-
-	printk(KERN_INFO "Testing NX protection\n");
-
-	/* Test 1: check if the stack is not executable */
-	if (test_address(&stackcode)) {
-		printk(KERN_ERR "test_nx: stack was executable\n");
-		ret = -ENODEV;
-	}
-
-
-	/* Test 2: Check if the heap is executable */
-	heap = kmalloc(64, GFP_KERNEL);
-	if (!heap)
-		return -ENOMEM;
-	heap[0] = 0xC3; /* opcode for "ret" */
-
-	if (test_address(heap)) {
-		printk(KERN_ERR "test_nx: heap was executable\n");
-		ret = -ENODEV;
-	}
-	kfree(heap);
-
-	/*
-	 * The following 2 tests currently fail, this needs to get fixed
-	 * Until then, don't run them to avoid too many people getting scared
-	 * by the error message
-	 */
-
-	/* Test 3: Check if the .rodata section is executable */
-	if (rodata_test_data != 0xC3) {
-		printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
-		ret = -ENODEV;
-	} else if (test_address(&rodata_test_data)) {
-		printk(KERN_ERR "test_nx: .rodata section is executable\n");
-		ret = -ENODEV;
-	}
-
-#if 0
-	/* Test 4: Check if the .data section of a module is executable */
-	if (test_address(&test_data)) {
-		printk(KERN_ERR "test_nx: .data section is executable\n");
-		ret = -ENODEV;
-	}
-
-#endif
-	return ret;
-}
-
-static void test_exit(void)
-{
-}
-
-module_init(test_NX);
-module_exit(test_exit);
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the NX infrastructure");
-MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
deleted file mode 100644
index 222e84e2432e..000000000000
--- a/arch/x86/kernel/test_rodata.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * test_rodata.c: functional test for mark_rodata_ro function
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <asm/cacheflush.h>
-#include <asm/sections.h>
-#include <asm/asm.h>
-
-int rodata_test(void)
-{
-	unsigned long result;
-	unsigned long start, end;
-
-	/* test 1: read the value */
-	/* If this test fails, some previous testrun has clobbered the state */
-	if (!rodata_test_data) {
-		printk(KERN_ERR "rodata_test: test 1 fails (start data)\n");
-		return -ENODEV;
-	}
-
-	/* test 2: write to the variable; this should fault */
-	/*
-	 * If this test fails, we managed to overwrite the data
-	 *
-	 * This is written in assembly to be able to catch the
-	 * exception that is supposed to happen in the correct
-	 * case
-	 */
-
-	result = 1;
-	asm volatile(
-		"0:	mov %[zero],(%[rodata_test])\n"
-		"	mov %[zero], %[rslt]\n"
-		"1:\n"
-		".section .fixup,\"ax\"\n"
-		"2:	jmp 1b\n"
-		".previous\n"
-		_ASM_EXTABLE(0b,2b)
-		: [rslt] "=r" (result)
-		: [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
-	);
-
-
-	if (!result) {
-		printk(KERN_ERR "rodata_test: test data was not read only\n");
-		return -ENODEV;
-	}
-
-	/* test 3: check the value hasn't changed */
-	/* If this test fails, we managed to overwrite the data */
-	if (!rodata_test_data) {
-		printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n");
-		return -ENODEV;
-	}
-	/* test 4: check if the rodata section is 4Kb aligned */
-	start = (unsigned long)__start_rodata;
-	end = (unsigned long)__end_rodata;
-	if (start & (PAGE_SIZE - 1)) {
-		printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n");
-		return -ENODEV;
-	}
-	if (end & (PAGE_SIZE - 1)) {
-		printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n");
-		return -ENODEV;
-	}
-
-	return 0;
-}
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 9692a5e9fdab..6c8934406dc9 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -5,7 +5,7 @@
 #include <linux/regset.h>
 #include <linux/syscalls.h>
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/desc.h>
 #include <asm/ldt.h>
 #include <asm/processor.h>
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 1c113db9ed57..15515132bf0d 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -34,7 +34,7 @@ static void switch_idt(void *arg)
 	local_irq_restore(flags);
 }
 
-void trace_irq_vector_regfunc(void)
+int trace_irq_vector_regfunc(void)
 {
 	mutex_lock(&irq_vector_mutex);
 	if (!trace_irq_vector_refcount) {
@@ -44,6 +44,7 @@ void trace_irq_vector_regfunc(void)
 	}
 	trace_irq_vector_refcount++;
 	mutex_unlock(&irq_vector_mutex);
+	return 0;
 }
 
 void trace_irq_vector_unregfunc(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bd4e3d4d3625..4e496379a871 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -29,6 +29,7 @@
 #include <linux/errno.h>
 #include <linux/kexec.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/timer.h>
 #include <linux/init.h>
 #include <linux/bug.h>
@@ -254,7 +255,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 		pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
 			tsk->comm, tsk->pid, str,
 			regs->ip, regs->sp, error_code);
-		print_vma_addr(" in ", regs->ip);
+		print_vma_addr(KERN_CONT " in ", regs->ip);
 		pr_cont("\n");
 	}
 
@@ -518,7 +519,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 		pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
 			tsk->comm, task_pid_nr(tsk),
 			regs->ip, regs->sp, error_code);
-		print_vma_addr(" in ", regs->ip);
+		print_vma_addr(KERN_CONT " in ", regs->ip);
 		pr_cont("\n");
 	}
 
@@ -563,11 +564,9 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 	 * as we may switch to the interrupt stack.
 	 */
 	debug_stack_usage_inc();
-	preempt_disable();
 	cond_local_irq_enable(regs);
 	do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
 	cond_local_irq_disable(regs);
-	preempt_enable_no_resched();
 	debug_stack_usage_dec();
 exit:
 	ist_exit(regs);
@@ -742,14 +741,12 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	debug_stack_usage_inc();
 
 	/* It's safe to allow irq's after DR6 has been saved */
-	preempt_disable();
 	cond_local_irq_enable(regs);
 
 	if (v8086_mode(regs)) {
 		handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
 					X86_TRAP_DB);
 		cond_local_irq_disable(regs);
-		preempt_enable_no_resched();
 		debug_stack_usage_dec();
 		goto exit;
 	}
@@ -769,7 +766,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
 		send_sigtrap(tsk, regs, error_code, si_code);
 	cond_local_irq_disable(regs);
-	preempt_enable_no_resched();
 	debug_stack_usage_dec();
 
 exit:
@@ -853,6 +849,8 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
+	unsigned long cr0;
+
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 
 #ifdef CONFIG_MATH_EMULATION
@@ -866,10 +864,20 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 		return;
 	}
 #endif
-	fpu__restore(&current->thread.fpu); /* interrupts still off */
-#ifdef CONFIG_X86_32
-	cond_local_irq_enable(regs);
-#endif
+
+	/* This should not happen. */
+	cr0 = read_cr0();
+	if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
+		/* Try to fix it up and carry on. */
+		write_cr0(cr0 & ~X86_CR0_TS);
+	} else {
+		/*
+		 * Something terrible happened, and we're better off trying
+		 * to kill the task than getting stuck in a never-ending
+		 * loop of #NM faults.
+		 */
+		die("unexpected #NM exception", regs, error_code);
+	}
 }
 NOKPROBE_SYMBOL(do_device_not_available);
 
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 46b2f41f8b05..714dfba6a1e7 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -2,6 +2,7 @@
 
 #include <linux/kernel.h>
 #include <linux/sched.h>
+#include <linux/sched/clock.h>
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/timer.h>
@@ -326,9 +327,16 @@ unsigned long long sched_clock(void)
 {
 	return paravirt_sched_clock();
 }
+
+bool using_native_sched_clock(void)
+{
+	return pv_time_ops.sched_clock == native_sched_clock;
+}
 #else
 unsigned long long
 sched_clock(void) __attribute__((alias("native_sched_clock")));
+
+bool using_native_sched_clock(void) { return true; }
 #endif
 
 int check_tsc_unstable(void)
@@ -694,6 +702,7 @@ unsigned long native_calibrate_tsc(void)
 			crystal_khz = 24000;	/* 24.0 MHz */
 			break;
 		case INTEL_FAM6_SKYLAKE_X:
+		case INTEL_FAM6_ATOM_DENVERTON:
 			crystal_khz = 25000;	/* 25.0 MHz */
 			break;
 		case INTEL_FAM6_ATOM_GOLDMONT:
@@ -702,6 +711,20 @@ unsigned long native_calibrate_tsc(void)
 		}
 	}
 
+	/*
+	 * TSC frequency determined by CPUID is a "hardware reported"
+	 * frequency and is the most accurate one so far we have. This
+	 * is considered a known frequency.
+	 */
+	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+	/*
+	 * For Atom SoCs TSC is the only reliable clocksource.
+	 * Mark TSC reliable so no watchdog on it.
+	 */
+	if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
+		setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
 	return crystal_khz * ebx_numerator / eax_denominator;
 }
 
@@ -1043,18 +1066,20 @@ static void detect_art(void)
 	if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
 		return;
 
-	cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
-	      &art_to_tsc_numerator, unused, unused+1);
-
-	/* Don't enable ART in a VM, non-stop TSC required */
+	/* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */
 	if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
 	    !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
-	    art_to_tsc_denominator < ART_MIN_DENOMINATOR)
+	    !boot_cpu_has(X86_FEATURE_TSC_ADJUST))
 		return;
 
-	if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset))
+	cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
+	      &art_to_tsc_numerator, unused, unused+1);
+
+	if (art_to_tsc_denominator < ART_MIN_DENOMINATOR)
 		return;
 
+	rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset);
+
 	/* Make this sticky over multiple CPU init calls */
 	setup_force_cpu_cap(X86_FEATURE_ART);
 }
@@ -1064,6 +1089,11 @@ static void detect_art(void)
 
 static struct clocksource clocksource_tsc;
 
+static void tsc_resume(struct clocksource *cs)
+{
+	tsc_verify_tsc_adjust(true);
+}
+
 /*
  * We used to compare the TSC to the cycle_last value in the clocksource
  * structure to avoid a nasty time-warp. This can be observed in a
@@ -1080,9 +1110,21 @@ static struct clocksource clocksource_tsc;
  * checking the result of read_tsc() - cycle_last for being negative.
  * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
  */
-static cycle_t read_tsc(struct clocksource *cs)
+static u64 read_tsc(struct clocksource *cs)
 {
-	return (cycle_t)rdtsc_ordered();
+	return (u64)rdtsc_ordered();
+}
+
+static void tsc_cs_mark_unstable(struct clocksource *cs)
+{
+	if (tsc_unstable)
+		return;
+
+	tsc_unstable = 1;
+	if (using_native_sched_clock())
+		clear_sched_clock_stable();
+	disable_sched_clock_irqtime();
+	pr_info("Marking TSC unstable due to clocksource watchdog\n");
 }
 
 /*
@@ -1096,22 +1138,26 @@ static struct clocksource clocksource_tsc = {
 	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_MUST_VERIFY,
 	.archdata               = { .vclock_mode = VCLOCK_TSC },
+	.resume			= tsc_resume,
+	.mark_unstable		= tsc_cs_mark_unstable,
 };
 
 void mark_tsc_unstable(char *reason)
 {
-	if (!tsc_unstable) {
-		tsc_unstable = 1;
+	if (tsc_unstable)
+		return;
+
+	tsc_unstable = 1;
+	if (using_native_sched_clock())
 		clear_sched_clock_stable();
-		disable_sched_clock_irqtime();
-		pr_info("Marking TSC unstable due to %s\n", reason);
-		/* Change only the rating, when not registered */
-		if (clocksource_tsc.mult)
-			clocksource_mark_unstable(&clocksource_tsc);
-		else {
-			clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
-			clocksource_tsc.rating = 0;
-		}
+	disable_sched_clock_irqtime();
+	pr_info("Marking TSC unstable due to %s\n", reason);
+	/* Change only the rating, when not registered */
+	if (clocksource_tsc.mult) {
+		clocksource_mark_unstable(&clocksource_tsc);
+	} else {
+		clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
+		clocksource_tsc.rating = 0;
 	}
 }
 
@@ -1170,7 +1216,7 @@ int unsynchronized_tsc(void)
 /*
  * Convert ART to TSC given numerator/denominator found in detect_art()
  */
-struct system_counterval_t convert_art_to_tsc(cycle_t art)
+struct system_counterval_t convert_art_to_tsc(u64 art)
 {
 	u64 tmp, res, rem;
 
@@ -1283,10 +1329,12 @@ static int __init init_tsc_clocksource(void)
 		clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
 
 	/*
-	 * Trust the results of the earlier calibration on systems
-	 * exporting a reliable TSC.
+	 * When TSC frequency is known (retrieved via MSR or CPUID), we skip
+	 * the refined calibration and directly register it as a clocksource.
 	 */
-	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+	if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
+		if (boot_cpu_has(X86_FEATURE_ART))
+			art_related_clocksource = &clocksource_tsc;
 		clocksource_register_khz(&clocksource_tsc, tsc_khz);
 		return 0;
 	}
@@ -1333,6 +1381,9 @@ void __init tsc_init(void)
 		(unsigned long)cpu_khz / 1000,
 		(unsigned long)cpu_khz % 1000);
 
+	/* Sanitize TSC ADJUST before cyc2ns gets initialized */
+	tsc_store_and_check_tsc_adjust(true);
+
 	/*
 	 * Secondary CPUs do not run through tsc_init(), so set up
 	 * all the scale factors for all CPUs, assuming the same
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 0fe720d64fef..19afdbd7d0a7 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -100,5 +100,24 @@ unsigned long cpu_khz_from_msr(void)
 #ifdef CONFIG_X86_LOCAL_APIC
 	lapic_timer_frequency = (freq * 1000) / HZ;
 #endif
+
+	/*
+	 * TSC frequency determined by MSR is always considered "known"
+	 * because it is reported by HW.
+	 * Another fact is that on MSR capable platforms, PIT/HPET is
+	 * generally not available so calibration won't work at all.
+	 */
+	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+	/*
+	 * Unfortunately there is no way for hardware to tell whether the
+	 * TSC is reliable.  We were told by silicon design team that TSC
+	 * on Atom SoCs are always "reliable". TSC is also the only
+	 * reliable clocksource on these SoCs (HPET is either not present
+	 * or not functional) so mark TSC reliable which removes the
+	 * requirement for a watchdog clocksource.
+	 */
+	setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
 	return res;
 }
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 78083bf23ed1..728f75378475 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -14,18 +14,166 @@
  * ( The serial nature of the boot logic and the CPU hotplug lock
  *   protects against more than 2 CPUs entering this code. )
  */
+#include <linux/topology.h>
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 #include <linux/smp.h>
 #include <linux/nmi.h>
 #include <asm/tsc.h>
 
+struct tsc_adjust {
+	s64		bootval;
+	s64		adjusted;
+	unsigned long	nextcheck;
+	bool		warned;
+};
+
+static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+
+void tsc_verify_tsc_adjust(bool resume)
+{
+	struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
+	s64 curval;
+
+	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+		return;
+
+	/* Rate limit the MSR check */
+	if (!resume && time_before(jiffies, adj->nextcheck))
+		return;
+
+	adj->nextcheck = jiffies + HZ;
+
+	rdmsrl(MSR_IA32_TSC_ADJUST, curval);
+	if (adj->adjusted == curval)
+		return;
+
+	/* Restore the original value */
+	wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted);
+
+	if (!adj->warned || resume) {
+		pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
+			smp_processor_id(), adj->adjusted, curval);
+		adj->warned = true;
+	}
+}
+
+static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
+				   unsigned int cpu, bool bootcpu)
+{
+	/*
+	 * First online CPU in a package stores the boot value in the
+	 * adjustment value. This value might change later via the sync
+	 * mechanism. If that fails we still can yell about boot values not
+	 * being consistent.
+	 *
+	 * On the boot cpu we just force set the ADJUST value to 0 if it's
+	 * non zero. We don't do that on non boot cpus because physical
+	 * hotplug should have set the ADJUST register to a value > 0 so
+	 * the TSC is in sync with the already running cpus.
+	 *
+	 * But we always force positive ADJUST values. Otherwise the TSC
+	 * deadline timer creates an interrupt storm. We also have to
+	 * prevent values > 0x7FFFFFFF as those wreckage the timer as well.
+	 */
+	if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) ||
+	    (bootval > 0x7FFFFFFF)) {
+		pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu,
+			bootval);
+		wrmsrl(MSR_IA32_TSC_ADJUST, 0);
+		bootval = 0;
+	}
+	cur->adjusted = bootval;
+}
+
+#ifndef CONFIG_SMP
+bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+	struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+	s64 bootval;
+
+	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+		return false;
+
+	rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
+	cur->bootval = bootval;
+	cur->nextcheck = jiffies + HZ;
+	tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
+	return false;
+}
+
+#else /* !CONFIG_SMP */
+
+/*
+ * Store and check the TSC ADJUST MSR if available
+ */
+bool tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+	struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust);
+	unsigned int refcpu, cpu = smp_processor_id();
+	struct cpumask *mask;
+	s64 bootval;
+
+	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+		return false;
+
+	rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
+	cur->bootval = bootval;
+	cur->nextcheck = jiffies + HZ;
+	cur->warned = false;
+
+	/*
+	 * Check whether this CPU is the first in a package to come up. In
+	 * this case do not check the boot value against another package
+	 * because the new package might have been physically hotplugged,
+	 * where TSC_ADJUST is expected to be different. When called on the
+	 * boot CPU topology_core_cpumask() might not be available yet.
+	 */
+	mask = topology_core_cpumask(cpu);
+	refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;
+
+	if (refcpu >= nr_cpu_ids) {
+		tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(),
+				       bootcpu);
+		return false;
+	}
+
+	ref = per_cpu_ptr(&tsc_adjust, refcpu);
+	/*
+	 * Compare the boot value and complain if it differs in the
+	 * package.
+	 */
+	if (bootval != ref->bootval) {
+		pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n",
+			refcpu, ref->bootval, cpu, bootval);
+	}
+	/*
+	 * The TSC_ADJUST values in a package must be the same. If the boot
+	 * value on this newly upcoming CPU differs from the adjustment
+	 * value of the already online CPU in this package, set it to that
+	 * adjusted value.
+	 */
+	if (bootval != ref->adjusted) {
+		pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n",
+			refcpu, ref->adjusted, cpu, bootval);
+		cur->adjusted = ref->adjusted;
+		wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
+	}
+	/*
+	 * We have the TSCs forced to be in sync on this package. Skip sync
+	 * test:
+	 */
+	return true;
+}
+
 /*
  * Entry/exit counters that make sure that both CPUs
  * run the measurement code at once:
  */
 static atomic_t start_count;
 static atomic_t stop_count;
+static atomic_t skip_test;
+static atomic_t test_runs;
 
 /*
  * We use a raw spinlock in this exceptional case, because
@@ -37,15 +185,16 @@ static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
 static cycles_t last_tsc;
 static cycles_t max_warp;
 static int nr_warps;
+static int random_warps;
 
 /*
  * TSC-warp measurement loop running on both CPUs.  This is not called
  * if there is no TSC.
  */
-static void check_tsc_warp(unsigned int timeout)
+static cycles_t check_tsc_warp(unsigned int timeout)
 {
-	cycles_t start, now, prev, end;
-	int i;
+	cycles_t start, now, prev, end, cur_max_warp = 0;
+	int i, cur_warps = 0;
 
 	start = rdtsc_ordered();
 	/*
@@ -85,13 +234,22 @@ static void check_tsc_warp(unsigned int timeout)
 		if (unlikely(prev > now)) {
 			arch_spin_lock(&sync_lock);
 			max_warp = max(max_warp, prev - now);
+			cur_max_warp = max_warp;
+			/*
+			 * Check whether this bounces back and forth. Only
+			 * one CPU should observe time going backwards.
+			 */
+			if (cur_warps != nr_warps)
+				random_warps++;
 			nr_warps++;
+			cur_warps = nr_warps;
 			arch_spin_unlock(&sync_lock);
 		}
 	}
 	WARN(!(now-start),
 		"Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
 			now-start, end-start);
+	return cur_max_warp;
 }
 
 /*
@@ -128,23 +286,27 @@ void check_tsc_sync_source(int cpu)
 	if (unsynchronized_tsc())
 		return;
 
-	if (tsc_clocksource_reliable) {
-		if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
-			pr_info(
-			"Skipped synchronization checks as TSC is reliable.\n");
-		return;
-	}
-
 	/*
-	 * Reset it - in case this is a second bootup:
+	 * Set the maximum number of test runs to
+	 *  1 if the CPU does not provide the TSC_ADJUST MSR
+	 *  3 if the MSR is available, so the target can try to adjust
 	 */
-	atomic_set(&stop_count, 0);
-
+	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+		atomic_set(&test_runs, 1);
+	else
+		atomic_set(&test_runs, 3);
+retry:
 	/*
-	 * Wait for the target to arrive:
+	 * Wait for the target to start or to skip the test:
 	 */
-	while (atomic_read(&start_count) != cpus-1)
+	while (atomic_read(&start_count) != cpus - 1) {
+		if (atomic_read(&skip_test) > 0) {
+			atomic_set(&skip_test, 0);
+			return;
+		}
 		cpu_relax();
+	}
+
 	/*
 	 * Trigger the target to continue into the measurement too:
 	 */
@@ -155,21 +317,35 @@ void check_tsc_sync_source(int cpu)
 	while (atomic_read(&stop_count) != cpus-1)
 		cpu_relax();
 
-	if (nr_warps) {
+	/*
+	 * If the test was successful set the number of runs to zero and
+	 * stop. If not, decrement the number of runs an check if we can
+	 * retry. In case of random warps no retry is attempted.
+	 */
+	if (!nr_warps) {
+		atomic_set(&test_runs, 0);
+
+		pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
+			smp_processor_id(), cpu);
+
+	} else if (atomic_dec_and_test(&test_runs) || random_warps) {
+		/* Force it to 0 if random warps brought us here */
+		atomic_set(&test_runs, 0);
+
 		pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
 			smp_processor_id(), cpu);
 		pr_warning("Measured %Ld cycles TSC warp between CPUs, "
 			   "turning off TSC clock.\n", max_warp);
+		if (random_warps)
+			pr_warning("TSC warped randomly between CPUs\n");
 		mark_tsc_unstable("check_tsc_sync_source failed");
-	} else {
-		pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
-			smp_processor_id(), cpu);
 	}
 
 	/*
 	 * Reset it - just in case we boot another CPU later:
 	 */
 	atomic_set(&start_count, 0);
+	random_warps = 0;
 	nr_warps = 0;
 	max_warp = 0;
 	last_tsc = 0;
@@ -178,6 +354,12 @@ void check_tsc_sync_source(int cpu)
 	 * Let the target continue with the bootup:
 	 */
 	atomic_inc(&stop_count);
+
+	/*
+	 * Retry, if there is a chance to do so.
+	 */
+	if (atomic_read(&test_runs) > 0)
+		goto retry;
 }
 
 /*
@@ -185,12 +367,30 @@ void check_tsc_sync_source(int cpu)
  */
 void check_tsc_sync_target(void)
 {
+	struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+	unsigned int cpu = smp_processor_id();
+	cycles_t cur_max_warp, gbl_max_warp;
 	int cpus = 2;
 
 	/* Also aborts if there is no TSC. */
-	if (unsynchronized_tsc() || tsc_clocksource_reliable)
+	if (unsynchronized_tsc())
+		return;
+
+	/*
+	 * Store, verify and sanitize the TSC adjust register. If
+	 * successful skip the test.
+	 *
+	 * The test is also skipped when the TSC is marked reliable. This
+	 * is true for SoCs which have no fallback clocksource. On these
+	 * SoCs the TSC is frequency synchronized, but still the TSC ADJUST
+	 * register might have been wreckaged by the BIOS..
+	 */
+	if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) {
+		atomic_inc(&skip_test);
 		return;
+	}
 
+retry:
 	/*
 	 * Register this CPU's participation and wait for the
 	 * source CPU to start the measurement:
@@ -199,7 +399,12 @@ void check_tsc_sync_target(void)
 	while (atomic_read(&start_count) != cpus)
 		cpu_relax();
 
-	check_tsc_warp(loop_timeout(smp_processor_id()));
+	cur_max_warp = check_tsc_warp(loop_timeout(cpu));
+
+	/*
+	 * Store the maximum observed warp value for a potential retry:
+	 */
+	gbl_max_warp = max_warp;
 
 	/*
 	 * Ok, we are done:
@@ -211,4 +416,61 @@ void check_tsc_sync_target(void)
 	 */
 	while (atomic_read(&stop_count) != cpus)
 		cpu_relax();
+
+	/*
+	 * Reset it for the next sync test:
+	 */
+	atomic_set(&stop_count, 0);
+
+	/*
+	 * Check the number of remaining test runs. If not zero, the test
+	 * failed and a retry with adjusted TSC is possible. If zero the
+	 * test was either successful or failed terminally.
+	 */
+	if (!atomic_read(&test_runs))
+		return;
+
+	/*
+	 * If the warp value of this CPU is 0, then the other CPU
+	 * observed time going backwards so this TSC was ahead and
+	 * needs to move backwards.
+	 */
+	if (!cur_max_warp)
+		cur_max_warp = -gbl_max_warp;
+
+	/*
+	 * Add the result to the previous adjustment value.
+	 *
+	 * The adjustement value is slightly off by the overhead of the
+	 * sync mechanism (observed values are ~200 TSC cycles), but this
+	 * really depends on CPU, node distance and frequency. So
+	 * compensating for this is hard to get right. Experiments show
+	 * that the warp is not longer detectable when the observed warp
+	 * value is used. In the worst case the adjustment needs to go
+	 * through a 3rd run for fine tuning.
+	 */
+	cur->adjusted += cur_max_warp;
+
+	/*
+	 * TSC deadline timer stops working or creates an interrupt storm
+	 * with adjust values < 0 and > x07ffffff.
+	 *
+	 * To allow adjust values > 0x7FFFFFFF we need to disable the
+	 * deadline timer and use the local APIC timer, but that requires
+	 * more intrusive changes and we do not have any useful information
+	 * from Intel about the underlying HW wreckage yet.
+	 */
+	if (cur->adjusted < 0)
+		cur->adjusted = 0;
+	if (cur->adjusted > 0x7FFFFFFF)
+		cur->adjusted = 0x7FFFFFFF;
+
+	pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
+		cpu, cur_max_warp, cur->adjusted);
+
+	wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
+	goto retry;
+
 }
+
+#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index a2456d4d286a..08339262b666 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -1,4 +1,6 @@
 #include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
 #include <asm/ptrace.h>
 #include <asm/bitops.h>
 #include <asm/stacktrace.h>
@@ -6,6 +8,52 @@
 
 #define FRAME_HEADER_SIZE (sizeof(long) * 2)
 
+/*
+ * This disables KASAN checking when reading a value from another task's stack,
+ * since the other task could be running on another CPU and could have poisoned
+ * the stack in the meantime.
+ */
+#define READ_ONCE_TASK_STACK(task, x)			\
+({							\
+	unsigned long val;				\
+	if (task == current)				\
+		val = READ_ONCE(x);			\
+	else						\
+		val = READ_ONCE_NOCHECK(x);		\
+	val;						\
+})
+
+static void unwind_dump(struct unwind_state *state, unsigned long *sp)
+{
+	static bool dumped_before = false;
+	bool prev_zero, zero = false;
+	unsigned long word;
+
+	if (dumped_before)
+		return;
+
+	dumped_before = true;
+
+	printk_deferred("unwind stack type:%d next_sp:%p mask:%lx graph_idx:%d\n",
+			state->stack_info.type, state->stack_info.next_sp,
+			state->stack_mask, state->graph_idx);
+
+	for (sp = state->orig_sp; sp < state->stack_info.end; sp++) {
+		word = READ_ONCE_NOCHECK(*sp);
+
+		prev_zero = zero;
+		zero = word == 0;
+
+		if (zero) {
+			if (!prev_zero)
+				printk_deferred("%p: %016x ...\n", sp, 0);
+			continue;
+		}
+
+		printk_deferred("%p: %016lx (%pB)\n", sp, word, (void *)word);
+	}
+}
+
 unsigned long unwind_get_return_address(struct unwind_state *state)
 {
 	unsigned long addr;
@@ -14,17 +62,84 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
 	if (unwind_done(state))
 		return 0;
 
-	addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p,
+	if (state->regs && user_mode(state->regs))
+		return 0;
+
+	addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
+	addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr,
 				     addr_p);
 
 	return __kernel_text_address(addr) ? addr : 0;
 }
 EXPORT_SYMBOL_GPL(unwind_get_return_address);
 
+static size_t regs_size(struct pt_regs *regs)
+{
+	/* x86_32 regs from kernel mode are two words shorter: */
+	if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs))
+		return sizeof(*regs) - 2*sizeof(long);
+
+	return sizeof(*regs);
+}
+
+#ifdef CONFIG_X86_32
+#define GCC_REALIGN_WORDS 3
+#else
+#define GCC_REALIGN_WORDS 1
+#endif
+
+static bool is_last_task_frame(struct unwind_state *state)
+{
+	unsigned long *last_bp = (unsigned long *)task_pt_regs(state->task) - 2;
+	unsigned long *aligned_bp = last_bp - GCC_REALIGN_WORDS;
+
+	/*
+	 * We have to check for the last task frame at two different locations
+	 * because gcc can occasionally decide to realign the stack pointer and
+	 * change the offset of the stack frame in the prologue of a function
+	 * called by head/entry code.  Examples:
+	 *
+	 * <start_secondary>:
+	 *      push   %edi
+	 *      lea    0x8(%esp),%edi
+	 *      and    $0xfffffff8,%esp
+	 *      pushl  -0x4(%edi)
+	 *      push   %ebp
+	 *      mov    %esp,%ebp
+	 *
+	 * <x86_64_start_kernel>:
+	 *      lea    0x8(%rsp),%r10
+	 *      and    $0xfffffffffffffff0,%rsp
+	 *      pushq  -0x8(%r10)
+	 *      push   %rbp
+	 *      mov    %rsp,%rbp
+	 *
+	 * Note that after aligning the stack, it pushes a duplicate copy of
+	 * the return address before pushing the frame pointer.
+	 */
+	return (state->bp == last_bp ||
+		(state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1)));
+}
+
+/*
+ * This determines if the frame pointer actually contains an encoded pointer to
+ * pt_regs on the stack.  See ENCODE_FRAME_POINTER.
+ */
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+	unsigned long regs = (unsigned long)bp;
+
+	if (!(regs & 0x1))
+		return NULL;
+
+	return (struct pt_regs *)(regs & ~0x1);
+}
+
 static bool update_stack_state(struct unwind_state *state, void *addr,
 			       size_t len)
 {
 	struct stack_info *info = &state->stack_info;
+	enum stack_type orig_type = info->type;
 
 	/*
 	 * If addr isn't on the current stack, switch to the next one.
@@ -38,31 +153,137 @@ static bool update_stack_state(struct unwind_state *state, void *addr,
 				   &state->stack_mask))
 			return false;
 
+	if (!state->orig_sp || info->type != orig_type)
+		state->orig_sp = addr;
+
 	return true;
 }
 
 bool unwind_next_frame(struct unwind_state *state)
 {
-	unsigned long *next_bp;
+	struct pt_regs *regs;
+	unsigned long *next_bp, *next_frame;
+	size_t next_len;
+	enum stack_type prev_type = state->stack_info.type;
 
 	if (unwind_done(state))
 		return false;
 
-	next_bp = (unsigned long *)*state->bp;
+	/* have we reached the end? */
+	if (state->regs && user_mode(state->regs))
+		goto the_end;
+
+	if (is_last_task_frame(state)) {
+		regs = task_pt_regs(state->task);
+
+		/*
+		 * kthreads (other than the boot CPU's idle thread) have some
+		 * partial regs at the end of their stack which were placed
+		 * there by copy_thread_tls().  But the regs don't have any
+		 * useful information, so we can skip them.
+		 *
+		 * This user_mode() check is slightly broader than a PF_KTHREAD
+		 * check because it also catches the awkward situation where a
+		 * newly forked kthread transitions into a user task by calling
+		 * do_execve(), which eventually clears PF_KTHREAD.
+		 */
+		if (!user_mode(regs))
+			goto the_end;
+
+		/*
+		 * We're almost at the end, but not quite: there's still the
+		 * syscall regs frame.  Entry code doesn't encode the regs
+		 * pointer for syscalls, so we have to set it manually.
+		 */
+		state->regs = regs;
+		state->bp = NULL;
+		return true;
+	}
+
+	/* get the next frame pointer */
+	if (state->regs)
+		next_bp = (unsigned long *)state->regs->bp;
+	else
+		next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task,*state->bp);
+
+	/* is the next frame pointer an encoded pointer to pt_regs? */
+	regs = decode_frame_pointer(next_bp);
+	if (regs) {
+		next_frame = (unsigned long *)regs;
+		next_len = sizeof(*regs);
+	} else {
+		next_frame = next_bp;
+		next_len = FRAME_HEADER_SIZE;
+	}
 
 	/* make sure the next frame's data is accessible */
-	if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE))
-		return false;
+	if (!update_stack_state(state, next_frame, next_len)) {
+		/*
+		 * Don't warn on bad regs->bp.  An interrupt in entry code
+		 * might cause a false positive warning.
+		 */
+		if (state->regs)
+			goto the_end;
+
+		goto bad_address;
+	}
+
+	/* Make sure it only unwinds up and doesn't overlap the last frame: */
+	if (state->stack_info.type == prev_type) {
+		if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs))
+			goto bad_address;
+
+		if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE)
+			goto bad_address;
+	}
 
 	/* move to the next frame */
-	state->bp = next_bp;
+	if (regs) {
+		state->regs = regs;
+		state->bp = NULL;
+	} else {
+		state->bp = next_bp;
+		state->regs = NULL;
+	}
+
 	return true;
+
+bad_address:
+	/*
+	 * When unwinding a non-current task, the task might actually be
+	 * running on another CPU, in which case it could be modifying its
+	 * stack while we're reading it.  This is generally not a problem and
+	 * can be ignored as long as the caller understands that unwinding
+	 * another task will not always succeed.
+	 */
+	if (state->task != current)
+		goto the_end;
+
+	if (state->regs) {
+		printk_deferred_once(KERN_WARNING
+			"WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
+			state->regs, state->task->comm,
+			state->task->pid, next_frame);
+		unwind_dump(state, (unsigned long *)state->regs);
+	} else {
+		printk_deferred_once(KERN_WARNING
+			"WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n",
+			state->bp, state->task->comm,
+			state->task->pid, next_frame);
+		unwind_dump(state, state->bp);
+	}
+the_end:
+	state->stack_info.type = STACK_TYPE_UNKNOWN;
+	return false;
 }
 EXPORT_SYMBOL_GPL(unwind_next_frame);
 
 void __unwind_start(struct unwind_state *state, struct task_struct *task,
 		    struct pt_regs *regs, unsigned long *first_frame)
 {
+	unsigned long *bp, *frame;
+	size_t len;
+
 	memset(state, 0, sizeof(*state));
 	state->task = task;
 
@@ -73,12 +294,22 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
 	}
 
 	/* set up the starting stack frame */
-	state->bp = get_frame_pointer(task, regs);
+	bp = get_frame_pointer(task, regs);
+	regs = decode_frame_pointer(bp);
+	if (regs) {
+		state->regs = regs;
+		frame = (unsigned long *)regs;
+		len = sizeof(*regs);
+	} else {
+		state->bp = bp;
+		frame = bp;
+		len = FRAME_HEADER_SIZE;
+	}
 
 	/* initialize stack info and make sure the frame data is accessible */
-	get_stack_info(state->bp, state->task, &state->stack_info,
+	get_stack_info(frame, state->task, &state->stack_info,
 		       &state->stack_mask);
-	update_stack_state(state, state->bp, FRAME_HEADER_SIZE);
+	update_stack_state(state, frame, len);
 
 	/*
 	 * The caller can provide the address of the first frame directly
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 01f30e56f99e..23ee89ce59a9 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -35,6 +35,7 @@
 #include <linux/interrupt.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/kernel.h>
 #include <linux/signal.h>
 #include <linux/string.h>
@@ -47,7 +48,7 @@
 #include <linux/slab.h>
 #include <linux/security.h>
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/io.h>
 #include <asm/tlbflush.h>
 #include <asm/irq.h>
@@ -160,11 +161,12 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 
 static void mark_screen_rdonly(struct mm_struct *mm)
 {
+	struct vm_area_struct *vma;
+	spinlock_t *ptl;
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
-	spinlock_t *ptl;
 	int i;
 
 	down_write(&mm->mmap_sem);
@@ -177,7 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
 	pmd = pmd_offset(pud, 0xA0000);
 
 	if (pmd_trans_huge(*pmd)) {
-		struct vm_area_struct *vma = find_vma(mm, 0xA0000);
+		vma = find_vma(mm, 0xA0000);
 		split_huge_pmd(vma, pmd, 0xA0000);
 	}
 	if (pmd_none_or_clear_bad(pmd))
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index dbf67f64d5ec..c74ae9ce8dc4 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -91,10 +91,10 @@ SECTIONS
 	/* Text and read-only data */
 	.text :  AT(ADDR(.text) - LOAD_OFFSET) {
 		_text = .;
+		_stext = .;
 		/* bootstrapping code */
 		HEAD_TEXT
 		. = ALIGN(8);
-		_stext = .;
 		TEXT_TEXT
 		SCHED_TEXT
 		CPUIDLE_TEXT
@@ -345,7 +345,6 @@ SECTIONS
 	DISCARDS
 	/DISCARD/ : {
 		*(.eh_frame)
-		*(__func_stack_frame_non_standard)
 	}
 }
 
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 0bd9f1287f39..11a93f005268 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -89,7 +89,6 @@ struct x86_cpuinit_ops x86_cpuinit = {
 };
 
 static void default_nmi_init(void) { };
-static int default_i8042_detect(void) { return 1; };
 
 struct x86_platform_ops x86_platform __ro_after_init = {
 	.calibrate_cpu			= native_calibrate_cpu,
@@ -100,7 +99,6 @@ struct x86_platform_ops x86_platform __ro_after_init = {
 	.is_untracked_pat_range		= is_ISA_range,
 	.nmi_init			= default_nmi_init,
 	.get_nmi_reason			= default_get_nmi_reason,
-	.i8042_detect			= default_i8042_detect,
 	.save_sched_clock_state 	= tsc_save_sched_clock_state,
 	.restore_sched_clock_state 	= tsc_restore_sched_clock_state,
 };