summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile5
-rw-r--r--arch/x86/kernel/acpi/apei.c3
-rw-r--r--arch/x86/kernel/acpi/boot.c34
-rw-r--r--arch/x86/kernel/acpi/cstate.c9
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S9
-rw-r--r--arch/x86/kernel/alternative.c15
-rw-r--r--arch/x86/kernel/amd_gart_64.c3
-rw-r--r--arch/x86/kernel/amd_nb.c164
-rw-r--r--arch/x86/kernel/apb_timer.c6
-rw-r--r--arch/x86/kernel/apic/apic.c109
-rw-r--r--arch/x86/kernel/apic/io_apic.c9
-rw-r--r--arch/x86/kernel/apic/msi.c2
-rw-r--r--arch/x86/kernel/apic/vector.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c549
-rw-r--r--arch/x86/kernel/apm_32.c13
-rw-r--r--arch/x86/kernel/asm-offsets.c1
-rw-r--r--arch/x86/kernel/asm-offsets_64.c9
-rw-r--r--arch/x86/kernel/cpu/Makefile6
-rw-r--r--arch/x86/kernel/cpu/amd.c71
-rw-r--r--arch/x86/kernel/cpu/bugs.c26
-rw-r--r--arch/x86/kernel/cpu/bugs_64.c33
-rw-r--r--arch/x86/kernel/cpu/centaur.c5
-rw-r--r--arch/x86/kernel/cpu/common.c140
-rw-r--r--arch/x86/kernel/cpu/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c58
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c23
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c403
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c1115
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_schemata.c245
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c5
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-genpool.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c5
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c284
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c373
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c41
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c93
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c5
-rw-r--r--arch/x86/kernel/cpu/microcode/Makefile2
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c613
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c190
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c877
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_lib.c184
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c73
-rw-r--r--arch/x86/kernel/cpu/scattered.c60
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpu/vmware.c85
-rw-r--r--arch/x86/kernel/cpuid.c73
-rw-r--r--arch/x86/kernel/crash.c37
-rw-r--r--arch/x86/kernel/crash_dump_32.c2
-rw-r--r--arch/x86/kernel/doublefault.c3
-rw-r--r--arch/x86/kernel/dumpstack.c70
-rw-r--r--arch/x86/kernel/dumpstack_32.c57
-rw-r--r--arch/x86/kernel/dumpstack_64.c80
-rw-r--r--arch/x86/kernel/e820.c22
-rw-r--r--arch/x86/kernel/fpu/bugs.c7
-rw-r--r--arch/x86/kernel/fpu/core.c81
-rw-r--r--arch/x86/kernel/fpu/init.c137
-rw-r--r--arch/x86/kernel/fpu/regset.c1
-rw-r--r--arch/x86/kernel/fpu/signal.c8
-rw-r--r--arch/x86/kernel/fpu/xstate.c20
-rw-r--r--arch/x86/kernel/ftrace.c8
-rw-r--r--arch/x86/kernel/head32.c62
-rw-r--r--arch/x86/kernel/head64.c1
-rw-r--r--arch/x86/kernel/head_32.S168
-rw-r--r--arch/x86/kernel/head_64.S61
-rw-r--r--arch/x86/kernel/hpet.c21
-rw-r--r--arch/x86/kernel/ioport.c12
-rw-r--r--arch/x86/kernel/irq.c5
-rw-r--r--arch/x86/kernel/irq_64.c2
-rw-r--r--arch/x86/kernel/irq_work.c5
-rw-r--r--arch/x86/kernel/itmt.c213
-rw-r--r--arch/x86/kernel/jump_label.c3
-rw-r--r--arch/x86/kernel/kdebugfs.c2
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c48
-rw-r--r--arch/x86/kernel/kprobes/common.h2
-rw-r--r--arch/x86/kernel/kprobes/core.c11
-rw-r--r--arch/x86/kernel/kprobes/opt.c4
-rw-r--r--arch/x86/kernel/kvm.c56
-rw-r--r--arch/x86/kernel/kvmclock.c18
-rw-r--r--arch/x86/kernel/ldt.c14
-rw-r--r--arch/x86/kernel/machine_kexec_64.c15
-rw-r--r--arch/x86/kernel/msr.c68
-rw-r--r--arch/x86/kernel/nmi.c8
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c17
-rw-r--r--arch/x86/kernel/paravirt.c2
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c14
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c14
-rw-r--r--arch/x86/kernel/pci-calgary_64.c8
-rw-r--r--arch/x86/kernel/pci-dma.c7
-rw-r--r--arch/x86/kernel/pci-nommu.c2
-rw-r--r--arch/x86/kernel/pci-swiotlb.c8
-rw-r--r--arch/x86/kernel/perf_regs.c1
-rw-r--r--arch/x86/kernel/platform-quirks.c5
-rw-r--r--arch/x86/kernel/process.c164
-rw-r--r--arch/x86/kernel/process_32.c19
-rw-r--r--arch/x86/kernel/process_64.c25
-rw-r--r--arch/x86/kernel/ptrace.c3
-rw-r--r--arch/x86/kernel/pvclock.c6
-rw-r--r--arch/x86/kernel/reboot.c16
-rw-r--r--arch/x86/kernel/rtc.c9
-rw-r--r--arch/x86/kernel/setup.c42
-rw-r--r--arch/x86/kernel/setup_percpu.c3
-rw-r--r--arch/x86/kernel/signal.c3
-rw-r--r--arch/x86/kernel/signal_compat.c4
-rw-r--r--arch/x86/kernel/smp.c17
-rw-r--r--arch/x86/kernel/smpboot.c134
-rw-r--r--arch/x86/kernel/stacktrace.c2
-rw-r--r--arch/x86/kernel/step.c1
-rw-r--r--arch/x86/kernel/sys_x86_64.c1
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/kernel/test_nx.c173
-rw-r--r--arch/x86/kernel/test_rodata.c75
-rw-r--r--arch/x86/kernel/tls.c2
-rw-r--r--arch/x86/kernel/tracepoint.c3
-rw-r--r--arch/x86/kernel/traps.c30
-rw-r--r--arch/x86/kernel/tsc.c97
-rw-r--r--arch/x86/kernel/tsc_msr.c19
-rw-r--r--arch/x86/kernel/tsc_sync.c304
-rw-r--r--arch/x86/kernel/unwind_frame.c249
-rw-r--r--arch/x86/kernel/vm86_32.c8
-rw-r--r--arch/x86/kernel/vmlinux.lds.S3
-rw-r--r--arch/x86/kernel/x86_init.c2
125 files changed, 5516 insertions, 3358 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 79076d75bdbf..84c00592d359 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -75,7 +75,7 @@ apm-y := apm_32.o
obj-$(CONFIG_APM) += apm.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_SMP) += smpboot.o
-obj-$(CONFIG_SMP) += tsc_sync.o
+obj-$(CONFIG_X86_TSC) += tsc_sync.o
obj-$(CONFIG_SMP) += setup_percpu.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-y += apic/
@@ -100,8 +100,6 @@ obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_APB_TIMER) += apb_timer.o
obj-$(CONFIG_AMD_NB) += amd_nb.o
-obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
-obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
@@ -123,6 +121,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
+obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
ifdef CONFIG_FRAME_POINTER
obj-y += unwind_frame.o
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
index c280df6b2aa2..ea3046e0b0cf 100644
--- a/arch/x86/kernel/acpi/apei.c
+++ b/arch/x86/kernel/acpi/apei.c
@@ -24,9 +24,6 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
struct acpi_hest_ia_corrected *cmc;
struct acpi_hest_ia_error_bank *mc_bank;
- if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
- return 0;
-
cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
if (!cmc->enabled)
return 0;
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 931ced8ca345..b2879cc23db4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -35,6 +35,7 @@
#include <linux/bootmem.h>
#include <linux/ioport.h>
#include <linux/pci.h>
+#include <linux/efi-bgrt.h>
#include <asm/irqdomain.h>
#include <asm/pci_x86.h>
@@ -76,6 +77,7 @@ int acpi_fix_pin2_polarity __initdata;
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#endif
+#ifdef CONFIG_X86_IO_APIC
/*
* Locks related to IOAPIC hotplug
* Hotplug side:
@@ -88,6 +90,7 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
* ->ioapic_lock
*/
static DEFINE_MUTEX(acpi_ioapic_lock);
+#endif
/* --------------------------------------------------------------------------
Boot-time Configuration
@@ -176,10 +179,15 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
return -EINVAL;
}
+ if (!enabled) {
+ ++disabled_cpus;
+ return -EINVAL;
+ }
+
if (boot_cpu_physical_apicid != -1U)
ver = boot_cpu_apic_version;
- cpu = __generic_processor_info(id, ver, enabled);
+ cpu = generic_processor_info(id, ver);
if (cpu >= 0)
early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid;
@@ -707,13 +715,13 @@ static void __init acpi_set_irq_model_ioapic(void)
#ifdef CONFIG_ACPI_HOTPLUG_CPU
#include <acpi/processor.h>
-int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
int nid;
nid = acpi_get_node(handle);
- if (nid != -1) {
+ if (nid != NUMA_NO_NODE) {
set_apicid_to_node(physid, nid);
numa_set_node(cpu, nid);
}
@@ -721,11 +729,12 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
return 0;
}
-int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
+ int *pcpu)
{
int cpu;
- cpu = acpi_register_lapic(physid, U32_MAX, ACPI_MADT_ENABLED);
+ cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
if (cpu < 0) {
pr_info(PREFIX "Unable to map lapic to logical cpu number\n");
return cpu;
@@ -928,6 +937,13 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
x86_platform.legacy.devices.pnpbios = 0;
}
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+ !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) &&
+ x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) {
+ pr_debug("ACPI: i8042 controller is absent\n");
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT;
+ }
+
if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
pr_debug("ACPI: not registering RTC platform device\n");
x86_platform.legacy.rtc = 0;
@@ -1548,6 +1564,12 @@ int __init early_acpi_boot_init(void)
return 0;
}
+static int __init acpi_parse_bgrt(struct acpi_table_header *table)
+{
+ efi_bgrt_init(table);
+ return 0;
+}
+
int __init acpi_boot_init(void)
{
/* those are executed after early-quirks are executed */
@@ -1572,6 +1594,8 @@ int __init acpi_boot_init(void)
acpi_process_madt();
acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
+ if (IS_ENABLED(CONFIG_ACPI_BGRT))
+ acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
if (!acpi_noirq)
x86_init.pci.init = pci_acpi_init;
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index af15f4444330..8233a630280f 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -12,7 +12,6 @@
#include <linux/sched.h>
#include <acpi/processor.h>
-#include <asm/acpi.h>
#include <asm/mwait.h>
#include <asm/special_insns.h>
@@ -89,7 +88,8 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
retval = 0;
/* If the HW does not support any sub-states in this C-state */
if (num_cstate_subtype == 0) {
- pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part);
+ pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n",
+ cx->address, edx_part);
retval = -1;
goto out;
}
@@ -104,8 +104,8 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
if (!mwait_supported[cstate_type]) {
mwait_supported[cstate_type] = 1;
printk(KERN_DEBUG
- "Monitor-Mwait will be used to enter C-%d "
- "state\n", cx->type);
+ "Monitor-Mwait will be used to enter C-%d state\n",
+ cx->type);
}
snprintf(cx->desc,
ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x",
@@ -166,6 +166,7 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter);
static int __init ffh_cstate_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
+
if (c->x86_vendor != X86_VENDOR_INTEL)
return -1;
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 169963f471bb..50b8ed0317a3 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel)
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
+#ifdef CONFIG_KASAN
+ /*
+ * The suspend path may have poisoned some areas deeper in the stack,
+ * which we now need to unpoison.
+ */
+ movq %rsp, %rdi
+ call kasan_unpoison_task_stack_below
+#endif
+
xorl %eax, %eax
addq $8, %rsp
FRAME_END
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 5cb272a7a5a3..c5b8f760473c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -337,7 +337,11 @@ done:
n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}
-static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
+/*
+ * "noinline" to cause control flow change and thus invalidate I$ and
+ * cause refetch after modification.
+ */
+static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
unsigned long flags;
@@ -346,7 +350,6 @@ static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
local_irq_save(flags);
add_nops(instr + (a->instrlen - a->padlen), a->padlen);
- sync_core();
local_irq_restore(flags);
DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
@@ -359,9 +362,12 @@ static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
* This implies that asymmetric systems where APs have less capabilities than
* the boot processor are not handled. Tough. Make sure you disable such
* features by hand.
+ *
+ * Marked "noinline" to cause control flow change and thus insn cache
+ * to refetch changed I$ lines.
*/
-void __init_or_module apply_alternatives(struct alt_instr *start,
- struct alt_instr *end)
+void __init_or_module noinline apply_alternatives(struct alt_instr *start,
+ struct alt_instr *end)
{
struct alt_instr *a;
u8 *instr, *replacement;
@@ -667,7 +673,6 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
unsigned long flags;
local_irq_save(flags);
memcpy(addr, opcode, len);
- sync_core();
local_irq_restore(flags);
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 63ff468a7986..df083efe6ee0 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -17,6 +17,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
@@ -695,7 +696,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
return -1;
}
-static struct dma_map_ops gart_dma_ops = {
+static const struct dma_map_ops gart_dma_ops = {
.map_sg = gart_map_sg,
.unmap_sg = gart_unmap_sg,
.map_page = gart_map_page,
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4fdf6230d93c..458da8509b75 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -13,8 +13,20 @@
#include <linux/spinlock.h>
#include <asm/amd_nb.h>
+#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
+#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
+#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
+
+/* Protect the PCI config register pairs used for SMN and DF indirect access. */
+static DEFINE_MUTEX(smn_mutex);
+
static u32 *flush_words;
+static const struct pci_device_id amd_root_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
+ {}
+};
+
const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
@@ -24,9 +36,10 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{}
};
-EXPORT_SYMBOL(amd_nb_misc_ids);
+EXPORT_SYMBOL_GPL(amd_nb_misc_ids);
static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
@@ -34,6 +47,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
{}
};
@@ -44,8 +58,25 @@ const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
{ }
};
-struct amd_northbridge_info amd_northbridges;
-EXPORT_SYMBOL(amd_northbridges);
+static struct amd_northbridge_info amd_northbridges;
+
+u16 amd_nb_num(void)
+{
+ return amd_northbridges.num;
+}
+EXPORT_SYMBOL_GPL(amd_nb_num);
+
+bool amd_nb_has_feature(unsigned int feature)
+{
+ return ((amd_northbridges.flags & feature) == feature);
+}
+EXPORT_SYMBOL_GPL(amd_nb_has_feature);
+
+struct amd_northbridge *node_to_amd_nb(int node)
+{
+ return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
+}
+EXPORT_SYMBOL_GPL(node_to_amd_nb);
static struct pci_dev *next_northbridge(struct pci_dev *dev,
const struct pci_device_id *ids)
@@ -58,13 +89,106 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev,
return dev;
}
+static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write)
+{
+ struct pci_dev *root;
+ int err = -ENODEV;
+
+ if (node >= amd_northbridges.num)
+ goto out;
+
+ root = node_to_amd_nb(node)->root;
+ if (!root)
+ goto out;
+
+ mutex_lock(&smn_mutex);
+
+ err = pci_write_config_dword(root, 0x60, address);
+ if (err) {
+ pr_warn("Error programming SMN address 0x%x.\n", address);
+ goto out_unlock;
+ }
+
+ err = (write ? pci_write_config_dword(root, 0x64, *value)
+ : pci_read_config_dword(root, 0x64, value));
+ if (err)
+ pr_warn("Error %s SMN address 0x%x.\n",
+ (write ? "writing to" : "reading from"), address);
+
+out_unlock:
+ mutex_unlock(&smn_mutex);
+
+out:
+ return err;
+}
+
+int amd_smn_read(u16 node, u32 address, u32 *value)
+{
+ return __amd_smn_rw(node, address, value, false);
+}
+EXPORT_SYMBOL_GPL(amd_smn_read);
+
+int amd_smn_write(u16 node, u32 address, u32 value)
+{
+ return __amd_smn_rw(node, address, &value, true);
+}
+EXPORT_SYMBOL_GPL(amd_smn_write);
+
+/*
+ * Data Fabric Indirect Access uses FICAA/FICAD.
+ *
+ * Fabric Indirect Configuration Access Address (FICAA): Constructed based
+ * on the device's Instance Id and the PCI function and register offset of
+ * the desired register.
+ *
+ * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
+ * and FICAD HI registers but so far we only need the LO register.
+ */
+int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
+{
+ struct pci_dev *F4;
+ u32 ficaa;
+ int err = -ENODEV;
+
+ if (node >= amd_northbridges.num)
+ goto out;
+
+ F4 = node_to_amd_nb(node)->link;
+ if (!F4)
+ goto out;
+
+ ficaa = 1;
+ ficaa |= reg & 0x3FC;
+ ficaa |= (func & 0x7) << 11;
+ ficaa |= instance_id << 16;
+
+ mutex_lock(&smn_mutex);
+
+ err = pci_write_config_dword(F4, 0x5C, ficaa);
+ if (err) {
+ pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
+ goto out_unlock;
+ }
+
+ err = pci_read_config_dword(F4, 0x98, lo);
+ if (err)
+ pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
+
+out_unlock:
+ mutex_unlock(&smn_mutex);
+
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(amd_df_indirect_read);
+
int amd_cache_northbridges(void)
{
u16 i = 0;
struct amd_northbridge *nb;
- struct pci_dev *misc, *link;
+ struct pci_dev *root, *misc, *link;
- if (amd_nb_num())
+ if (amd_northbridges.num)
return 0;
misc = NULL;
@@ -74,15 +198,17 @@ int amd_cache_northbridges(void)
if (!i)
return -ENODEV;
- nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
+ nb = kcalloc(i, sizeof(struct amd_northbridge), GFP_KERNEL);
if (!nb)
return -ENOMEM;
amd_northbridges.nb = nb;
amd_northbridges.num = i;
- link = misc = NULL;
- for (i = 0; i != amd_nb_num(); i++) {
+ link = misc = root = NULL;
+ for (i = 0; i != amd_northbridges.num; i++) {
+ node_to_amd_nb(i)->root = root =
+ next_northbridge(root, amd_root_ids);
node_to_amd_nb(i)->misc = misc =
next_northbridge(misc, amd_nb_misc_ids);
node_to_amd_nb(i)->link = link =
@@ -139,13 +265,13 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
{
u32 address;
u64 base, msr;
- unsigned segn_busn_bits;
+ unsigned int segn_busn_bits;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return NULL;
/* assume all cpus from fam10h have mmconfig */
- if (boot_cpu_data.x86 < 0x10)
+ if (boot_cpu_data.x86 < 0x10)
return NULL;
address = MSR_FAM10H_MMIO_CONF_BASE;
@@ -226,14 +352,14 @@ static void amd_cache_gart(void)
if (!amd_nb_has_feature(AMD_NB_GART))
return;
- flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
+ flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL);
if (!flush_words) {
amd_northbridges.flags &= ~AMD_NB_GART;
pr_notice("Cannot initialize GART flush words, GART support disabled\n");
return;
}
- for (i = 0; i != amd_nb_num(); i++)
+ for (i = 0; i != amd_northbridges.num; i++)
pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]);
}
@@ -246,18 +372,20 @@ void amd_flush_garts(void)
if (!amd_nb_has_feature(AMD_NB_GART))
return;
- /* Avoid races between AGP and IOMMU. In theory it's not needed
- but I'm not sure if the hardware won't lose flush requests
- when another is pending. This whole thing is so expensive anyways
- that it doesn't matter to serialize more. -AK */
+ /*
+ * Avoid races between AGP and IOMMU. In theory it's not needed
+ * but I'm not sure if the hardware won't lose flush requests
+ * when another is pending. This whole thing is so expensive anyways
+ * that it doesn't matter to serialize more. -AK
+ */
spin_lock_irqsave(&gart_lock, flags);
flushed = 0;
- for (i = 0; i < amd_nb_num(); i++) {
+ for (i = 0; i < amd_northbridges.num; i++) {
pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
flush_words[i] | 1);
flushed++;
}
- for (i = 0; i < amd_nb_num(); i++) {
+ for (i = 0; i < amd_northbridges.num; i++) {
u32 w;
/* Make sure the hardware actually executed the flush*/
for (;;) {
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 456316f6c868..65721dc73bd8 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -234,7 +234,7 @@ static __init int apbt_late_init(void)
if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||
!apb_timer_block_enabled)
return 0;
- return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "X86_APB_DEAD", NULL,
+ return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "x86/apb:dead", NULL,
apbt_cpu_dead);
}
fs_initcall(apbt_late_init);
@@ -247,7 +247,7 @@ void apbt_setup_secondary_clock(void) {}
static int apbt_clocksource_register(void)
{
u64 start, now;
- cycle_t t1;
+ u64 t1;
/* Start the counter, use timer 2 as source, timer 0/1 for event */
dw_apb_clocksource_start(clocksource_apbt);
@@ -355,7 +355,7 @@ unsigned long apbt_quick_calibrate(void)
{
int i, scale;
u64 old, new;
- cycle_t t1, t2;
+ u64 t1, t2;
unsigned long khz = 0;
u32 loop, shift;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 88c657b057e2..8ccb7ef512e0 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -48,7 +48,6 @@
#include <asm/io_apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
-#include <asm/idle.h>
#include <asm/mtrr.h>
#include <asm/time.h>
#include <asm/smp.h>
@@ -530,18 +529,19 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
* The local apic timer can be used for any function which is CPU local.
*/
static struct clock_event_device lapic_clockevent = {
- .name = "lapic",
- .features = CLOCK_EVT_FEAT_PERIODIC |
- CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
- | CLOCK_EVT_FEAT_DUMMY,
- .shift = 32,
- .set_state_shutdown = lapic_timer_shutdown,
- .set_state_periodic = lapic_timer_set_periodic,
- .set_state_oneshot = lapic_timer_set_oneshot,
- .set_next_event = lapic_next_event,
- .broadcast = lapic_timer_broadcast,
- .rating = 100,
- .irq = -1,
+ .name = "lapic",
+ .features = CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
+ | CLOCK_EVT_FEAT_DUMMY,
+ .shift = 32,
+ .set_state_shutdown = lapic_timer_shutdown,
+ .set_state_periodic = lapic_timer_set_periodic,
+ .set_state_oneshot = lapic_timer_set_oneshot,
+ .set_state_oneshot_stopped = lapic_timer_shutdown,
+ .set_next_event = lapic_next_event,
+ .broadcast = lapic_timer_broadcast,
+ .rating = 100,
+ .irq = -1,
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
@@ -894,11 +894,13 @@ void __init setup_boot_APIC_clock(void)
/* Setup the lapic or request the broadcast */
setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
}
void setup_secondary_APIC_clock(void)
{
setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
}
/*
@@ -1244,7 +1246,7 @@ static void lapic_setup_esr(void)
/**
* setup_local_APIC - setup the local APIC
*
- * Used to setup local APIC while initializing BSP or bringin up APs.
+ * Used to setup local APIC while initializing BSP or bringing up APs.
* Always called with preemption disabled.
*/
void setup_local_APIC(void)
@@ -1608,24 +1610,15 @@ static inline void try_to_enable_x2apic(int remap_mode) { }
static inline void __x2apic_enable(void) { }
#endif /* !CONFIG_X86_X2APIC */
-static int __init try_to_enable_IR(void)
-{
-#ifdef CONFIG_X86_IO_APIC
- if (!x2apic_enabled() && skip_ioapic_setup) {
- pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
- return -1;
- }
-#endif
- return irq_remapping_enable();
-}
-
void __init enable_IR_x2apic(void)
{
unsigned long flags;
int ret, ir_stat;
- if (skip_ioapic_setup)
+ if (skip_ioapic_setup) {
+ pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
return;
+ }
ir_stat = irq_remapping_prepare();
if (ir_stat < 0 && !x2apic_supported())
@@ -1643,7 +1636,7 @@ void __init enable_IR_x2apic(void)
/* If irq_remapping_prepare() succeeded, try to enable it */
if (ir_stat >= 0)
- ir_stat = try_to_enable_IR();
+ ir_stat = irq_remapping_enable();
/* ir_stat contains the remap mode or an error code */
try_to_enable_x2apic(ir_stat);
@@ -1863,14 +1856,14 @@ static void __smp_spurious_interrupt(u8 vector)
"should never happen.\n", vector, smp_processor_id());
}
-__visible void smp_spurious_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
{
entering_irq();
__smp_spurious_interrupt(~regs->orig_ax);
exiting_irq();
}
-__visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_spurious_interrupt(struct pt_regs *regs)
{
u8 vector = ~regs->orig_ax;
@@ -1921,14 +1914,14 @@ static void __smp_error_interrupt(struct pt_regs *regs)
}
-__visible void smp_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
{
entering_irq();
__smp_error_interrupt(regs);
exiting_irq();
}
-__visible void smp_trace_error_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs)
{
entering_irq();
trace_error_apic_entry(ERROR_APIC_VECTOR);
@@ -2027,8 +2020,8 @@ void disconnect_bsp_APIC(int virt_wire_setup)
/*
* The number of allocated logical CPU IDs. Since logical CPU IDs are allocated
* contiguously, it equals to current allocated max logical CPU ID plus 1.
- * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of
- * nr_logical_cpuids is nr_cpu_ids.
+ * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range,
+ * so the maximum of nr_logical_cpuids is nr_cpu_ids.
*
* NOTE: Reserve 0 for BSP.
*/
@@ -2060,17 +2053,17 @@ static int allocate_logical_cpuid(int apicid)
/* Allocate a new cpuid. */
if (nr_logical_cpuids >= nr_cpu_ids) {
- WARN_ONCE(1, "Only %d processors supported."
+ WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %i reached. "
"Processor %d/0x%x and the rest are ignored.\n",
- nr_cpu_ids - 1, nr_logical_cpuids, apicid);
- return -1;
+ nr_cpu_ids, nr_logical_cpuids, apicid);
+ return -EINVAL;
}
cpuid_to_apicid[nr_logical_cpuids] = apicid;
return nr_logical_cpuids++;
}
-int __generic_processor_info(int apicid, int version, bool enabled)
+int generic_processor_info(int apicid, int version)
{
int cpu, max = nr_cpu_ids;
bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
@@ -2093,7 +2086,7 @@ int __generic_processor_info(int apicid, int version, bool enabled)
* Since fixing handling of boot_cpu_physical_apicid requires
* another discussion and tests on each platform, we leave it
* for now and here we use read_apic_id() directly in this
- * function, generic_processor_info().
+ * function, __generic_processor_info().
*/
if (disabled_cpu_apicid != BAD_APICID &&
disabled_cpu_apicid != read_apic_id() &&
@@ -2128,11 +2121,9 @@ int __generic_processor_info(int apicid, int version, bool enabled)
if (num_processors >= nr_cpu_ids) {
int thiscpu = max + disabled_cpus;
- if (enabled) {
- pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
- "reached. Processor %d/0x%x ignored.\n",
- max, thiscpu, apicid);
- }
+ pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
+ "reached. Processor %d/0x%x ignored.\n",
+ max, thiscpu, apicid);
disabled_cpus++;
return -EINVAL;
@@ -2159,21 +2150,6 @@ int __generic_processor_info(int apicid, int version, bool enabled)
}
/*
- * This can happen on physical hotplug. The sanity check at boot time
- * is done from native_smp_prepare_cpus() after num_possible_cpus() is
- * established.
- */
- if (topology_update_package_map(apicid, cpu) < 0) {
- int thiscpu = max + disabled_cpus;
-
- pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n",
- thiscpu, apicid);
-
- disabled_cpus++;
- return -ENOSPC;
- }
-
- /*
* Validate version
*/
if (version == 0x0) {
@@ -2199,23 +2175,13 @@ int __generic_processor_info(int apicid, int version, bool enabled)
apic->x86_32_early_logical_apicid(cpu);
#endif
set_cpu_possible(cpu, true);
-
- if (enabled) {
- num_processors++;
- physid_set(apicid, phys_cpu_present_map);
- set_cpu_present(cpu, true);
- } else {
- disabled_cpus++;
- }
+ physid_set(apicid, phys_cpu_present_map);
+ set_cpu_present(cpu, true);
+ num_processors++;
return cpu;
}
-int generic_processor_info(int apicid, int version)
-{
- return __generic_processor_info(apicid, version, true);
-}
-
int hard_smp_processor_id(void)
{
return read_apic_id();
@@ -2263,6 +2229,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
/* Should happen once for each apic */
WARN_ON((*drv)->eoi_write == eoi_write);
+ (*drv)->native_eoi_write = (*drv)->eoi_write;
(*drv)->eoi_write = eoi_write;
}
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 48e6d84f173e..347bb9f65737 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -48,7 +48,6 @@
#include <linux/bootmem.h>
#include <asm/irqdomain.h>
-#include <asm/idle.h>
#include <asm/io.h>
#include <asm/smp.h>
#include <asm/cpu.h>
@@ -1108,12 +1107,12 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info)
ioapic = mp_find_ioapic(gsi);
if (ioapic < 0)
- return -1;
+ return -ENODEV;
pin = mp_find_ioapic_pin(ioapic, gsi);
idx = find_irq_entry(ioapic, pin, mp_INT);
if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
- return -1;
+ return -ENODEV;
return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info);
}
@@ -1876,6 +1875,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
.irq_ack = irq_chip_ack_parent,
.irq_eoi = ioapic_ack_level,
.irq_set_affinity = ioapic_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -1887,6 +1887,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = {
.irq_ack = irq_chip_ack_parent,
.irq_eoi = ioapic_ir_ack_level,
.irq_set_affinity = ioapic_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -2116,6 +2117,7 @@ static inline void __init check_timer(void)
if (idx != -1 && irq_trigger(idx))
unmask_ioapic_irq(irq_get_chip_data(0));
}
+ irq_domain_deactivate_irq(irq_data);
irq_domain_activate_irq(irq_data);
if (timer_irq_works()) {
if (disable_timer_pin_1 > 0)
@@ -2137,6 +2139,7 @@ static inline void __init check_timer(void)
* legacy devices should be connected to IO APIC #0
*/
replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
+ irq_domain_deactivate_irq(irq_data);
irq_domain_activate_irq(irq_data);
legacy_pic->unmask(0);
if (timer_irq_works()) {
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 015bbf30e3e3..c61aec7e65f4 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -82,7 +82,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (domain == NULL)
return -ENOSYS;
- return pci_msi_domain_alloc_irqs(domain, dev, nvec, type);
+ return msi_domain_alloc_irqs(domain, &dev->dev, nvec);
}
void native_teardown_msi_irq(unsigned int irq)
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 5d30c5e42bb1..f3557a1eb562 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -559,7 +559,7 @@ void send_cleanup_vector(struct irq_cfg *cfg)
__send_cleanup_vector(data);
}
-asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
+asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 200af5ae9662..5a35f208ed95 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -191,7 +191,7 @@ static int x2apic_cluster_probe(void)
if (!x2apic_mode)
return 0;
- ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE",
+ ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "x86/x2apic:prepare",
x2apic_prepare_cpu, x2apic_dead_cpu);
if (ret < 0) {
pr_err("Failed to register X2APIC_PREPARE\n");
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 35690a168cf7..86f20cc0a65e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -41,40 +41,44 @@
DEFINE_PER_CPU(int, x2apic_extra_bits);
-#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
-
-static enum uv_system_type uv_system_type;
-static u64 gru_start_paddr, gru_end_paddr;
-static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr;
-static u64 gru_dist_lmask, gru_dist_umask;
-static union uvh_apicid uvh_apicid;
-
-/* info derived from CPUID */
+static enum uv_system_type uv_system_type;
+static bool uv_hubless_system;
+static u64 gru_start_paddr, gru_end_paddr;
+static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr;
+static u64 gru_dist_lmask, gru_dist_umask;
+static union uvh_apicid uvh_apicid;
+
+/* Information derived from CPUID: */
static struct {
unsigned int apicid_shift;
unsigned int apicid_mask;
unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */
unsigned int pnode_mask;
unsigned int gpa_shift;
+ unsigned int gnode_shift;
} uv_cpuid;
int uv_min_hub_revision_id;
EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
+
unsigned int uv_apicid_hibits;
EXPORT_SYMBOL_GPL(uv_apicid_hibits);
static struct apic apic_x2apic_uv_x;
static struct uv_hub_info_s uv_hub_info_node0;
-/* Set this to use hardware error handler instead of kernel panic */
+/* Set this to use hardware error handler instead of kernel panic: */
static int disable_uv_undefined_panic = 1;
+
unsigned long uv_undefined(char *str)
{
if (likely(!disable_uv_undefined_panic))
panic("UV: error: undefined MMR: %s\n", str);
else
pr_crit("UV: error: undefined MMR: %s\n", str);
- return ~0ul; /* cause a machine fault */
+
+ /* Cause a machine fault: */
+ return ~0ul;
}
EXPORT_SYMBOL(uv_undefined);
@@ -85,18 +89,19 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr)
mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
val = *mmr;
early_iounmap(mmr, sizeof(*mmr));
+
return val;
}
static inline bool is_GRU_range(u64 start, u64 end)
{
if (gru_dist_base) {
- u64 su = start & gru_dist_umask; /* upper (incl pnode) bits */
- u64 sl = start & gru_dist_lmask; /* base offset bits */
+ u64 su = start & gru_dist_umask; /* Upper (incl pnode) bits */
+ u64 sl = start & gru_dist_lmask; /* Base offset bits */
u64 eu = end & gru_dist_umask;
u64 el = end & gru_dist_lmask;
- /* Must reside completely within a single GRU range */
+ /* Must reside completely within a single GRU range: */
return (sl == gru_dist_base && el == gru_dist_base &&
su >= gru_first_node_paddr &&
su <= gru_last_node_paddr &&
@@ -133,13 +138,14 @@ static int __init early_get_pnodeid(void)
break;
case UV4_HUB_PART_NUMBER:
uv_min_hub_revision_id += UV4_HUB_REVISION_BASE - 1;
+ uv_cpuid.gnode_shift = 2; /* min partition is 4 sockets */
break;
}
uv_hub_info->hub_revision = uv_min_hub_revision_id;
uv_cpuid.pnode_mask = (1 << m_n_config.s.n_skt) - 1;
pnode = (node_id.s.node_id >> 1) & uv_cpuid.pnode_mask;
- uv_cpuid.gpa_shift = 46; /* default unless changed */
+ uv_cpuid.gpa_shift = 46; /* Default unless changed */
pr_info("UV: rev:%d part#:%x nodeid:%04x n_skt:%d pnmsk:%x pn:%x\n",
node_id.s.revision, node_id.s.part_number, node_id.s.node_id,
@@ -147,11 +153,12 @@ static int __init early_get_pnodeid(void)
return pnode;
}
-/* [copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */
-#define SMT_LEVEL 0 /* leaf 0xb SMT level */
-#define INVALID_TYPE 0 /* leaf 0xb sub-leaf types */
-#define SMT_TYPE 1
-#define CORE_TYPE 2
+/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */
+
+#define SMT_LEVEL 0 /* Leaf 0xb SMT level */
+#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */
+#define SMT_TYPE 1
+#define CORE_TYPE 2
#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
@@ -165,11 +172,13 @@ static void set_x2apic_bits(void)
pr_info("UV: CPU does not have CPUID.11\n");
return;
}
+
cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) {
pr_info("UV: CPUID.11 not implemented\n");
return;
}
+
sid_shift = BITS_SHIFT_NEXT_LEVEL(eax);
sub_index = 1;
do {
@@ -180,8 +189,9 @@ static void set_x2apic_bits(void)
}
sub_index++;
} while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
- uv_cpuid.apicid_shift = 0;
- uv_cpuid.apicid_mask = (~(-1 << sid_shift));
+
+ uv_cpuid.apicid_shift = 0;
+ uv_cpuid.apicid_mask = (~(-1 << sid_shift));
uv_cpuid.socketid_shift = sid_shift;
}
@@ -192,10 +202,8 @@ static void __init early_get_apic_socketid_shift(void)
set_x2apic_bits();
- pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n",
- uv_cpuid.apicid_shift, uv_cpuid.apicid_mask);
- pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n",
- uv_cpuid.socketid_shift, uv_cpuid.pnode_mask);
+ pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask);
+ pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask);
}
/*
@@ -208,10 +216,8 @@ static void __init uv_set_apicid_hibit(void)
union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
if (is_uv1_hub()) {
- apicid_mask.v =
- uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
- uv_apicid_hibits =
- apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
+ apicid_mask.v = uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
+ uv_apicid_hibits = apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
}
}
@@ -220,20 +226,26 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
int pnodeid;
int uv_apic;
- if (strncmp(oem_id, "SGI", 3) != 0)
+ if (strncmp(oem_id, "SGI", 3) != 0) {
+ if (strncmp(oem_id, "NSGI", 4) == 0) {
+ uv_hubless_system = true;
+ pr_info("UV: OEM IDs %s/%s, HUBLESS\n",
+ oem_id, oem_table_id);
+ }
return 0;
+ }
if (numa_off) {
pr_err("UV: NUMA is off, disabling UV support\n");
return 0;
}
- /* Setup early hub type field in uv_hub_info for Node 0 */
+ /* Set up early hub type field in uv_hub_info for Node 0 */
uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
/*
* Determine UV arch type.
- * SGI: UV100/1000
+ * SGI: UV100/1000
* SGI2: UV2000/3000
* SGI3: UV300 (truncated to 4 chars because of different varieties)
* SGI4: UV400 (truncated to 4 chars because of different varieties)
@@ -249,31 +261,32 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
pnodeid = early_get_pnodeid();
early_get_apic_socketid_shift();
- x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
+
+ x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
x86_platform.nmi_init = uv_nmi_init;
- if (!strcmp(oem_table_id, "UVX")) { /* most common */
+ if (!strcmp(oem_table_id, "UVX")) {
+ /* This is the most common hardware variant: */
uv_system_type = UV_X2APIC;
uv_apic = 0;
- } else if (!strcmp(oem_table_id, "UVH")) { /* only UV1 systems */
+ } else if (!strcmp(oem_table_id, "UVH")) {
+ /* Only UV1 systems: */
uv_system_type = UV_NON_UNIQUE_APIC;
- __this_cpu_write(x2apic_extra_bits,
- pnodeid << uvh_apicid.s.pnode_shift);
+ __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift);
uv_set_apicid_hibit();
uv_apic = 1;
- } else if (!strcmp(oem_table_id, "UVL")) { /* only used for */
- uv_system_type = UV_LEGACY_APIC; /* very small systems */
+ } else if (!strcmp(oem_table_id, "UVL")) {
+ /* Only used for very small systems: */
+ uv_system_type = UV_LEGACY_APIC;
uv_apic = 0;
} else {
goto badbios;
}
- pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n",
- oem_id, oem_table_id, uv_system_type,
- uv_min_hub_revision_id, uv_apic);
+ pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", oem_id, oem_table_id, uv_system_type, uv_min_hub_revision_id, uv_apic);
return uv_apic;
@@ -294,6 +307,12 @@ int is_uv_system(void)
}
EXPORT_SYMBOL_GPL(is_uv_system);
+int is_uv_hubless(void)
+{
+ return uv_hubless_system;
+}
+EXPORT_SYMBOL_GPL(is_uv_hubless);
+
void **__uv_hub_info_list;
EXPORT_SYMBOL_GPL(__uv_hub_info_list);
@@ -306,16 +325,18 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
unsigned long sn_rtc_cycles_per_second;
EXPORT_SYMBOL(sn_rtc_cycles_per_second);
-/* the following values are used for the per node hub info struct */
-static __initdata unsigned short *_node_to_pnode;
-static __initdata unsigned short _min_socket, _max_socket;
-static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len;
-static __initdata struct uv_gam_range_entry *uv_gre_table;
-static __initdata struct uv_gam_parameters *uv_gp_table;
-static __initdata unsigned short *_socket_to_node;
-static __initdata unsigned short *_socket_to_pnode;
-static __initdata unsigned short *_pnode_to_socket;
-static __initdata struct uv_gam_range_s *_gr_table;
+/* The following values are used for the per node hub info struct */
+static __initdata unsigned short *_node_to_pnode;
+static __initdata unsigned short _min_socket, _max_socket;
+static __initdata unsigned short _min_pnode, _max_pnode, _gr_table_len;
+static __initdata struct uv_gam_range_entry *uv_gre_table;
+static __initdata struct uv_gam_parameters *uv_gp_table;
+static __initdata unsigned short *_socket_to_node;
+static __initdata unsigned short *_socket_to_pnode;
+static __initdata unsigned short *_pnode_to_socket;
+
+static __initdata struct uv_gam_range_s *_gr_table;
+
#define SOCK_EMPTY ((unsigned short)~0)
extern int uv_hub_info_version(void)
@@ -324,7 +345,7 @@ extern int uv_hub_info_version(void)
}
EXPORT_SYMBOL(uv_hub_info_version);
-/* Build GAM range lookup table */
+/* Build GAM range lookup table: */
static __init void build_uv_gr_table(void)
{
struct uv_gam_range_entry *gre = uv_gre_table;
@@ -342,25 +363,24 @@ static __init void build_uv_gr_table(void)
for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
if (gre->type == UV_GAM_RANGE_TYPE_HOLE) {
- if (!ram_limit) { /* mark hole between ram/non-ram */
+ if (!ram_limit) {
+ /* Mark hole between RAM/non-RAM: */
ram_limit = last_limit;
last_limit = gre->limit;
lsid++;
continue;
}
last_limit = gre->limit;
- pr_info("UV: extra hole in GAM RE table @%d\n",
- (int)(gre - uv_gre_table));
+ pr_info("UV: extra hole in GAM RE table @%d\n", (int)(gre - uv_gre_table));
continue;
}
if (_max_socket < gre->sockid) {
- pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n",
- gre->sockid, _max_socket,
- (int)(gre - uv_gre_table));
+ pr_err("UV: GAM table sockid(%d) too large(>%d) @%d\n", gre->sockid, _max_socket, (int)(gre - uv_gre_table));
continue;
}
sid = gre->sockid - _min_socket;
- if (lsid < sid) { /* new range */
+ if (lsid < sid) {
+ /* New range: */
grt = &_gr_table[indx];
grt->base = lindx;
grt->nasid = gre->nasid;
@@ -369,27 +389,32 @@ static __init void build_uv_gr_table(void)
lindx = indx++;
continue;
}
- if (lsid == sid && !ram_limit) { /* update range */
- if (grt->limit == last_limit) { /* .. if contiguous */
+ /* Update range: */
+ if (lsid == sid && !ram_limit) {
+ /* .. if contiguous: */
+ if (grt->limit == last_limit) {
grt->limit = last_limit = gre->limit;
continue;
}
}
- if (!ram_limit) { /* non-contiguous ram range */
+ /* Non-contiguous RAM range: */
+ if (!ram_limit) {
grt++;
grt->base = lindx;
grt->nasid = gre->nasid;
grt->limit = last_limit = gre->limit;
continue;
}
- grt++; /* non-contiguous/non-ram */
- grt->base = grt - _gr_table; /* base is this entry */
+ /* Non-contiguous/non-RAM: */
+ grt++;
+ /* base is this entry */
+ grt->base = grt - _gr_table;
grt->nasid = gre->nasid;
grt->limit = last_limit = gre->limit;
lsid++;
}
- /* shorten table if possible */
+ /* Shorten table if possible */
grt++;
i = grt - _gr_table;
if (i < _gr_table_len) {
@@ -403,16 +428,15 @@ static __init void build_uv_gr_table(void)
}
}
- /* display resultant gam range table */
+ /* Display resultant GAM range table: */
for (i = 0, grt = _gr_table; i < _gr_table_len; i++, grt++) {
+ unsigned long start, end;
int gb = grt->base;
- unsigned long start = gb < 0 ? 0 :
- (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT;
- unsigned long end =
- (unsigned long)grt->limit << UV_GAM_RANGE_SHFT;
- pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n",
- i, grt->nasid, start, end, gb);
+ start = gb < 0 ? 0 : (unsigned long)_gr_table[gb].limit << UV_GAM_RANGE_SHFT;
+ end = (unsigned long)grt->limit << UV_GAM_RANGE_SHFT;
+
+ pr_info("UV: GAM Range %2d %04x 0x%013lx-0x%013lx (%d)\n", i, grt->nasid, start, end, gb);
}
}
@@ -423,16 +447,19 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
pnode = uv_apicid_to_pnode(phys_apicid);
phys_apicid |= uv_apicid_hibits;
+
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
APIC_DM_INIT;
+
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
APIC_DM_STARTUP;
+
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
return 0;
@@ -566,7 +593,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.apic_id_registered = uv_apic_id_registered,
.irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .irq_dest_mode = 0, /* Physical */
.target_cpus = online_target_cpus,
.disable_esr = 0,
@@ -627,23 +654,22 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
switch (i) {
case 0:
m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR;
- m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR;
+ m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR;
break;
case 1:
m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR;
- m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR;
+ m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR;
break;
case 2:
m_redirect = UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR;
- m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR;
+ m_overlay = UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR;
break;
}
alias.v = uv_read_local_mmr(m_overlay);
if (alias.s.enable && alias.s.base == 0) {
*size = (1UL << alias.s.m_alias);
redirect.v = uv_read_local_mmr(m_redirect);
- *base = (unsigned long)redirect.s.dest_base
- << DEST_SHIFT;
+ *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
return;
}
}
@@ -652,8 +678,7 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
enum map_type {map_wb, map_uc};
-static __init void map_high(char *id, unsigned long base, int pshift,
- int bshift, int max_pnode, enum map_type map_type)
+static __init void map_high(char *id, unsigned long base, int pshift, int bshift, int max_pnode, enum map_type map_type)
{
unsigned long bytes, paddr;
@@ -678,16 +703,19 @@ static __init void map_gru_distributed(unsigned long c)
int nid;
gru.v = c;
- /* only base bits 42:28 relevant in dist mode */
+
+ /* Only base bits 42:28 relevant in dist mode */
gru_dist_base = gru.v & 0x000007fff0000000UL;
if (!gru_dist_base) {
pr_info("UV: Map GRU_DIST base address NULL\n");
return;
}
+
bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1);
gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1);
gru_dist_base &= gru_dist_lmask; /* Clear bits above M */
+
for_each_online_node(nid) {
paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) |
gru_dist_base;
@@ -695,11 +723,12 @@ static __init void map_gru_distributed(unsigned long c)
gru_first_node_paddr = min(paddr, gru_first_node_paddr);
gru_last_node_paddr = max(paddr, gru_last_node_paddr);
}
+
/* Save upper (63:M) bits of address only for is_GRU_range */
gru_first_node_paddr &= gru_dist_umask;
gru_last_node_paddr &= gru_dist_umask;
- pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n",
- gru_dist_base, gru_first_node_paddr, gru_last_node_paddr);
+
+ pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr);
}
static __init void map_gru_high(int max_pnode)
@@ -719,6 +748,7 @@ static __init void map_gru_high(int max_pnode)
map_gru_distributed(gru.v);
return;
}
+
base = (gru.v & mask) >> shift;
map_high("GRU", base, shift, shift, max_pnode, map_wb);
gru_start_paddr = ((u64)base << shift);
@@ -772,8 +802,8 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
id = mmiohs[index].id;
overlay.v = uv_read_local_mmr(mmiohs[index].overlay);
- pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n",
- id, overlay.v, overlay.s3.base, overlay.s3.m_io);
+
+ pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", id, overlay.v, overlay.s3.base, overlay.s3.m_io);
if (!overlay.s3.enable) {
pr_info("UV: %s disabled\n", id);
return;
@@ -784,7 +814,8 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
m_io = overlay.s3.m_io;
mmr = mmiohs[index].redirect;
n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;
- min_pnode *= 2; /* convert to NASID */
+ /* Convert to NASID: */
+ min_pnode *= 2;
max_pnode *= 2;
max_io = lnasid = fi = li = -1;
@@ -793,16 +824,18 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
redirect.v = uv_read_local_mmr(mmr + i * 8);
nasid = redirect.s3.nasid;
+ /* Invalid NASID: */
if (nasid < min_pnode || max_pnode < nasid)
- nasid = -1; /* invalid NASID */
+ nasid = -1;
if (nasid == lnasid) {
li = i;
- if (i != n-1) /* last entry check */
+ /* Last entry check: */
+ if (i != n-1)
continue;
}
- /* check if we have a cached (or last) redirect to print */
+ /* Check if we have a cached (or last) redirect to print: */
if (lnasid != -1 || (i == n-1 && nasid != -1)) {
unsigned long addr1, addr2;
int f, l;
@@ -814,12 +847,9 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
f = fi;
l = li;
}
- addr1 = (base << shift) +
- f * (1ULL << m_io);
- addr2 = (base << shift) +
- (l + 1) * (1ULL << m_io);
- pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n",
- id, fi, li, lnasid, addr1, addr2);
+ addr1 = (base << shift) + f * (1ULL << m_io);
+ addr2 = (base << shift) + (l + 1) * (1ULL << m_io);
+ pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2);
if (max_io < l)
max_io = l;
}
@@ -827,8 +857,7 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
lnasid = nasid;
}
- pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n",
- id, base, shift, m_io, max_io);
+ pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", id, base, shift, m_io, max_io);
if (max_io >= 0)
map_high(id, base, shift, m_io, max_io, map_uc);
@@ -841,36 +870,35 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode)
int shift, enable, m_io, n_io;
if (is_uv3_hub() || is_uv4_hub()) {
- /* Map both MMIOH Regions */
+ /* Map both MMIOH regions: */
map_mmioh_high_uv3(0, min_pnode, max_pnode);
map_mmioh_high_uv3(1, min_pnode, max_pnode);
return;
}
if (is_uv1_hub()) {
- mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
- shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
- mmioh.v = uv_read_local_mmr(mmr);
- enable = !!mmioh.s1.enable;
- base = mmioh.s1.base;
- m_io = mmioh.s1.m_io;
- n_io = mmioh.s1.n_io;
+ mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
+ shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ mmioh.v = uv_read_local_mmr(mmr);
+ enable = !!mmioh.s1.enable;
+ base = mmioh.s1.base;
+ m_io = mmioh.s1.m_io;
+ n_io = mmioh.s1.n_io;
} else if (is_uv2_hub()) {
- mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
- shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
- mmioh.v = uv_read_local_mmr(mmr);
- enable = !!mmioh.s2.enable;
- base = mmioh.s2.base;
- m_io = mmioh.s2.m_io;
- n_io = mmioh.s2.n_io;
- } else
+ mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
+ shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+ mmioh.v = uv_read_local_mmr(mmr);
+ enable = !!mmioh.s2.enable;
+ base = mmioh.s2.base;
+ m_io = mmioh.s2.m_io;
+ n_io = mmioh.s2.n_io;
+ } else {
return;
+ }
if (enable) {
max_pnode &= (1 << n_io) - 1;
- pr_info(
- "UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n",
- base, shift, m_io, n_io, max_pnode);
+ pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", base, shift, m_io, n_io, max_pnode);
map_high("MMIOH", base, shift, m_io, max_pnode, map_uc);
} else {
pr_info("UV: MMIOH disabled\n");
@@ -888,16 +916,16 @@ static __init void uv_rtc_init(void)
long status;
u64 ticks_per_sec;
- status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
- &ticks_per_sec);
+ status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec);
+
if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
- printk(KERN_WARNING
- "unable to determine platform RTC clock frequency, "
- "guessing.\n");
- /* BIOS gives wrong value for clock freq. so guess */
+ pr_warn("UV: unable to determine platform RTC clock frequency, guessing.\n");
+
+ /* BIOS gives wrong value for clock frequency, so guess: */
sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
- } else
+ } else {
sn_rtc_cycles_per_second = ticks_per_sec;
+ }
}
/*
@@ -908,19 +936,19 @@ static void uv_heartbeat(unsigned long ignored)
struct timer_list *timer = &uv_scir_info->timer;
unsigned char bits = uv_scir_info->state;
- /* flip heartbeat bit */
+ /* Flip heartbeat bit: */
bits ^= SCIR_CPU_HEARTBEAT;
- /* is this cpu idle? */
+ /* Is this CPU idle? */
if (idle_cpu(raw_smp_processor_id()))
bits &= ~SCIR_CPU_ACTIVITY;
else
bits |= SCIR_CPU_ACTIVITY;
- /* update system controller interface reg */
+ /* Update system controller interface reg: */
uv_set_scir_bits(bits);
- /* enable next timer period */
+ /* Enable next timer period: */
mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
}
@@ -935,7 +963,7 @@ static int uv_heartbeat_enable(unsigned int cpu)
add_timer_on(timer, cpu);
uv_cpu_scir_info(cpu)->enabled = 1;
- /* also ensure that boot cpu is enabled */
+ /* Also ensure that boot CPU is enabled: */
cpu = 0;
}
return 0;
@@ -968,9 +996,11 @@ static __init int uv_init_heartbeat(void)
{
int cpu;
- if (is_uv_system())
+ if (is_uv_system()) {
for_each_online_cpu(cpu)
uv_heartbeat_enable(cpu);
+ }
+
return 0;
}
@@ -979,14 +1009,10 @@ late_initcall(uv_init_heartbeat);
#endif /* !CONFIG_HOTPLUG_CPU */
/* Direct Legacy VGA I/O traffic to designated IOH */
-int uv_set_vga_state(struct pci_dev *pdev, bool decode,
- unsigned int command_bits, u32 flags)
+int uv_set_vga_state(struct pci_dev *pdev, bool decode, unsigned int command_bits, u32 flags)
{
int domain, bus, rc;
- PR_DEVEL("devfn %x decode %d cmd %x flags %d\n",
- pdev->devfn, decode, command_bits, flags);
-
if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
return 0;
@@ -997,13 +1023,12 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode,
bus = pdev->bus->number;
rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
- PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc);
return rc;
}
/*
- * Called on each cpu to initialize the per_cpu UV data area.
+ * Called on each CPU to initialize the per_cpu UV data area.
* FIXME: hotplug not supported yet
*/
void uv_cpu_init(void)
@@ -1030,90 +1055,80 @@ static void get_mn(struct mn *mnp)
union uvh_rh_gam_config_mmr_u m_n_config;
union uv3h_gr0_gam_gr_config_u m_gr_config;
- m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR);
- mnp->n_val = m_n_config.s.n_skt;
+ /* Make sure the whole structure is well initialized: */
+ memset(mnp, 0, sizeof(*mnp));
+
+ m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR);
+ mnp->n_val = m_n_config.s.n_skt;
+
if (is_uv4_hub()) {
- mnp->m_val = 0;
- mnp->n_lshift = 0;
+ mnp->m_val = 0;
+ mnp->n_lshift = 0;
} else if (is_uv3_hub()) {
- mnp->m_val = m_n_config.s3.m_skt;
- m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG);
- mnp->n_lshift = m_gr_config.s3.m_skt;
+ mnp->m_val = m_n_config.s3.m_skt;
+ m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG);
+ mnp->n_lshift = m_gr_config.s3.m_skt;
} else if (is_uv2_hub()) {
- mnp->m_val = m_n_config.s2.m_skt;
- mnp->n_lshift = mnp->m_val == 40 ? 40 : 39;
+ mnp->m_val = m_n_config.s2.m_skt;
+ mnp->n_lshift = mnp->m_val == 40 ? 40 : 39;
} else if (is_uv1_hub()) {
- mnp->m_val = m_n_config.s1.m_skt;
- mnp->n_lshift = mnp->m_val;
+ mnp->m_val = m_n_config.s1.m_skt;
+ mnp->n_lshift = mnp->m_val;
}
mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0;
}
-void __init uv_init_hub_info(struct uv_hub_info_s *hub_info)
+void __init uv_init_hub_info(struct uv_hub_info_s *hi)
{
- struct mn mn = {0}; /* avoid unitialized warnings */
union uvh_node_id_u node_id;
+ struct mn mn;
get_mn(&mn);
- hub_info->m_val = mn.m_val;
- hub_info->n_val = mn.n_val;
- hub_info->m_shift = mn.m_shift;
- hub_info->n_lshift = mn.n_lshift ? mn.n_lshift : 0;
-
- hub_info->hub_revision = uv_hub_info->hub_revision;
- hub_info->pnode_mask = uv_cpuid.pnode_mask;
- hub_info->min_pnode = _min_pnode;
- hub_info->min_socket = _min_socket;
- hub_info->pnode_to_socket = _pnode_to_socket;
- hub_info->socket_to_node = _socket_to_node;
- hub_info->socket_to_pnode = _socket_to_pnode;
- hub_info->gr_table_len = _gr_table_len;
- hub_info->gr_table = _gr_table;
- hub_info->gpa_mask = mn.m_val ?
+ hi->gpa_mask = mn.m_val ?
(1UL << (mn.m_val + mn.n_val)) - 1 :
(1UL << uv_cpuid.gpa_shift) - 1;
- node_id.v = uv_read_local_mmr(UVH_NODE_ID);
- hub_info->gnode_extra =
- (node_id.s.node_id & ~((1 << mn.n_val) - 1)) >> 1;
-
- hub_info->gnode_upper =
- ((unsigned long)hub_info->gnode_extra << mn.m_val);
+ hi->m_val = mn.m_val;
+ hi->n_val = mn.n_val;
+ hi->m_shift = mn.m_shift;
+ hi->n_lshift = mn.n_lshift ? mn.n_lshift : 0;
+ hi->hub_revision = uv_hub_info->hub_revision;
+ hi->pnode_mask = uv_cpuid.pnode_mask;
+ hi->min_pnode = _min_pnode;
+ hi->min_socket = _min_socket;
+ hi->pnode_to_socket = _pnode_to_socket;
+ hi->socket_to_node = _socket_to_node;
+ hi->socket_to_pnode = _socket_to_pnode;
+ hi->gr_table_len = _gr_table_len;
+ hi->gr_table = _gr_table;
+
+ node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+ uv_cpuid.gnode_shift = max_t(unsigned int, uv_cpuid.gnode_shift, mn.n_val);
+ hi->gnode_extra = (node_id.s.node_id & ~((1 << uv_cpuid.gnode_shift) - 1)) >> 1;
+ if (mn.m_val)
+ hi->gnode_upper = (u64)hi->gnode_extra << mn.m_val;
if (uv_gp_table) {
- hub_info->global_mmr_base = uv_gp_table->mmr_base;
- hub_info->global_mmr_shift = uv_gp_table->mmr_shift;
- hub_info->global_gru_base = uv_gp_table->gru_base;
- hub_info->global_gru_shift = uv_gp_table->gru_shift;
- hub_info->gpa_shift = uv_gp_table->gpa_shift;
- hub_info->gpa_mask = (1UL << hub_info->gpa_shift) - 1;
+ hi->global_mmr_base = uv_gp_table->mmr_base;
+ hi->global_mmr_shift = uv_gp_table->mmr_shift;
+ hi->global_gru_base = uv_gp_table->gru_base;
+ hi->global_gru_shift = uv_gp_table->gru_shift;
+ hi->gpa_shift = uv_gp_table->gpa_shift;
+ hi->gpa_mask = (1UL << hi->gpa_shift) - 1;
} else {
- hub_info->global_mmr_base =
- uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
- ~UV_MMR_ENABLE;
- hub_info->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT;
+ hi->global_mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE;
+ hi->global_mmr_shift = _UV_GLOBAL_MMR64_PNODE_SHIFT;
}
- get_lowmem_redirect(
- &hub_info->lowmem_remap_base, &hub_info->lowmem_remap_top);
-
- hub_info->apic_pnode_shift = uv_cpuid.socketid_shift;
-
- /* show system specific info */
- pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n",
- hub_info->n_val, hub_info->m_val,
- hub_info->m_shift, hub_info->n_lshift);
-
- pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n",
- hub_info->gpa_mask, hub_info->gpa_shift,
- hub_info->pnode_mask, hub_info->apic_pnode_shift);
+ get_lowmem_redirect(&hi->lowmem_remap_base, &hi->lowmem_remap_top);
- pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n",
- hub_info->global_mmr_base, hub_info->global_mmr_shift,
- hub_info->global_gru_base, hub_info->global_gru_shift);
+ hi->apic_pnode_shift = uv_cpuid.socketid_shift;
- pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n",
- hub_info->gnode_upper, hub_info->gnode_extra);
+ /* Show system specific info: */
+ pr_info("UV: N:%d M:%d m_shift:%d n_lshift:%d\n", hi->n_val, hi->m_val, hi->m_shift, hi->n_lshift);
+ pr_info("UV: gpa_mask/shift:0x%lx/%d pnode_mask:0x%x apic_pns:%d\n", hi->gpa_mask, hi->gpa_shift, hi->pnode_mask, hi->apic_pnode_shift);
+ pr_info("UV: mmr_base/shift:0x%lx/%ld gru_base/shift:0x%lx/%ld\n", hi->global_mmr_base, hi->global_mmr_shift, hi->global_gru_base, hi->global_gru_shift);
+ pr_info("UV: gnode_upper:0x%lx gnode_extra:0x%x\n", hi->gnode_upper, hi->gnode_extra);
}
static void __init decode_gam_params(unsigned long ptr)
@@ -1139,12 +1154,9 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
if (!index) {
pr_info("UV: GAM Range Table...\n");
- pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n",
- "Range", "", "Size", "Type", "NASID",
- "SID", "PN");
+ pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN");
}
- pr_info(
- "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n",
+ pr_info("UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n",
index++,
(unsigned long)lgre << UV_GAM_RANGE_SHFT,
(unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
@@ -1162,29 +1174,32 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
if (pnode_max < gre->pnode)
pnode_max = gre->pnode;
}
- _min_socket = sock_min;
- _max_socket = sock_max;
- _min_pnode = pnode_min;
- _max_pnode = pnode_max;
- _gr_table_len = index;
- pr_info(
- "UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n",
- index, _min_socket, _max_socket, _min_pnode, _max_pnode);
+ _min_socket = sock_min;
+ _max_socket = sock_max;
+ _min_pnode = pnode_min;
+ _max_pnode = pnode_max;
+ _gr_table_len = index;
+
+ pr_info("UV: GRT: %d entries, sockets(min:%x,max:%x) pnodes(min:%x,max:%x)\n", index, _min_socket, _max_socket, _min_pnode, _max_pnode);
}
-static void __init decode_uv_systab(void)
+static int __init decode_uv_systab(void)
{
struct uv_systab *st;
int i;
+ if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE)
+ return 0; /* No extended UVsystab required */
+
st = uv_systab;
- if ((!st || st->revision < UV_SYSTAB_VERSION_UV4) && !is_uv4_hub())
- return;
- if (st->revision != UV_SYSTAB_VERSION_UV4_LATEST) {
- pr_crit(
- "UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n",
- st->revision, UV_SYSTAB_VERSION_UV4_LATEST);
- BUG();
+ if ((!st) || (st->revision < UV_SYSTAB_VERSION_UV4_LATEST)) {
+ int rev = st ? st->revision : 0;
+
+ pr_err("UV: BIOS UVsystab version(%x) mismatch, expecting(%x)\n", rev, UV_SYSTAB_VERSION_UV4_LATEST);
+ pr_err("UV: Cannot support UV operations, switching to generic PC\n");
+ uv_system_type = UV_NONE;
+
+ return -EINVAL;
}
for (i = 0; st->entry[i].type != UV_SYSTAB_TYPE_UNUSED; i++) {
@@ -1205,10 +1220,11 @@ static void __init decode_uv_systab(void)
break;
}
}
+ return 0;
}
/*
- * Setup physical blade translations from UVH_NODE_PRESENT_TABLE
+ * Set up physical blade translations from UVH_NODE_PRESENT_TABLE
* .. NB: UVH_NODE_PRESENT_TABLE is going away,
* .. being replaced by GAM Range Table
*/
@@ -1244,14 +1260,13 @@ static void __init build_socket_tables(void)
if (!gre) {
if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) {
pr_info("UV: No UVsystab socket table, ignoring\n");
- return; /* not required */
+ return;
}
- pr_crit(
- "UV: Error: UVsystab address translations not available!\n");
+ pr_crit("UV: Error: UVsystab address translations not available!\n");
BUG();
}
- /* build socket id -> node id, pnode */
+ /* Build socket id -> node id, pnode */
num = maxsock - minsock + 1;
bytes = num * sizeof(_socket_to_node[0]);
_socket_to_node = kmalloc(bytes, GFP_KERNEL);
@@ -1268,27 +1283,27 @@ static void __init build_socket_tables(void)
for (i = 0; i < nump; i++)
_pnode_to_socket[i] = SOCK_EMPTY;
- /* fill in pnode/node/addr conversion list values */
+ /* Fill in pnode/node/addr conversion list values: */
pr_info("UV: GAM Building socket/pnode conversion tables\n");
for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
continue;
i = gre->sockid - minsock;
+ /* Duplicate: */
if (_socket_to_pnode[i] != SOCK_EMPTY)
- continue; /* duplicate */
+ continue;
_socket_to_pnode[i] = gre->pnode;
i = gre->pnode - minpnode;
_pnode_to_socket[i] = gre->sockid;
- pr_info(
- "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
+ pr_info("UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
gre->sockid, gre->type, gre->nasid,
_socket_to_pnode[gre->sockid - minsock],
_pnode_to_socket[gre->pnode - minpnode]);
}
- /* Set socket -> node values */
+ /* Set socket -> node values: */
lnid = -1;
for_each_present_cpu(cpu) {
int nid = cpu_to_node(cpu);
@@ -1304,7 +1319,7 @@ static void __init build_socket_tables(void)
sockid, apicid, nid);
}
- /* Setup physical blade to pnode translation from GAM Range Table */
+ /* Set up physical blade to pnode translation from GAM Range Table: */
bytes = num_possible_nodes() * sizeof(_node_to_pnode[0]);
_node_to_pnode = kmalloc(bytes, GFP_KERNEL);
BUG_ON(!_node_to_pnode);
@@ -1314,8 +1329,7 @@ static void __init build_socket_tables(void)
for (sockid = minsock; sockid <= maxsock; sockid++) {
if (lnid == _socket_to_node[sockid - minsock]) {
- _node_to_pnode[lnid] =
- _socket_to_pnode[sockid - minsock];
+ _node_to_pnode[lnid] = _socket_to_pnode[sockid - minsock];
break;
}
}
@@ -1332,8 +1346,7 @@ static void __init build_socket_tables(void)
pr_info("UV: Checking socket->node/pnode for identity maps\n");
if (minsock == 0) {
for (i = 0; i < num; i++)
- if (_socket_to_node[i] == SOCK_EMPTY ||
- i != _socket_to_node[i])
+ if (_socket_to_node[i] == SOCK_EMPTY || i != _socket_to_node[i])
break;
if (i >= num) {
kfree(_socket_to_node);
@@ -1354,7 +1367,7 @@ static void __init build_socket_tables(void)
}
}
-void __init uv_system_init(void)
+static void __init uv_system_init_hub(void)
{
struct uv_hub_info_s hub_info = {0};
int bytes, cpu, nodeid;
@@ -1372,8 +1385,13 @@ void __init uv_system_init(void)
map_low_mmrs();
- uv_bios_init(); /* get uv_systab for decoding */
- decode_uv_systab();
+ /* Get uv_systab for decoding: */
+ uv_bios_init();
+
+ /* If there's an UVsystab problem then abort UV init: */
+ if (decode_uv_systab() < 0)
+ return;
+
build_socket_tables();
build_uv_gr_table();
uv_init_hub_info(&hub_info);
@@ -1381,14 +1399,10 @@ void __init uv_system_init(void)
if (!_node_to_pnode)
boot_init_possible_blades(&hub_info);
- /* uv_num_possible_blades() is really the hub count */
- pr_info("UV: Found %d hubs, %d nodes, %d cpus\n",
- uv_num_possible_blades(),
- num_possible_nodes(),
- num_possible_cpus());
+ /* uv_num_possible_blades() is really the hub count: */
+ pr_info("UV: Found %d hubs, %d nodes, %d CPUs\n", uv_num_possible_blades(), num_possible_nodes(), num_possible_cpus());
- uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id,
- &sn_region_size, &system_serial_number);
+ uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, &sn_region_size, &system_serial_number);
hub_info.coherency_domain_number = sn_coherency_id;
uv_rtc_init();
@@ -1401,33 +1415,31 @@ void __init uv_system_init(void)
struct uv_hub_info_s *new_hub;
if (__uv_hub_info_list[nodeid]) {
- pr_err("UV: Node %d UV HUB already initialized!?\n",
- nodeid);
+ pr_err("UV: Node %d UV HUB already initialized!?\n", nodeid);
BUG();
}
/* Allocate new per hub info list */
- new_hub = (nodeid == 0) ?
- &uv_hub_info_node0 :
- kzalloc_node(bytes, GFP_KERNEL, nodeid);
+ new_hub = (nodeid == 0) ? &uv_hub_info_node0 : kzalloc_node(bytes, GFP_KERNEL, nodeid);
BUG_ON(!new_hub);
__uv_hub_info_list[nodeid] = new_hub;
new_hub = uv_hub_info_list(nodeid);
BUG_ON(!new_hub);
*new_hub = hub_info;
- /* Use information from GAM table if available */
+ /* Use information from GAM table if available: */
if (_node_to_pnode)
new_hub->pnode = _node_to_pnode[nodeid];
- else /* Fill in during cpu loop */
+ else /* Or fill in during CPU loop: */
new_hub->pnode = 0xffff;
+
new_hub->numa_blade_id = uv_node_to_blade_id(nodeid);
new_hub->memory_nid = -1;
new_hub->nr_possible_cpus = 0;
new_hub->nr_online_cpus = 0;
}
- /* Initialize per cpu info */
+ /* Initialize per CPU info: */
for_each_possible_cpu(cpu) {
int apicid = per_cpu(x86_cpu_to_apicid, cpu);
int numa_node_id;
@@ -1438,22 +1450,24 @@ void __init uv_system_init(void)
pnode = uv_apicid_to_pnode(apicid);
uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid);
- uv_cpu_info_per(cpu)->blade_cpu_id =
- uv_cpu_hub_info(cpu)->nr_possible_cpus++;
+ uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++;
if (uv_cpu_hub_info(cpu)->memory_nid == -1)
uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
- if (nodeid != numa_node_id && /* init memoryless node */
+
+ /* Init memoryless node: */
+ if (nodeid != numa_node_id &&
uv_hub_info_list(numa_node_id)->pnode == 0xffff)
uv_hub_info_list(numa_node_id)->pnode = pnode;
else if (uv_cpu_hub_info(cpu)->pnode == 0xffff)
uv_cpu_hub_info(cpu)->pnode = pnode;
+
uv_cpu_scir_info(cpu)->offset = uv_scir_offset(apicid);
}
for_each_node(nodeid) {
unsigned short pnode = uv_hub_info_list(nodeid)->pnode;
- /* Add pnode info for pre-GAM list nodes without cpus */
+ /* Add pnode info for pre-GAM list nodes without CPUs: */
if (pnode == 0xffff) {
unsigned long paddr;
@@ -1479,15 +1493,30 @@ void __init uv_system_init(void)
uv_scir_register_cpu_notifier();
proc_mkdir("sgi_uv", NULL);
- /* register Legacy VGA I/O redirection handler */
+ /* Register Legacy VGA I/O redirection handler: */
pci_register_set_vga_state(uv_set_vga_state);
/*
* For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
- * EFI is not enabled in the kdump kernel.
+ * EFI is not enabled in the kdump kernel:
*/
if (is_kdump_kernel())
reboot_type = BOOT_ACPI;
}
+/*
+ * There is a small amount of UV specific code needed to initialize a
+ * UV system that does not have a "UV HUB" (referred to as "hubless").
+ */
+void __init uv_system_init(void)
+{
+ if (likely(!is_uv_system() && !is_uv_hubless()))
+ return;
+
+ if (is_uv_system())
+ uv_system_init_hub();
+ else
+ uv_nmi_setup_hubless();
+}
+
apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 51287cd90bf6..5a414545e8a3 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -218,7 +218,8 @@
#include <linux/apm_bios.h>
#include <linux/init.h>
#include <linux/time.h>
-#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/cputime.h>
#include <linux/pm.h>
#include <linux/capability.h>
#include <linux/device.h>
@@ -234,7 +235,7 @@
#include <linux/i8253.h>
#include <linux/cpuidle.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/desc.h>
#include <asm/olpc.h>
#include <asm/paravirt.h>
@@ -905,21 +906,21 @@ static int apm_cpu_idle(struct cpuidle_device *dev,
{
static int use_apm_idle; /* = 0 */
static unsigned int last_jiffies; /* = 0 */
- static unsigned int last_stime; /* = 0 */
- cputime_t stime;
+ static u64 last_stime; /* = 0 */
+ u64 stime, utime;
int apm_idle_done = 0;
unsigned int jiffies_since_last_check = jiffies - last_jiffies;
unsigned int bucket;
recalc:
- task_cputime(current, NULL, &stime);
+ task_cputime(current, &utime, &stime);
if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
use_apm_idle = 0;
} else if (jiffies_since_last_check > idle_period) {
unsigned int idle_percentage;
- idle_percentage = cputime_to_jiffies(stime - last_stime);
+ idle_percentage = nsecs_to_jiffies(stime - last_stime);
idle_percentage *= 100;
idle_percentage /= jiffies_since_last_check;
use_apm_idle = (idle_percentage > idle_threshold);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index c62e015b126c..de827d6ac8c2 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -81,6 +81,7 @@ void common(void) {
BLANK();
OFFSET(BP_scratch, boot_params, scratch);
+ OFFSET(BP_secure_boot, boot_params, secure_boot);
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 210927ee2e74..99332f550c48 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -13,6 +13,10 @@ static char syscalls_ia32[] = {
#include <asm/syscalls_32.h>
};
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#include <asm/kvm_para.h>
+#endif
+
int main(void)
{
#ifdef CONFIG_PARAVIRT
@@ -22,6 +26,11 @@ int main(void)
BLANK();
#endif
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+ OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
+ BLANK();
+#endif
+
#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
ENTRY(bx);
ENTRY(cx);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4a8697f7d4ef..52000010c62e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -20,13 +20,11 @@ obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
+obj-y += bugs.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
-obj-$(CONFIG_X86_32) += bugs.o
-obj-$(CONFIG_X86_64) += bugs_64.o
-
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
@@ -34,6 +32,8 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
+obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
+
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_MICROCODE) += microcode/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1e81a37c034e..c36140d788fe 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
#include <linux/io.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/random.h>
#include <asm/processor.h>
#include <asm/apic.h>
@@ -20,6 +21,10 @@
#include "cpu.h"
+static const int amd_erratum_383[];
+static const int amd_erratum_400[];
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+
/*
* nodes_per_socket: Stores the number of nodes per socket.
* Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
@@ -308,17 +313,43 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
u32 eax, ebx, ecx, edx;
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- node_id = ecx & 7;
- /* get compute unit information */
- smp_num_siblings = ((ebx >> 8) & 3) + 1;
- c->x86_max_cores /= smp_num_siblings;
- c->cpu_core_id = ebx & 0xff;
+ node_id = ecx & 0xff;
+ smp_num_siblings = ((ebx >> 8) & 0xff) + 1;
+
+ if (c->x86 == 0x15)
+ c->cu_id = ebx & 0xff;
+
+ if (c->x86 >= 0x17) {
+ c->cpu_core_id = ebx & 0xff;
+
+ if (smp_num_siblings > 1)
+ c->x86_max_cores /= smp_num_siblings;
+ }
+
+ /*
+ * We may have multiple LLCs if L3 caches exist, so check if we
+ * have an L3 cache by looking at the L3 cache CPUID leaf.
+ */
+ if (cpuid_edx(0x80000006)) {
+ if (c->x86 == 0x17) {
+ /*
+ * LLC is at the core complex level.
+ * Core complex id is ApicId[3].
+ */
+ per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ } else {
+ /* LLC is at the node level. */
+ per_cpu(cpu_llc_id, cpu) = node_id;
+ }
+ }
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
node_id = value & 7;
+
+ per_cpu(cpu_llc_id, cpu) = node_id;
} else
return;
@@ -329,9 +360,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_AMD_DCM);
cus_per_node = c->x86_max_cores / nodes_per_socket;
- /* store NodeID, use llc_shared_map to store sibling info */
- per_cpu(cpu_llc_id, cpu) = node_id;
-
/* core id has to be in the [0 .. cores_per_node - 1] range */
c->cpu_core_id %= cus_per_node;
}
@@ -356,15 +384,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
amd_get_topology(c);
-
- /*
- * Fix percpu cpu_llc_id here as LLC topology is different
- * for Fam17h systems.
- */
- if (c->x86 != 0x17 || !cpuid_edx(0x80000006))
- return;
-
- per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
#endif
}
@@ -537,8 +556,6 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
- if (!check_tsc_unstable())
- set_sched_clock_stable();
}
/* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
@@ -585,11 +602,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
/* F16h erratum 793, CVE-2013-6885 */
if (c->x86 == 0x16 && c->x86_model <= 0xf)
msr_set_bit(MSR_AMD64_LS_CFG, 15);
-}
-static const int amd_erratum_383[];
-static const int amd_erratum_400[];
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+ /*
+ * Check whether the machine is affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the check
+ * whether the machine is affected in arch_post_acpi_init(), which
+ * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (cpu_has_amd_erratum(c, amd_erratum_400))
+ set_cpu_bug(c, X86_BUG_AMD_E400);
+}
static void init_amd_k8(struct cpuinfo_x86 *c)
{
@@ -770,9 +792,6 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
- if (cpu_has_amd_erratum(c, amd_erratum_400))
- set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
-
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
/* 3DNow or LM implies PREFETCHW */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bd17db15a2c1..a44ef52184df 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -16,15 +16,19 @@
#include <asm/msr.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
void __init check_bugs(void)
{
identify_boot_cpu();
-#ifndef CONFIG_SMP
- pr_info("CPU: ");
- print_cpu_info(&boot_cpu_data);
-#endif
+ if (!IS_ENABLED(CONFIG_SMP)) {
+ pr_info("CPU: ");
+ print_cpu_info(&boot_cpu_data);
+ }
+
+#ifdef CONFIG_X86_32
/*
* Check whether we are able to run this kernel safely on SMP.
*
@@ -40,4 +44,18 @@ void __init check_bugs(void)
alternative_instructions();
fpu__init_check_bugs();
+#else /* CONFIG_X86_64 */
+ alternative_instructions();
+
+ /*
+ * Make sure the first 2MB area is not mapped by huge pages
+ * There are typically fixed size MTRRs in there and overlapping
+ * MTRRs into large pages causes slow downs.
+ *
+ * Right now we don't do that with gbpages because there seems
+ * very little benefit for that case.
+ */
+ if (!direct_gbpages)
+ set_memory_4k((unsigned long)__va(0), 1);
+#endif
}
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
deleted file mode 100644
index a972ac4c7e7d..000000000000
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- * Copyright (C) 2000 SuSE
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/alternative.h>
-#include <asm/bugs.h>
-#include <asm/processor.h>
-#include <asm/mtrr.h>
-#include <asm/cacheflush.h>
-
-void __init check_bugs(void)
-{
- identify_boot_cpu();
-#if !defined(CONFIG_SMP)
- pr_info("CPU: ");
- print_cpu_info(&boot_cpu_data);
-#endif
- alternative_instructions();
-
- /*
- * Make sure the first 2MB area is not mapped by huge pages
- * There are typically fixed size MTRRs in there and overlapping
- * MTRRs into large pages causes slow downs.
- *
- * Right now we don't do that with gbpages because there seems
- * very little benefit for that case.
- */
- if (!direct_gbpages)
- set_memory_4k((unsigned long)__va(0), 1);
-}
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 1661d8ec9280..43955ee6715b 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,5 +1,6 @@
-#include <linux/bitops.h>
-#include <linux/kernel.h>
+
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <asm/cpufeature.h>
#include <asm/e820.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc9e980c68ec..58094a1f9e9d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -7,7 +7,9 @@
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/delay.h>
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/task.h>
#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/kgdb.h>
@@ -35,6 +37,7 @@
#include <asm/desc.h>
#include <asm/fpu/internal.h>
#include <asm/mtrr.h>
+#include <asm/hwcap2.h>
#include <linux/numa.h>
#include <asm/asm.h>
#include <asm/bugs.h>
@@ -51,6 +54,8 @@
#include "cpu.h"
+u32 elf_hwcap2 __read_mostly;
+
/* all of these masks are initialized in setup_cpu_local_masks() */
cpumask_var_t cpu_initialized_mask;
cpumask_var_t cpu_callout_mask;
@@ -655,6 +660,16 @@ void cpu_detect(struct cpuinfo_x86 *c)
}
}
+static void apply_forced_caps(struct cpuinfo_x86 *c)
+{
+ int i;
+
+ for (i = 0; i < NCAPINTS; i++) {
+ c->x86_capability[i] &= ~cpu_caps_cleared[i];
+ c->x86_capability[i] |= cpu_caps_set[i];
+ }
+}
+
void get_cpu_cap(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
@@ -667,13 +682,14 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_1_EDX] = edx;
}
+ /* Thermal and Power Management Leaf: level 0x00000006 (eax) */
+ if (c->cpuid_level >= 0x00000006)
+ c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+
/* Additional Intel-defined flags: level 0x00000007 */
if (c->cpuid_level >= 0x00000007) {
cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
-
c->x86_capability[CPUID_7_0_EBX] = ebx;
-
- c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
c->x86_capability[CPUID_7_ECX] = ecx;
}
@@ -747,6 +763,13 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
init_scattered_cpuid_features(c);
+
+ /*
+ * Clear/Set all flags overridden by options, after probe.
+ * This needs to happen each time we re-probe, which may happen
+ * several times during CPU initialization.
+ */
+ apply_forced_caps(c);
}
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -800,14 +823,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
memset(&c->x86_capability, 0, sizeof c->x86_capability);
c->extended_cpuid_level = 0;
- if (!have_cpuid_p())
- identify_cpu_without_cpuid(c);
-
/* cyrix could have cpuid enabled via c_identify()*/
if (have_cpuid_p()) {
cpu_detect(c);
get_cpu_vendor(c);
get_cpu_cap(c);
+ setup_force_cpu_cap(X86_FEATURE_CPUID);
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
@@ -817,6 +838,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_bsp_init)
this_cpu->c_bsp_init(c);
+ } else {
+ identify_cpu_without_cpuid(c);
+ setup_clear_cpu_cap(X86_FEATURE_CPUID);
}
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
@@ -979,29 +1003,21 @@ static void x86_init_cache_qos(struct cpuinfo_x86 *c)
}
/*
- * The physical to logical package id mapping is initialized from the
- * acpi/mptables information. Make sure that CPUID actually agrees with
- * that.
+ * Validate that ACPI/mptables have the same information about the
+ * effective APIC id and update the package map.
*/
-static void sanitize_package_id(struct cpuinfo_x86 *c)
+static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
- unsigned int pkg, apicid, cpu = smp_processor_id();
+ unsigned int apicid, cpu = smp_processor_id();
apicid = apic->cpu_present_to_apicid(cpu);
- pkg = apicid >> boot_cpu_data.x86_coreid_bits;
- if (apicid != c->initial_apicid) {
- pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x CPUID: %x\n",
+ if (apicid != c->apicid) {
+ pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
cpu, apicid, c->initial_apicid);
- c->initial_apicid = apicid;
- }
- if (pkg != c->phys_proc_id) {
- pr_err(FW_BUG "CPU%u: Using firmware package id %u instead of %u\n",
- cpu, pkg, c->phys_proc_id);
- c->phys_proc_id = pkg;
}
- c->logical_proc_id = topology_phys_to_logical_pkg(pkg);
+ BUG_ON(topology_update_package_map(c->phys_proc_id, cpu));
#else
c->logical_proc_id = 0;
#endif
@@ -1022,6 +1038,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
c->x86_model_id[0] = '\0'; /* Unset */
c->x86_max_cores = 1;
c->x86_coreid_bits = 0;
+ c->cu_id = 0xff;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -1041,10 +1058,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_identify(c);
/* Clear/Set all flags overridden by options, after probe */
- for (i = 0; i < NCAPINTS; i++) {
- c->x86_capability[i] &= ~cpu_caps_cleared[i];
- c->x86_capability[i] |= cpu_caps_set[i];
- }
+ apply_forced_caps(c);
#ifdef CONFIG_X86_64
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
@@ -1103,10 +1117,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
* Clear/Set all flags overridden by options, need do it
* before following smp all cpus cap AND.
*/
- for (i = 0; i < NCAPINTS; i++) {
- c->x86_capability[i] &= ~cpu_caps_cleared[i];
- c->x86_capability[i] |= cpu_caps_set[i];
- }
+ apply_forced_caps(c);
/*
* On SMP, boot_cpu_data holds the common feature set between
@@ -1132,7 +1143,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#ifdef CONFIG_NUMA
numa_add_cpu(smp_processor_id());
#endif
- sanitize_package_id(c);
}
/*
@@ -1172,7 +1182,6 @@ void enable_sep_cpu(void)
void __init identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
- init_amd_e400_c1e_mask();
#ifdef CONFIG_X86_32
sysenter_setup();
enable_sep_cpu();
@@ -1188,53 +1197,9 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
enable_sep_cpu();
#endif
mtrr_ap_init();
+ validate_apic_and_package_id(c);
}
-struct msr_range {
- unsigned min;
- unsigned max;
-};
-
-static const struct msr_range msr_range_array[] = {
- { 0x00000000, 0x00000418},
- { 0xc0000000, 0xc000040b},
- { 0xc0010000, 0xc0010142},
- { 0xc0011000, 0xc001103b},
-};
-
-static void __print_cpu_msr(void)
-{
- unsigned index_min, index_max;
- unsigned index;
- u64 val;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
- index_min = msr_range_array[i].min;
- index_max = msr_range_array[i].max;
-
- for (index = index_min; index < index_max; index++) {
- if (rdmsrl_safe(index, &val))
- continue;
- pr_info(" MSR%08x: %016llx\n", index, val);
- }
- }
-}
-
-static int show_msr;
-
-static __init int setup_show_msr(char *arg)
-{
- int num;
-
- get_option(&arg, &num);
-
- if (num > 0)
- show_msr = num;
- return 1;
-}
-__setup("show_msr=", setup_show_msr);
-
static __init int setup_noclflush(char *arg)
{
setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
@@ -1268,21 +1233,13 @@ void print_cpu_info(struct cpuinfo_x86 *c)
pr_cont(", stepping: 0x%x)\n", c->x86_mask);
else
pr_cont(")\n");
-
- print_cpu_msr(c);
-}
-
-void print_cpu_msr(struct cpuinfo_x86 *c)
-{
- if (c->cpu_index < show_msr)
- __print_cpu_msr();
}
static __init int setup_disablecpuid(char *arg)
{
int bit;
- if (get_option(&arg, &bit) && bit < NCAPINTS*32)
+ if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32)
setup_clear_cpu_cap(bit);
else
return 0;
@@ -1490,11 +1447,8 @@ void cpu_init(void)
*/
cr4_init_shadow();
- /*
- * Load microcode on this cpu if a valid microcode is available.
- * This is early microcode loading procedure.
- */
- load_ucode_ap();
+ if (cpu)
+ load_ucode_ap();
t = &per_cpu(cpu_tss, cpu);
oist = &per_cpu(orig_ist, cpu);
@@ -1555,7 +1509,7 @@ void cpu_init(void)
for (i = 0; i <= IO_BITMAP_LONGS; i++)
t->io_bitmap[i] = ~0UL;
- atomic_inc(&init_mm.mm_count);
+ mmgrab(&init_mm);
me->active_mm = &init_mm;
BUG_ON(me->mm);
enter_lazy_tlb(&init_mm, me);
@@ -1606,7 +1560,7 @@ void cpu_init(void)
/*
* Set up and load the per-CPU TSS and LDT
*/
- atomic_inc(&init_mm.mm_count);
+ mmgrab(&init_mm);
curr->active_mm = &init_mm;
BUG_ON(curr->mm);
enter_lazy_tlb(&init_mm, curr);
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index bd9dcd6b712d..a70fd61095f8 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -9,6 +9,8 @@
#include <asm/pci-direct.h>
#include <asm/tsc.h>
#include <asm/cpufeature.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include "cpu.h"
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fcd484d2bb03..063197771b8d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -4,6 +4,7 @@
#include <linux/bitops.h>
#include <linux/smp.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/thread_info.h>
#include <linux/init.h>
#include <linux/uaccess.h>
@@ -14,6 +15,9 @@
#include <asm/bugs.h>
#include <asm/cpu.h>
#include <asm/intel-family.h>
+#include <asm/microcode_intel.h>
+#include <asm/hwcap2.h>
+#include <asm/elf.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -61,6 +65,46 @@ void check_mpx_erratum(struct cpuinfo_x86 *c)
}
}
+static bool ring3mwait_disabled __read_mostly;
+
+static int __init ring3mwait_disable(char *__unused)
+{
+ ring3mwait_disabled = true;
+ return 0;
+}
+__setup("ring3mwait=disable", ring3mwait_disable);
+
+static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
+{
+ /*
+ * Ring 3 MONITOR/MWAIT feature cannot be detected without
+ * cpu model and family comparison.
+ */
+ if (c->x86 != 6)
+ return;
+ switch (c->x86_model) {
+ case INTEL_FAM6_XEON_PHI_KNL:
+ case INTEL_FAM6_XEON_PHI_KNM:
+ break;
+ default:
+ return;
+ }
+
+ if (ring3mwait_disabled) {
+ msr_clear_bit(MSR_MISC_FEATURE_ENABLES,
+ MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT);
+ return;
+ }
+
+ msr_set_bit(MSR_MISC_FEATURE_ENABLES,
+ MSR_MISC_FEATURE_ENABLES_RING3MWAIT_BIT);
+
+ set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
+
+ if (c == &boot_cpu_data)
+ ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
+}
+
static void early_init_intel(struct cpuinfo_x86 *c)
{
u64 misc_enable;
@@ -78,14 +122,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
(c->x86 == 0x6 && c->x86_model >= 0x0e))
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
- unsigned lower_word;
-
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
- /* Required by the SDM */
- sync_core();
- rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
- }
+ if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
+ c->microcode = intel_get_microcode_revision();
/*
* Atom erratum AAE44/AAF40/AAG38/AAH41:
@@ -124,8 +162,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
if (c->x86_power & (1 << 8)) {
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
- if (!check_tsc_unstable())
- set_sched_clock_stable();
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
@@ -565,6 +601,8 @@ static void init_intel(struct cpuinfo_x86 *c)
detect_vmx_virtcap(c);
init_intel_energy_perf(c);
+
+ probe_xeon_phi_r3mwait(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index de6626c18e42..c55fb2cb2acc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -11,6 +11,7 @@
#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/capability.h>
#include <linux/sysfs.h>
#include <linux/pci.h>
@@ -153,6 +154,7 @@ struct _cpuid4_info_regs {
union _cpuid4_leaf_eax eax;
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
+ unsigned int id;
unsigned long size;
struct amd_northbridge *nb;
};
@@ -894,6 +896,8 @@ static void __cache_cpumap_setup(unsigned int cpu, int index,
static void ci_leaf_init(struct cacheinfo *this_leaf,
struct _cpuid4_info_regs *base)
{
+ this_leaf->id = base->id;
+ this_leaf->attributes = CACHE_ID;
this_leaf->level = base->eax.split.level;
this_leaf->type = cache_type_map[base->eax.split.type];
this_leaf->coherency_line_size =
@@ -920,6 +924,22 @@ static int __init_cache_level(unsigned int cpu)
return 0;
}
+/*
+ * The max shared threads number comes from CPUID.4:EAX[25-14] with input
+ * ECX as cache index. Then right shift apicid by the number's order to get
+ * cache id for this cache node.
+ */
+static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ unsigned long num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ id4_regs->id = c->apicid >> index_msb;
+}
+
static int __populate_cache_leaves(unsigned int cpu)
{
unsigned int idx, ret;
@@ -931,9 +951,12 @@ static int __populate_cache_leaves(unsigned int cpu)
ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
if (ret)
return ret;
+ get_cache_id(cpu, &id4_regs);
ci_leaf_init(this_leaf++, &id4_regs);
__cache_cpumap_setup(cpu, idx, &id4_regs);
}
+ this_cpu_ci->cpu_map_populated = true;
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
new file mode 100644
index 000000000000..5a533fefefa0
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -0,0 +1,403 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/intel-family.h>
+#include <asm/intel_rdt.h>
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
+DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+
+#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
+
+struct rdt_resource rdt_resources_all[] = {
+ {
+ .name = "L3",
+ .domains = domain_init(RDT_RESOURCE_L3),
+ .msr_base = IA32_L3_CBM_BASE,
+ .min_cbm_bits = 1,
+ .cache_level = 3,
+ .cbm_idx_multi = 1,
+ .cbm_idx_offset = 0
+ },
+ {
+ .name = "L3DATA",
+ .domains = domain_init(RDT_RESOURCE_L3DATA),
+ .msr_base = IA32_L3_CBM_BASE,
+ .min_cbm_bits = 1,
+ .cache_level = 3,
+ .cbm_idx_multi = 2,
+ .cbm_idx_offset = 0
+ },
+ {
+ .name = "L3CODE",
+ .domains = domain_init(RDT_RESOURCE_L3CODE),
+ .msr_base = IA32_L3_CBM_BASE,
+ .min_cbm_bits = 1,
+ .cache_level = 3,
+ .cbm_idx_multi = 2,
+ .cbm_idx_offset = 1
+ },
+ {
+ .name = "L2",
+ .domains = domain_init(RDT_RESOURCE_L2),
+ .msr_base = IA32_L2_CBM_BASE,
+ .min_cbm_bits = 1,
+ .cache_level = 2,
+ .cbm_idx_multi = 1,
+ .cbm_idx_offset = 0
+ },
+};
+
+static int cbm_idx(struct rdt_resource *r, int closid)
+{
+ return closid * r->cbm_idx_multi + r->cbm_idx_offset;
+}
+
+/*
+ * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
+ * as they do not have CPUID enumeration support for Cache allocation.
+ * The check for Vendor/Family/Model is not enough to guarantee that
+ * the MSRs won't #GP fault because only the following SKUs support
+ * CAT:
+ * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz
+ * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz
+ * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz
+ * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz
+ *
+ * Probe by trying to write the first of the L3 cach mask registers
+ * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
+ * is always 20 on hsw server parts. The minimum cache bitmask length
+ * allowed for HSW server is always 2 bits. Hardcode all of them.
+ */
+static inline bool cache_alloc_hsw_probe(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86 == 6 &&
+ boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) {
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+ u32 l, h, max_cbm = BIT_MASK(20) - 1;
+
+ if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
+ return false;
+ rdmsr(IA32_L3_CBM_BASE, l, h);
+
+ /* If all the bits were set in MSR, return success */
+ if (l != max_cbm)
+ return false;
+
+ r->num_closid = 4;
+ r->cbm_len = 20;
+ r->max_cbm = max_cbm;
+ r->min_cbm_bits = 2;
+ r->capable = true;
+ r->enabled = true;
+
+ return true;
+ }
+
+ return false;
+}
+
+static void rdt_get_config(int idx, struct rdt_resource *r)
+{
+ union cpuid_0x10_1_eax eax;
+ union cpuid_0x10_1_edx edx;
+ u32 ebx, ecx;
+
+ cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
+ r->num_closid = edx.split.cos_max + 1;
+ r->cbm_len = eax.split.cbm_len + 1;
+ r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->capable = true;
+ r->enabled = true;
+}
+
+static void rdt_get_cdp_l3_config(int type)
+{
+ struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct rdt_resource *r = &rdt_resources_all[type];
+
+ r->num_closid = r_l3->num_closid / 2;
+ r->cbm_len = r_l3->cbm_len;
+ r->max_cbm = r_l3->max_cbm;
+ r->capable = true;
+ /*
+ * By default, CDP is disabled. CDP can be enabled by mount parameter
+ * "cdp" during resctrl file system mount time.
+ */
+ r->enabled = false;
+}
+
+static inline bool get_rdt_resources(void)
+{
+ bool ret = false;
+
+ if (cache_alloc_hsw_probe())
+ return true;
+
+ if (!boot_cpu_has(X86_FEATURE_RDT_A))
+ return false;
+
+ if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
+ rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+ if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
+ rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
+ rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
+ }
+ ret = true;
+ }
+ if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
+ /* CPUID 0x10.2 fields are same format at 0x10.1 */
+ rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+ ret = true;
+ }
+
+ return ret;
+}
+
+static int get_cache_id(int cpu, int level)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
+ int i;
+
+ for (i = 0; i < ci->num_leaves; i++) {
+ if (ci->info_list[i].level == level)
+ return ci->info_list[i].id;
+ }
+
+ return -1;
+}
+
+void rdt_cbm_update(void *arg)
+{
+ struct msr_param *m = (struct msr_param *)arg;
+ struct rdt_resource *r = m->res;
+ int i, cpu = smp_processor_id();
+ struct rdt_domain *d;
+
+ list_for_each_entry(d, &r->domains, list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->cpu_mask))
+ goto found;
+ }
+ pr_info_once("cpu %d not found in any domain for resource %s\n",
+ cpu, r->name);
+
+ return;
+
+found:
+ for (i = m->low; i < m->high; i++) {
+ int idx = cbm_idx(r, i);
+
+ wrmsrl(r->msr_base + idx, d->cbm[i]);
+ }
+}
+
+/*
+ * rdt_find_domain - Find a domain in a resource that matches input resource id
+ *
+ * Search resource r's domain list to find the resource id. If the resource
+ * id is found in a domain, return the domain. Otherwise, if requested by
+ * caller, return the first domain whose id is bigger than the input id.
+ * The domain list is sorted by id in ascending order.
+ */
+static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+ struct list_head **pos)
+{
+ struct rdt_domain *d;
+ struct list_head *l;
+
+ if (id < 0)
+ return ERR_PTR(id);
+
+ list_for_each(l, &r->domains) {
+ d = list_entry(l, struct rdt_domain, list);
+ /* When id is found, return its domain. */
+ if (id == d->id)
+ return d;
+ /* Stop searching when finding id's position in sorted list. */
+ if (id < d->id)
+ break;
+ }
+
+ if (pos)
+ *pos = l;
+
+ return NULL;
+}
+
+/*
+ * domain_add_cpu - Add a cpu to a resource's domain list.
+ *
+ * If an existing domain in the resource r's domain list matches the cpu's
+ * resource id, add the cpu in the domain.
+ *
+ * Otherwise, a new domain is allocated and inserted into the right position
+ * in the domain list sorted by id in ascending order.
+ *
+ * The order in the domain list is visible to users when we print entries
+ * in the schemata file and schemata input is validated to have the same order
+ * as this list.
+ */
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
+{
+ int i, id = get_cache_id(cpu, r->cache_level);
+ struct list_head *add_pos = NULL;
+ struct rdt_domain *d;
+
+ d = rdt_find_domain(r, id, &add_pos);
+ if (IS_ERR(d)) {
+ pr_warn("Could't find cache id for cpu %d\n", cpu);
+ return;
+ }
+
+ if (d) {
+ cpumask_set_cpu(cpu, &d->cpu_mask);
+ return;
+ }
+
+ d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu));
+ if (!d)
+ return;
+
+ d->id = id;
+
+ d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL);
+ if (!d->cbm) {
+ kfree(d);
+ return;
+ }
+
+ for (i = 0; i < r->num_closid; i++) {
+ int idx = cbm_idx(r, i);
+
+ d->cbm[i] = r->max_cbm;
+ wrmsrl(r->msr_base + idx, d->cbm[i]);
+ }
+
+ cpumask_set_cpu(cpu, &d->cpu_mask);
+ list_add_tail(&d->list, add_pos);
+ r->num_domains++;
+}
+
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ int id = get_cache_id(cpu, r->cache_level);
+ struct rdt_domain *d;
+
+ d = rdt_find_domain(r, id, NULL);
+ if (IS_ERR_OR_NULL(d)) {
+ pr_warn("Could't find cache id for cpu %d\n", cpu);
+ return;
+ }
+
+ cpumask_clear_cpu(cpu, &d->cpu_mask);
+ if (cpumask_empty(&d->cpu_mask)) {
+ r->num_domains--;
+ kfree(d->cbm);
+ list_del(&d->list);
+ kfree(d);
+ }
+}
+
+static void clear_closid(int cpu)
+{
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+ per_cpu(cpu_closid, cpu) = 0;
+ state->closid = 0;
+ wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
+}
+
+static int intel_rdt_online_cpu(unsigned int cpu)
+{
+ struct rdt_resource *r;
+
+ mutex_lock(&rdtgroup_mutex);
+ for_each_capable_rdt_resource(r)
+ domain_add_cpu(cpu, r);
+ /* The cpu is set in default rdtgroup after online. */
+ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+ clear_closid(cpu);
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static int intel_rdt_offline_cpu(unsigned int cpu)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+
+ mutex_lock(&rdtgroup_mutex);
+ for_each_capable_rdt_resource(r)
+ domain_remove_cpu(cpu, r);
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask))
+ break;
+ }
+ clear_closid(cpu);
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static int __init intel_rdt_late_init(void)
+{
+ struct rdt_resource *r;
+ int state, ret;
+
+ if (!get_rdt_resources())
+ return -ENODEV;
+
+ state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/rdt/cat:online:",
+ intel_rdt_online_cpu, intel_rdt_offline_cpu);
+ if (state < 0)
+ return state;
+
+ ret = rdtgroup_init();
+ if (ret) {
+ cpuhp_remove_state(state);
+ return ret;
+ }
+
+ for_each_capable_rdt_resource(r)
+ pr_info("Intel RDT %s allocation detected\n", r->name);
+
+ return 0;
+}
+
+late_initcall(intel_rdt_late_init);
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
new file mode 100644
index 000000000000..9ac2a5cdd9c2
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -0,0 +1,1115 @@
+/*
+ * User interface for Resource Alloction in Resource Director Technology(RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/task_work.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_common.h>
+
+DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
+struct kernfs_root *rdt_root;
+struct rdtgroup rdtgroup_default;
+LIST_HEAD(rdt_all_groups);
+
+/* Kernel fs node for "info" directory under root */
+static struct kernfs_node *kn_info;
+
+/*
+ * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
+ * we can keep a bitmap of free CLOSIDs in a single integer.
+ *
+ * Using a global CLOSID across all resources has some advantages and
+ * some drawbacks:
+ * + We can simply set "current->closid" to assign a task to a resource
+ * group.
+ * + Context switch code can avoid extra memory references deciding which
+ * CLOSID to load into the PQR_ASSOC MSR
+ * - We give up some options in configuring resource groups across multi-socket
+ * systems.
+ * - Our choices on how to configure each resource become progressively more
+ * limited as the number of resources grows.
+ */
+static int closid_free_map;
+
+static void closid_init(void)
+{
+ struct rdt_resource *r;
+ int rdt_min_closid = 32;
+
+ /* Compute rdt_min_closid across all resources */
+ for_each_enabled_rdt_resource(r)
+ rdt_min_closid = min(rdt_min_closid, r->num_closid);
+
+ closid_free_map = BIT_MASK(rdt_min_closid) - 1;
+
+ /* CLOSID 0 is always reserved for the default group */
+ closid_free_map &= ~1;
+}
+
+int closid_alloc(void)
+{
+ int closid = ffs(closid_free_map);
+
+ if (closid == 0)
+ return -ENOSPC;
+ closid--;
+ closid_free_map &= ~(1 << closid);
+
+ return closid;
+}
+
+static void closid_free(int closid)
+{
+ closid_free_map |= 1 << closid;
+}
+
+/* set uid and gid of rdtgroup dirs and files to that of the creator */
+static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
+
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
+
+ return kernfs_setattr(kn, &iattr);
+}
+
+static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
+{
+ struct kernfs_node *kn;
+ int ret;
+
+ kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
+ 0, rft->kf_ops, rft, NULL, NULL);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts,
+ int len)
+{
+ struct rftype *rft;
+ int ret;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ for (rft = rfts; rft < rfts + len; rft++) {
+ ret = rdtgroup_add_file(kn, rft);
+ if (ret)
+ goto error;
+ }
+
+ return 0;
+error:
+ pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
+ while (--rft >= rfts)
+ kernfs_remove_by_name(kn, rft->name);
+ return ret;
+}
+
+static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+ struct kernfs_open_file *of = m->private;
+ struct rftype *rft = of->kn->priv;
+
+ if (rft->seq_show)
+ return rft->seq_show(of, m, arg);
+ return 0;
+}
+
+static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct rftype *rft = of->kn->priv;
+
+ if (rft->write)
+ return rft->write(of, buf, nbytes, off);
+
+ return -EINVAL;
+}
+
+static struct kernfs_ops rdtgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = rdtgroup_file_write,
+ .seq_show = rdtgroup_seqfile_show,
+};
+
+static int rdtgroup_cpus_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ if (rdtgrp)
+ seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask));
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+/*
+ * This is safe against intel_rdt_sched_in() called from __switch_to()
+ * because __switch_to() is executed with interrupts disabled. A local call
+ * from rdt_update_closid() is proteced against __switch_to() because
+ * preemption is disabled.
+ */
+static void rdt_update_cpu_closid(void *closid)
+{
+ if (closid)
+ this_cpu_write(cpu_closid, *(int *)closid);
+ /*
+ * We cannot unconditionally write the MSR because the current
+ * executing task might have its own closid selected. Just reuse
+ * the context switch code.
+ */
+ intel_rdt_sched_in();
+}
+
+/*
+ * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
+ *
+ * Per task closids must have been set up before calling this function.
+ *
+ * The per cpu closids are updated with the smp function call, when @closid
+ * is not NULL. If @closid is NULL then all affected percpu closids must
+ * have been set up before calling this function.
+ */
+static void
+rdt_update_closid(const struct cpumask *cpu_mask, int *closid)
+{
+ int cpu = get_cpu();
+
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ rdt_update_cpu_closid(closid);
+ smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1);
+ put_cpu();
+}
+
+static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ cpumask_var_t tmpmask, newmask;
+ struct rdtgroup *rdtgrp, *r;
+ int ret;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+ if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
+ free_cpumask_var(tmpmask);
+ return -ENOMEM;
+ }
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ ret = cpumask_parse(buf, newmask);
+ if (ret)
+ goto unlock;
+
+ /* check that user didn't specify any offline cpus */
+ cpumask_andnot(tmpmask, newmask, cpu_online_mask);
+ if (cpumask_weight(tmpmask)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ /* Check whether cpus are dropped from this group */
+ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+ if (cpumask_weight(tmpmask)) {
+ /* Can't drop from default group */
+ if (rdtgrp == &rdtgroup_default) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+ /* Give any dropped cpus to rdtgroup_default */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, tmpmask);
+ rdt_update_closid(tmpmask, &rdtgroup_default.closid);
+ }
+
+ /*
+ * If we added cpus, remove them from previous group that owned them
+ * and update per-cpu closid
+ */
+ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+ if (cpumask_weight(tmpmask)) {
+ list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
+ if (r == rdtgrp)
+ continue;
+ cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask);
+ }
+ rdt_update_closid(tmpmask, &rdtgrp->closid);
+ }
+
+ /* Done pushing/pulling - update this group with new mask */
+ cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+unlock:
+ rdtgroup_kn_unlock(of->kn);
+ free_cpumask_var(tmpmask);
+ free_cpumask_var(newmask);
+
+ return ret ?: nbytes;
+}
+
+struct task_move_callback {
+ struct callback_head work;
+ struct rdtgroup *rdtgrp;
+};
+
+static void move_myself(struct callback_head *head)
+{
+ struct task_move_callback *callback;
+ struct rdtgroup *rdtgrp;
+
+ callback = container_of(head, struct task_move_callback, work);
+ rdtgrp = callback->rdtgrp;
+
+ /*
+ * If resource group was deleted before this task work callback
+ * was invoked, then assign the task to root group and free the
+ * resource group.
+ */
+ if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+ (rdtgrp->flags & RDT_DELETED)) {
+ current->closid = 0;
+ kfree(rdtgrp);
+ }
+
+ preempt_disable();
+ /* update PQR_ASSOC MSR to make resource group go into effect */
+ intel_rdt_sched_in();
+ preempt_enable();
+
+ kfree(callback);
+}
+
+static int __rdtgroup_move_task(struct task_struct *tsk,
+ struct rdtgroup *rdtgrp)
+{
+ struct task_move_callback *callback;
+ int ret;
+
+ callback = kzalloc(sizeof(*callback), GFP_KERNEL);
+ if (!callback)
+ return -ENOMEM;
+ callback->work.func = move_myself;
+ callback->rdtgrp = rdtgrp;
+
+ /*
+ * Take a refcount, so rdtgrp cannot be freed before the
+ * callback has been invoked.
+ */
+ atomic_inc(&rdtgrp->waitcount);
+ ret = task_work_add(tsk, &callback->work, true);
+ if (ret) {
+ /*
+ * Task is exiting. Drop the refcount and free the callback.
+ * No need to check the refcount as the group cannot be
+ * deleted before the write function unlocks rdtgroup_mutex.
+ */
+ atomic_dec(&rdtgrp->waitcount);
+ kfree(callback);
+ } else {
+ tsk->closid = rdtgrp->closid;
+ }
+ return ret;
+}
+
+static int rdtgroup_task_write_permission(struct task_struct *task,
+ struct kernfs_open_file *of)
+{
+ const struct cred *tcred = get_task_cred(task);
+ const struct cred *cred = current_cred();
+ int ret = 0;
+
+ /*
+ * Even if we're attaching all tasks in the thread group, we only
+ * need to check permissions on one of them.
+ */
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EPERM;
+
+ put_cred(tcred);
+ return ret;
+}
+
+static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
+ struct kernfs_open_file *of)
+{
+ struct task_struct *tsk;
+ int ret;
+
+ rcu_read_lock();
+ if (pid) {
+ tsk = find_task_by_vpid(pid);
+ if (!tsk) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ } else {
+ tsk = current;
+ }
+
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+ ret = rdtgroup_task_write_permission(tsk, of);
+ if (!ret)
+ ret = __rdtgroup_move_task(tsk, rdtgrp);
+
+ put_task_struct(tsk);
+ return ret;
+}
+
+static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+ pid_t pid;
+
+ if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+ return -EINVAL;
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ if (rdtgrp)
+ ret = rdtgroup_move_task(pid, rdtgrp, of);
+ else
+ ret = -ENOENT;
+
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
+{
+ struct task_struct *p, *t;
+
+ rcu_read_lock();
+ for_each_process_thread(p, t) {
+ if (t->closid == r->closid)
+ seq_printf(s, "%d\n", t->pid);
+ }
+ rcu_read_unlock();
+}
+
+static int rdtgroup_tasks_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp)
+ show_rdt_tasks(rdtgrp, s);
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+/* Files in each rdtgroup */
+static struct rftype rdtgroup_base_files[] = {
+ {
+ .name = "cpus",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ },
+ {
+ .name = "tasks",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_tasks_write,
+ .seq_show = rdtgroup_tasks_show,
+ },
+ {
+ .name = "schemata",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_schemata_write,
+ .seq_show = rdtgroup_schemata_show,
+ },
+};
+
+static int rdt_num_closids_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%d\n", r->num_closid);
+
+ return 0;
+}
+
+static int rdt_cbm_mask_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%x\n", r->max_cbm);
+
+ return 0;
+}
+
+static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%d\n", r->min_cbm_bits);
+
+ return 0;
+}
+
+/* rdtgroup information files for one cache resource. */
+static struct rftype res_info_files[] = {
+ {
+ .name = "num_closids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_closids_show,
+ },
+ {
+ .name = "cbm_mask",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_cbm_mask_show,
+ },
+ {
+ .name = "min_cbm_bits",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_min_cbm_bits_show,
+ },
+};
+
+static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
+{
+ struct kernfs_node *kn_subdir;
+ struct rdt_resource *r;
+ int ret;
+
+ /* create the directory */
+ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
+ if (IS_ERR(kn_info))
+ return PTR_ERR(kn_info);
+ kernfs_get(kn_info);
+
+ for_each_enabled_rdt_resource(r) {
+ kn_subdir = kernfs_create_dir(kn_info, r->name,
+ kn_info->mode, r);
+ if (IS_ERR(kn_subdir)) {
+ ret = PTR_ERR(kn_subdir);
+ goto out_destroy;
+ }
+ kernfs_get(kn_subdir);
+ ret = rdtgroup_kn_set_ugid(kn_subdir);
+ if (ret)
+ goto out_destroy;
+ ret = rdtgroup_add_files(kn_subdir, res_info_files,
+ ARRAY_SIZE(res_info_files));
+ if (ret)
+ goto out_destroy;
+ kernfs_activate(kn_subdir);
+ }
+
+ /*
+ * This extra ref will be put in kernfs_remove() and guarantees
+ * that @rdtgrp->kn is always accessible.
+ */
+ kernfs_get(kn_info);
+
+ ret = rdtgroup_kn_set_ugid(kn_info);
+ if (ret)
+ goto out_destroy;
+
+ kernfs_activate(kn_info);
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn_info);
+ return ret;
+}
+
+static void l3_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
+}
+
+static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
+{
+ cpumask_var_t cpu_mask;
+ struct rdt_domain *d;
+ int cpu;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ list_for_each_entry(d, &r->domains, list) {
+ /* Pick one CPU from each domain instance to update MSR */
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ }
+ cpu = get_cpu();
+ /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ l3_qos_cfg_update(&enable);
+ /* Update QOS_CFG MSR on all other cpus in cpu_mask. */
+ smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1);
+ put_cpu();
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+static int cdp_enable(void)
+{
+ struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA];
+ struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE];
+ struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+ int ret;
+
+ if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable)
+ return -EINVAL;
+
+ ret = set_l3_qos_cfg(r_l3, true);
+ if (!ret) {
+ r_l3->enabled = false;
+ r_l3data->enabled = true;
+ r_l3code->enabled = true;
+ }
+ return ret;
+}
+
+static void cdp_disable(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+ r->enabled = r->capable;
+
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) {
+ rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false;
+ rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false;
+ set_l3_qos_cfg(r, false);
+ }
+}
+
+static int parse_rdtgroupfs_options(char *data)
+{
+ char *token, *o = data;
+ int ret = 0;
+
+ while ((token = strsep(&o, ",")) != NULL) {
+ if (!*token)
+ return -EINVAL;
+
+ if (!strcmp(token, "cdp"))
+ ret = cdp_enable();
+ }
+
+ return ret;
+}
+
+/*
+ * We don't allow rdtgroup directories to be created anywhere
+ * except the root directory. Thus when looking for the rdtgroup
+ * structure for a kernfs node we are either looking at a directory,
+ * in which case the rdtgroup structure is pointed at by the "priv"
+ * field, otherwise we have a file, and need only look to the parent
+ * to find the rdtgroup.
+ */
+static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
+{
+ if (kernfs_type(kn) == KERNFS_DIR) {
+ /*
+ * All the resource directories use "kn->priv"
+ * to point to the "struct rdtgroup" for the
+ * resource. "info" and its subdirectories don't
+ * have rdtgroup structures, so return NULL here.
+ */
+ if (kn == kn_info || kn->parent == kn_info)
+ return NULL;
+ else
+ return kn->priv;
+ } else {
+ return kn->parent->priv;
+ }
+}
+
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
+{
+ struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+ if (!rdtgrp)
+ return NULL;
+
+ atomic_inc(&rdtgrp->waitcount);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /* Was this group deleted while we waited? */
+ if (rdtgrp->flags & RDT_DELETED)
+ return NULL;
+
+ return rdtgrp;
+}
+
+void rdtgroup_kn_unlock(struct kernfs_node *kn)
+{
+ struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+ if (!rdtgrp)
+ return;
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+ (rdtgrp->flags & RDT_DELETED)) {
+ kernfs_unbreak_active_protection(kn);
+ kernfs_put(rdtgrp->kn);
+ kfree(rdtgrp);
+ } else {
+ kernfs_unbreak_active_protection(kn);
+ }
+}
+
+static struct dentry *rdt_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name,
+ void *data)
+{
+ struct dentry *dentry;
+ int ret;
+
+ mutex_lock(&rdtgroup_mutex);
+ /*
+ * resctrl file system can only be mounted once.
+ */
+ if (static_branch_unlikely(&rdt_enable_key)) {
+ dentry = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ ret = parse_rdtgroupfs_options(data);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_cdp;
+ }
+
+ closid_init();
+
+ ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_cdp;
+ }
+
+ dentry = kernfs_mount(fs_type, flags, rdt_root,
+ RDTGROUP_SUPER_MAGIC, NULL);
+ if (IS_ERR(dentry))
+ goto out_cdp;
+
+ static_branch_enable(&rdt_enable_key);
+ goto out;
+
+out_cdp:
+ cdp_disable();
+out:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return dentry;
+}
+
+static int reset_all_cbms(struct rdt_resource *r)
+{
+ struct msr_param msr_param;
+ cpumask_var_t cpu_mask;
+ struct rdt_domain *d;
+ int i, cpu;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ msr_param.res = r;
+ msr_param.low = 0;
+ msr_param.high = r->num_closid;
+
+ /*
+ * Disable resource control for this resource by setting all
+ * CBMs in all domains to the maximum mask value. Pick one CPU
+ * from each domain to update the MSRs below.
+ */
+ list_for_each_entry(d, &r->domains, list) {
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+
+ for (i = 0; i < r->num_closid; i++)
+ d->cbm[i] = r->max_cbm;
+ }
+ cpu = get_cpu();
+ /* Update CBM on this cpu if it's in cpu_mask. */
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ rdt_cbm_update(&msr_param);
+ /* Update CBM on all other cpus in cpu_mask. */
+ smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+ put_cpu();
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+/*
+ * Move tasks from one to the other group. If @from is NULL, then all tasks
+ * in the systems are moved unconditionally (used for teardown).
+ *
+ * If @mask is not NULL the cpus on which moved tasks are running are set
+ * in that mask so the update smp function call is restricted to affected
+ * cpus.
+ */
+static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
+ struct cpumask *mask)
+{
+ struct task_struct *p, *t;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
+ if (!from || t->closid == from->closid) {
+ t->closid = to->closid;
+#ifdef CONFIG_SMP
+ /*
+ * This is safe on x86 w/o barriers as the ordering
+ * of writing to task_cpu() and t->on_cpu is
+ * reverse to the reading here. The detection is
+ * inaccurate as tasks might move or schedule
+ * before the smp function call takes place. In
+ * such a case the function call is pointless, but
+ * there is no other side effect.
+ */
+ if (mask && t->on_cpu)
+ cpumask_set_cpu(task_cpu(t), mask);
+#endif
+ }
+ }
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * Forcibly remove all of subdirectories under root.
+ */
+static void rmdir_all_sub(void)
+{
+ struct rdtgroup *rdtgrp, *tmp;
+
+ /* Move all tasks to the default resource group */
+ rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
+
+ list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
+ /* Remove each rdtgroup other than root */
+ if (rdtgrp == &rdtgroup_default)
+ continue;
+
+ /*
+ * Give any CPUs back to the default group. We cannot copy
+ * cpu_online_mask because a CPU might have executed the
+ * offline callback already, but is still marked online.
+ */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+ kernfs_remove(rdtgrp->kn);
+ list_del(&rdtgrp->rdtgroup_list);
+ kfree(rdtgrp);
+ }
+ /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
+ get_online_cpus();
+ rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid);
+ put_online_cpus();
+
+ kernfs_remove(kn_info);
+}
+
+static void rdt_kill_sb(struct super_block *sb)
+{
+ struct rdt_resource *r;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /*Put everything back to default values. */
+ for_each_enabled_rdt_resource(r)
+ reset_all_cbms(r);
+ cdp_disable();
+ rmdir_all_sub();
+ static_branch_disable(&rdt_enable_key);
+ kernfs_kill_sb(sb);
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+static struct file_system_type rdt_fs_type = {
+ .name = "resctrl",
+ .mount = rdt_mount,
+ .kill_sb = rdt_kill_sb,
+};
+
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ struct rdtgroup *parent, *rdtgrp;
+ struct kernfs_node *kn;
+ int ret, closid;
+
+ /* Only allow mkdir in the root directory */
+ if (parent_kn != rdtgroup_default.kn)
+ return -EPERM;
+
+ /* Do not accept '\n' to avoid unparsable situation. */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ parent = rdtgroup_kn_lock_live(parent_kn);
+ if (!parent) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ ret = closid_alloc();
+ if (ret < 0)
+ goto out_unlock;
+ closid = ret;
+
+ /* allocate the rdtgroup. */
+ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
+ if (!rdtgrp) {
+ ret = -ENOSPC;
+ goto out_closid_free;
+ }
+ rdtgrp->closid = closid;
+ list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+
+ /* kernfs creates the directory for rdtgrp */
+ kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_cancel_ref;
+ }
+ rdtgrp->kn = kn;
+
+ /*
+ * kernfs_remove() will drop the reference count on "kn" which
+ * will free it. But we still need it to stick around for the
+ * rdtgroup_kn_unlock(kn} call below. Take one extra reference
+ * here, which will be dropped inside rdtgroup_kn_unlock().
+ */
+ kernfs_get(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ ret = rdtgroup_add_files(kn, rdtgroup_base_files,
+ ARRAY_SIZE(rdtgroup_base_files));
+ if (ret)
+ goto out_destroy;
+
+ kernfs_activate(kn);
+
+ ret = 0;
+ goto out_unlock;
+
+out_destroy:
+ kernfs_remove(rdtgrp->kn);
+out_cancel_ref:
+ list_del(&rdtgrp->rdtgroup_list);
+ kfree(rdtgrp);
+out_closid_free:
+ closid_free(closid);
+out_unlock:
+ rdtgroup_kn_unlock(parent_kn);
+ return ret;
+}
+
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+ int ret, cpu, closid = rdtgroup_default.closid;
+ struct rdtgroup *rdtgrp;
+ cpumask_var_t tmpmask;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ rdtgrp = rdtgroup_kn_lock_live(kn);
+ if (!rdtgrp) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ /* Give any tasks back to the default group */
+ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
+
+ /* Give any CPUs back to the default group */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+ /* Update per cpu closid of the moved CPUs first */
+ for_each_cpu(cpu, &rdtgrp->cpu_mask)
+ per_cpu(cpu_closid, cpu) = closid;
+ /*
+ * Update the MSR on moved CPUs and CPUs which have moved
+ * task running on them.
+ */
+ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+ rdt_update_closid(tmpmask, NULL);
+
+ rdtgrp->flags = RDT_DELETED;
+ closid_free(rdtgrp->closid);
+ list_del(&rdtgrp->rdtgroup_list);
+
+ /*
+ * one extra hold on this, will drop when we kfree(rdtgrp)
+ * in rdtgroup_kn_unlock()
+ */
+ kernfs_get(kn);
+ kernfs_remove(rdtgrp->kn);
+ ret = 0;
+out:
+ rdtgroup_kn_unlock(kn);
+ free_cpumask_var(tmpmask);
+ return ret;
+}
+
+static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
+{
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled)
+ seq_puts(seq, ",cdp");
+ return 0;
+}
+
+static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
+ .mkdir = rdtgroup_mkdir,
+ .rmdir = rdtgroup_rmdir,
+ .show_options = rdtgroup_show_options,
+};
+
+static int __init rdtgroup_setup_root(void)
+{
+ int ret;
+
+ rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED,
+ &rdtgroup_default);
+ if (IS_ERR(rdt_root))
+ return PTR_ERR(rdt_root);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdtgroup_default.closid = 0;
+ list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
+
+ ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files,
+ ARRAY_SIZE(rdtgroup_base_files));
+ if (ret) {
+ kernfs_destroy_root(rdt_root);
+ goto out;
+ }
+
+ rdtgroup_default.kn = rdt_root->kn;
+ kernfs_activate(rdtgroup_default.kn);
+
+out:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+
+/*
+ * rdtgroup_init - rdtgroup initialization
+ *
+ * Setup resctrl file system including set up root, create mount point,
+ * register rdtgroup filesystem, and initialize files under root directory.
+ *
+ * Return: 0 on success or -errno
+ */
+int __init rdtgroup_init(void)
+{
+ int ret = 0;
+
+ ret = rdtgroup_setup_root();
+ if (ret)
+ return ret;
+
+ ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+ if (ret)
+ goto cleanup_root;
+
+ ret = register_filesystem(&rdt_fs_type);
+ if (ret)
+ goto cleanup_mountpoint;
+
+ return 0;
+
+cleanup_mountpoint:
+ sysfs_remove_mount_point(fs_kobj, "resctrl");
+cleanup_root:
+ kernfs_destroy_root(rdt_root);
+
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c
new file mode 100644
index 000000000000..badd2b31a560
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c
@@ -0,0 +1,245 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <asm/intel_rdt.h>
+
+/*
+ * Check whether a cache bit mask is valid. The SDM says:
+ * Please note that all (and only) contiguous '1' combinations
+ * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
+ * Additionally Haswell requires at least two bits set.
+ */
+static bool cbm_validate(unsigned long var, struct rdt_resource *r)
+{
+ unsigned long first_bit, zero_bit;
+
+ if (var == 0 || var > r->max_cbm)
+ return false;
+
+ first_bit = find_first_bit(&var, r->cbm_len);
+ zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit);
+
+ if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len)
+ return false;
+
+ if ((zero_bit - first_bit) < r->min_cbm_bits)
+ return false;
+ return true;
+}
+
+/*
+ * Read one cache bit mask (hex). Check that it is valid for the current
+ * resource type.
+ */
+static int parse_cbm(char *buf, struct rdt_resource *r)
+{
+ unsigned long data;
+ int ret;
+
+ ret = kstrtoul(buf, 16, &data);
+ if (ret)
+ return ret;
+ if (!cbm_validate(data, r))
+ return -EINVAL;
+ r->tmp_cbms[r->num_tmp_cbms++] = data;
+
+ return 0;
+}
+
+/*
+ * For each domain in this resource we expect to find a series of:
+ * id=mask
+ * separated by ";". The "id" is in decimal, and must appear in the
+ * right order.
+ */
+static int parse_line(char *line, struct rdt_resource *r)
+{
+ char *dom = NULL, *id;
+ struct rdt_domain *d;
+ unsigned long dom_id;
+
+ list_for_each_entry(d, &r->domains, list) {
+ dom = strsep(&line, ";");
+ if (!dom)
+ return -EINVAL;
+ id = strsep(&dom, "=");
+ if (kstrtoul(id, 10, &dom_id) || dom_id != d->id)
+ return -EINVAL;
+ if (parse_cbm(dom, r))
+ return -EINVAL;
+ }
+
+ /* Any garbage at the end of the line? */
+ if (line && line[0])
+ return -EINVAL;
+ return 0;
+}
+
+static int update_domains(struct rdt_resource *r, int closid)
+{
+ struct msr_param msr_param;
+ cpumask_var_t cpu_mask;
+ struct rdt_domain *d;
+ int cpu, idx = 0;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ msr_param.low = closid;
+ msr_param.high = msr_param.low + 1;
+ msr_param.res = r;
+
+ list_for_each_entry(d, &r->domains, list) {
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ d->cbm[msr_param.low] = r->tmp_cbms[idx++];
+ }
+ cpu = get_cpu();
+ /* Update CBM on this cpu if it's in cpu_mask. */
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ rdt_cbm_update(&msr_param);
+ /* Update CBM on other cpus. */
+ smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+ put_cpu();
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ char *tok, *resname;
+ int closid, ret = 0;
+ u32 *l3_cbms = NULL;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ closid = rdtgrp->closid;
+
+ /* get scratch space to save all the masks while we validate input */
+ for_each_enabled_rdt_resource(r) {
+ r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms),
+ GFP_KERNEL);
+ if (!r->tmp_cbms) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ r->num_tmp_cbms = 0;
+ }
+
+ while ((tok = strsep(&buf, "\n")) != NULL) {
+ resname = strsep(&tok, ":");
+ if (!tok) {
+ ret = -EINVAL;
+ goto out;
+ }
+ for_each_enabled_rdt_resource(r) {
+ if (!strcmp(resname, r->name) &&
+ closid < r->num_closid) {
+ ret = parse_line(tok, r);
+ if (ret)
+ goto out;
+ break;
+ }
+ }
+ if (!r->name) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Did the parser find all the masks we need? */
+ for_each_enabled_rdt_resource(r) {
+ if (r->num_tmp_cbms != r->num_domains) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ for_each_enabled_rdt_resource(r) {
+ ret = update_domains(r, closid);
+ if (ret)
+ goto out;
+ }
+
+out:
+ for_each_enabled_rdt_resource(r) {
+ kfree(r->tmp_cbms);
+ r->tmp_cbms = NULL;
+ }
+ rdtgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
+{
+ struct rdt_domain *dom;
+ bool sep = false;
+
+ seq_printf(s, "%s:", r->name);
+ list_for_each_entry(dom, &r->domains, list) {
+ if (sep)
+ seq_puts(s, ";");
+ seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]);
+ sep = true;
+ }
+ seq_puts(s, "\n");
+}
+
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ int closid, ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp) {
+ closid = rdtgrp->closid;
+ for_each_enabled_rdt_resource(r) {
+ if (closid < r->num_closid)
+ show_doms(s, r, closid);
+ }
+ } else {
+ ret = -ENOENT;
+ }
+ rdtgroup_kn_unlock(of->kn);
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 83f1a98d37db..2eee85379689 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -52,8 +52,11 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
if (severity >= GHES_SEV_RECOVERABLE)
m.status |= MCI_STATUS_UC;
- if (severity >= GHES_SEV_PANIC)
+
+ if (severity >= GHES_SEV_PANIC) {
m.status |= MCI_STATUS_PCC;
+ m.tsc = rdtsc();
+ }
m.addr = mem_err->physical_addr;
mce_log(&m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
index 93d824ec3120..217cd4449bc9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -72,7 +72,7 @@ struct llist_node *mce_gen_pool_prepare_records(void)
return new_head.first;
}
-void mce_gen_pool_process(void)
+void mce_gen_pool_process(struct work_struct *__unused)
{
struct llist_node *head;
struct mce_evt_llist *node, *tmp;
@@ -85,7 +85,7 @@ void mce_gen_pool_process(void)
head = llist_reverse_order(head);
llist_for_each_entry_safe(node, tmp, head, llnode) {
mce = &node->mce;
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+ blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
}
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 517619ea6498..99165b206df3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -152,7 +152,6 @@ static void raise_mce(struct mce *m)
if (context == MCJ_CTX_RANDOM)
return;
-#ifdef CONFIG_X86_LOCAL_APIC
if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
unsigned long start;
int cpu;
@@ -192,9 +191,7 @@ static void raise_mce(struct mce *m)
raise_local();
put_cpu();
put_online_cpus();
- } else
-#endif
- {
+ } else {
preempt_disable();
raise_local();
preempt_enable();
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index cd74a3f00aea..19592ba1a320 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -13,7 +13,7 @@ enum severity_level {
MCE_PANIC_SEVERITY,
};
-extern struct atomic_notifier_head x86_mce_decoder_chain;
+extern struct blocking_notifier_head x86_mce_decoder_chain;
#define ATTR_LEN 16
#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
@@ -31,7 +31,7 @@ struct mce_evt_llist {
struct mce mce;
};
-void mce_gen_pool_process(void);
+void mce_gen_pool_process(struct work_struct *__unused);
bool mce_gen_pool_empty(void);
int mce_gen_pool_add(struct mce *mce);
int mce_gen_pool_init(void);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 631356c8cca4..87cc9ab7a13c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -14,7 +14,7 @@
#include <linux/init.h>
#include <linux/debugfs.h>
#include <asm/mce.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "mce-internal.h"
@@ -311,7 +311,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e
*msg = s->msg;
s->covered = 1;
if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
- if (panic_on_oops || tolerant < 1)
+ if (tolerant < 1)
return MCE_PANIC_SEVERITY;
}
return s->sev;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a7fdf453d895..af44ebeb593f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,7 @@
#include <linux/export.h>
#include <linux/jump_label.h>
+#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
@@ -53,6 +54,8 @@
static DEFINE_MUTEX(mce_chrdev_read_mutex);
+static int mce_chrdev_open_count; /* #times opened */
+
#define mce_log_get_idx_check(p) \
({ \
RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
@@ -120,14 +123,13 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
*/
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
- m->tsc = rdtsc();
/* We hope get_seconds stays lockless */
m->time = get_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -135,6 +137,9 @@ void mce_setup(struct mce *m)
m->socketid = cpu_data(m->extcpu).phys_proc_id;
m->apicid = cpu_data(m->extcpu).initial_apicid;
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+
+ if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
+ rdmsrl(MSR_PPIN, m->ppin);
}
DEFINE_PER_CPU(struct mce, injectm);
@@ -207,19 +212,23 @@ EXPORT_SYMBOL_GPL(mce_inject_log);
static struct notifier_block mce_srao_nb;
+static atomic_t num_notifiers;
+
void mce_register_decode_chain(struct notifier_block *nb)
{
- /* Ensure SRAO notifier has the highest priority in the decode chain. */
- if (nb != &mce_srao_nb && nb->priority == INT_MAX)
- nb->priority -= 1;
+ atomic_inc(&num_notifiers);
+
+ WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
- atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
+ blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);
void mce_unregister_decode_chain(struct notifier_block *nb)
{
- atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+ atomic_dec(&num_notifiers);
+
+ blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
@@ -270,17 +279,17 @@ struct mca_msr_regs msr_ops = {
.misc = misc_reg
};
-static void print_mce(struct mce *m)
+static void __print_mce(struct mce *m)
{
- int ret = 0;
-
- pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
- m->extcpu, m->mcgstatus, m->bank, m->status);
+ pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
+ m->extcpu,
+ (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
+ m->mcgstatus, m->bank, m->status);
if (m->ip) {
pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
- m->cs, m->ip);
+ m->cs, m->ip);
if (m->cs == __KERNEL_CS)
print_symbol("{%s}", m->ip);
@@ -308,15 +317,11 @@ static void print_mce(struct mce *m)
pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
cpu_data(m->extcpu).microcode);
+}
- /*
- * Print out human-readable details about the MCE error,
- * (if the CPU has an implementation for that)
- */
- ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
- if (ret == NOTIFY_STOP)
- return;
-
+static void print_mce(struct mce *m)
+{
+ __print_mce(m);
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
}
@@ -499,7 +504,7 @@ int mce_available(struct cpuinfo_x86 *c)
static void mce_schedule_work(void)
{
- if (!mce_gen_pool_empty() && keventd_up())
+ if (!mce_gen_pool_empty())
schedule_work(&mce_work);
}
@@ -566,7 +571,37 @@ static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
}
static struct notifier_block mce_srao_nb = {
.notifier_call = srao_decode_notifier,
- .priority = INT_MAX,
+ .priority = MCE_PRIO_SRAO,
+};
+
+static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ /*
+ * Run the default notifier if we have only the SRAO
+ * notifier and us registered.
+ */
+ if (atomic_read(&num_notifiers) > 2)
+ return NOTIFY_DONE;
+
+ /* Don't print when mcelog is running */
+ if (mce_chrdev_open_count > 0)
+ return NOTIFY_DONE;
+
+ __print_mce(m);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mce_default_nb = {
+ .notifier_call = mce_default_notifier,
+ /* lowest prio, we want it to run last. */
+ .priority = MCE_PRIO_LOWEST,
};
/*
@@ -667,6 +702,9 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
mce_gather_info(&m, NULL);
+ if (flags & MCP_TIMESTAMP)
+ m.tsc = rdtsc();
+
for (i = 0; i < mca_cfg.banks; i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
@@ -674,14 +712,12 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
m.misc = 0;
m.addr = 0;
m.bank = i;
- m.tsc = 0;
barrier();
m.status = mce_rdmsrl(msr_ops.status(i));
if (!(m.status & MCI_STATUS_VAL))
continue;
-
/*
* Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here.
@@ -696,9 +732,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
mce_read_aux(&m, i);
- if (!(flags & MCP_TIMESTAMP))
- m.tsc = 0;
-
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
@@ -1109,6 +1142,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
goto out;
mce_gather_info(&m, regs);
+ m.tsc = rdtsc();
final = this_cpu_ptr(&mces_seen);
*final = m;
@@ -1275,41 +1309,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
#endif
/*
- * Action optional processing happens here (picking up
- * from the list of faulting pages that do_machine_check()
- * placed into the genpool).
- */
-static void mce_process_work(struct work_struct *dummy)
-{
- mce_gen_pool_process();
-}
-
-#ifdef CONFIG_X86_MCE_INTEL
-/***
- * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
- * @cpu: The CPU on which the event occurred.
- * @status: Event status information
- *
- * This function should be called by the thermal interrupt after the
- * event has been processed and the decision was made to log the event
- * further.
- *
- * The status parameter will be saved to the 'status' field of 'struct mce'
- * and historically has been the register value of the
- * MSR_IA32_THERMAL_STATUS (Intel) msr.
- */
-void mce_log_therm_throt_event(__u64 status)
-{
- struct mce m;
-
- mce_setup(&m);
- m.bank = MCE_THERMAL_BANK;
- m.status = status;
- mce_log(&m);
-}
-#endif /* CONFIG_X86_MCE_INTEL */
-
-/*
* Periodic polling timer for "silent" machine check errors. If the
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
@@ -1326,20 +1325,15 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-static void __restart_timer(struct timer_list *t, unsigned long interval)
+static void __start_timer(struct timer_list *t, unsigned long interval)
{
unsigned long when = jiffies + interval;
unsigned long flags;
local_irq_save(flags);
- if (timer_pending(t)) {
- if (time_before(when, t->expires))
- mod_timer(t, when);
- } else {
- t->expires = round_jiffies(when);
- add_timer_on(t, smp_processor_id());
- }
+ if (!timer_pending(t) || time_before(when, t->expires))
+ mod_timer(t, round_jiffies(when));
local_irq_restore(flags);
}
@@ -1355,7 +1349,7 @@ static void mce_timer_fn(unsigned long data)
iv = __this_cpu_read(mce_next_interval);
if (mce_available(this_cpu_ptr(&cpu_info))) {
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
if (mce_intel_cmci_poll()) {
iv = mce_adjust_timer(iv);
@@ -1374,7 +1368,7 @@ static void mce_timer_fn(unsigned long data)
done:
__this_cpu_write(mce_next_interval, iv);
- __restart_timer(t, iv);
+ __start_timer(t, iv);
}
/*
@@ -1385,7 +1379,7 @@ void mce_timer_kick(unsigned long interval)
struct timer_list *t = this_cpu_ptr(&mce_timer);
unsigned long iv = __this_cpu_read(mce_next_interval);
- __restart_timer(t, interval);
+ __start_timer(t, interval);
if (interval < iv)
__this_cpu_write(mce_next_interval, interval);
@@ -1732,17 +1726,23 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
}
}
-static void mce_start_timer(unsigned int cpu, struct timer_list *t)
+static void mce_start_timer(struct timer_list *t)
{
unsigned long iv = check_interval * HZ;
if (mca_cfg.ignore_ce || !iv)
return;
- per_cpu(mce_next_interval, cpu) = iv;
+ this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+}
+
+static void __mcheck_cpu_setup_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+ unsigned int cpu = smp_processor_id();
- t->expires = round_jiffies(jiffies + iv);
- add_timer_on(t, cpu);
+ setup_pinned_timer(t, mce_timer_fn, cpu);
}
static void __mcheck_cpu_init_timer(void)
@@ -1751,7 +1751,7 @@ static void __mcheck_cpu_init_timer(void)
unsigned int cpu = smp_processor_id();
setup_pinned_timer(t, mce_timer_fn, cpu);
- mce_start_timer(cpu, t);
+ mce_start_timer(t);
}
/* Handle unconfigured int18 (should never happen) */
@@ -1796,7 +1796,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_clear_banks();
- __mcheck_cpu_init_timer();
+ __mcheck_cpu_setup_timer();
}
/*
@@ -1823,7 +1823,6 @@ void mcheck_cpu_clear(struct cpuinfo_x86 *c)
*/
static DEFINE_SPINLOCK(mce_chrdev_state_lock);
-static int mce_chrdev_open_count; /* #times opened */
static int mce_chrdev_open_exclu; /* already open exclusive? */
static int mce_chrdev_open(struct inode *inode, struct file *file)
@@ -2138,9 +2137,10 @@ int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&mce_srao_nb);
+ mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
- INIT_WORK(&mce_work, mce_process_work);
+ INIT_WORK(&mce_work, mce_gen_pool_process);
init_irq_work(&mce_irq_work, mce_irq_work_cb);
return 0;
@@ -2255,8 +2255,6 @@ static struct bus_type mce_subsys = {
DEFINE_PER_CPU(struct device *, mce_device);
-void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
return container_of(attr, struct mce_bank, attr);
@@ -2409,6 +2407,10 @@ static int mce_device_create(unsigned int cpu)
if (!mce_available(&boot_cpu_data))
return -EIO;
+ dev = per_cpu(mce_device, cpu);
+ if (dev)
+ return 0;
+
dev = kzalloc(sizeof *dev, GFP_KERNEL);
if (!dev)
return -ENOMEM;
@@ -2468,28 +2470,25 @@ static void mce_device_remove(unsigned int cpu)
}
/* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
+static void mce_disable_cpu(void)
{
- unsigned long action = *(unsigned long *)h;
-
if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
- if (!(action & CPU_TASKS_FROZEN))
+ if (!cpuhp_tasks_frozen)
cmci_clear();
vendor_disable_error_reporting();
}
-static void mce_reenable_cpu(void *h)
+static void mce_reenable_cpu(void)
{
- unsigned long action = *(unsigned long *)h;
int i;
if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
- if (!(action & CPU_TASKS_FROZEN))
+ if (!cpuhp_tasks_frozen)
cmci_reenable();
for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
@@ -2499,45 +2498,43 @@ static void mce_reenable_cpu(void *h)
}
}
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
-mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+static int mce_cpu_dead(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
- struct timer_list *t = &per_cpu(mce_timer, cpu);
+ mce_intel_hcpu_update(cpu);
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- mce_device_create(cpu);
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- break;
- case CPU_DEAD:
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- mce_device_remove(cpu);
- mce_intel_hcpu_update(cpu);
+ /* intentionally ignoring frozen here */
+ if (!cpuhp_tasks_frozen)
+ cmci_rediscover();
+ return 0;
+}
- /* intentionally ignoring frozen here */
- if (!(action & CPU_TASKS_FROZEN))
- cmci_rediscover();
- break;
- case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
- del_timer_sync(t);
- break;
- case CPU_DOWN_FAILED:
- smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
- mce_start_timer(cpu, t);
- break;
- }
+static int mce_cpu_online(unsigned int cpu)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+ int ret;
- return NOTIFY_OK;
+ mce_device_create(cpu);
+
+ ret = mce_threshold_create_device(cpu);
+ if (ret) {
+ mce_device_remove(cpu);
+ return ret;
+ }
+ mce_reenable_cpu();
+ mce_start_timer(t);
+ return 0;
}
-static struct notifier_block mce_cpu_notifier = {
- .notifier_call = mce_cpu_callback,
-};
+static int mce_cpu_pre_down(unsigned int cpu)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+
+ mce_disable_cpu();
+ del_timer_sync(t);
+ mce_threshold_remove_device(cpu);
+ mce_device_remove(cpu);
+ return 0;
+}
static __init void mce_init_banks(void)
{
@@ -2559,8 +2556,8 @@ static __init void mce_init_banks(void)
static __init int mcheck_init_device(void)
{
+ enum cpuhp_state hp_online;
int err;
- int i = 0;
if (!mce_available(&boot_cpu_data)) {
err = -EIO;
@@ -2578,23 +2575,16 @@ static __init int mcheck_init_device(void)
if (err)
goto err_out_mem;
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- err = mce_device_create(i);
- if (err) {
- /*
- * Register notifier anyway (and do not unreg it) so
- * that we don't leave undeleted timers, see notifier
- * callback above.
- */
- __register_hotcpu_notifier(&mce_cpu_notifier);
- cpu_notifier_register_done();
- goto err_device_create;
- }
- }
+ err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
+ mce_cpu_dead);
+ if (err)
+ goto err_out_mem;
- __register_hotcpu_notifier(&mce_cpu_notifier);
- cpu_notifier_register_done();
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
+ mce_cpu_online, mce_cpu_pre_down);
+ if (err < 0)
+ goto err_out_online;
+ hp_online = err;
register_syscore_ops(&mce_syscore_ops);
@@ -2607,16 +2597,10 @@ static __init int mcheck_init_device(void)
err_register:
unregister_syscore_ops(&mce_syscore_ops);
+ cpuhp_remove_state(hp_online);
-err_device_create:
- /*
- * We didn't keep track of which devices were created above, but
- * even if we had, the set of online cpus might have changed.
- * Play safe and remove for every possible cpu, since
- * mce_device_remove() will do the right thing.
- */
- for_each_possible_cpu(i)
- mce_device_remove(i);
+err_out_online:
+ cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
err_out_mem:
free_cpumask_var(mce_device_initialized);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 9b5403462936..6e4a047e4b68 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -24,7 +24,6 @@
#include <asm/amd_nb.h>
#include <asm/apic.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/trace/irq_vectors.h>
@@ -55,11 +54,13 @@
/* Threshold LVT offset is at MSR0xC0000410[15:12] */
#define SMCA_THR_LVT_OFF 0xF000
+static bool thresholding_en;
+
static const char * const th_names[] = {
"load_store",
"insn_fetch",
"combined_unit",
- "",
+ "decode_unit",
"northbridge",
"execution_unit",
};
@@ -69,7 +70,12 @@ static const char * const smca_umc_block_names[] = {
"misc_umc"
};
-struct smca_bank_name smca_bank_names[] = {
+struct smca_bank_name {
+ const char *name; /* Short name for sysfs */
+ const char *long_name; /* Long name for pretty-printing */
+};
+
+static struct smca_bank_name smca_names[] = {
[SMCA_LS] = { "load_store", "Load Store Unit" },
[SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
[SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
@@ -84,9 +90,25 @@ struct smca_bank_name smca_bank_names[] = {
[SMCA_PSP] = { "psp", "Platform Security Processor" },
[SMCA_SMU] = { "smu", "System Management Unit" },
};
-EXPORT_SYMBOL_GPL(smca_bank_names);
-static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
+const char *smca_get_name(enum smca_bank_types t)
+{
+ if (t >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ return smca_names[t].name;
+}
+
+const char *smca_get_long_name(enum smca_bank_types t)
+{
+ if (t >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ return smca_names[t].long_name;
+}
+EXPORT_SYMBOL_GPL(smca_get_long_name);
+
+static struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype, xec_bitmap } */
/* ZN Core (HWID=0xB0) MCA types */
@@ -116,7 +138,7 @@ static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
{ SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
};
-struct smca_bank_info smca_banks[MAX_NR_BANKS];
+struct smca_bank smca_banks[MAX_NR_BANKS];
EXPORT_SYMBOL_GPL(smca_banks);
/*
@@ -142,35 +164,35 @@ static void default_deferred_error_interrupt(void)
}
void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
-/*
- * CPU Initialization
- */
-
static void get_smca_bank_info(unsigned int bank)
{
unsigned int i, hwid_mcatype, cpu = smp_processor_id();
- struct smca_hwid_mcatype *type;
- u32 high, instanceId;
- u16 hwid, mcatype;
+ struct smca_hwid *s_hwid;
+ u32 high, instance_id;
/* Collect bank_info using CPU 0 for now. */
if (cpu)
return;
- if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) {
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
}
- hwid = high & MCI_IPID_HWID;
- mcatype = (high & MCI_IPID_MCATYPE) >> 16;
- hwid_mcatype = HWID_MCATYPE(hwid, mcatype);
+ hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
+ (high & MCI_IPID_MCATYPE) >> 16);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
- type = &smca_hwid_mcatypes[i];
- if (hwid_mcatype == type->hwid_mcatype) {
- smca_banks[bank].type = type;
- smca_banks[bank].type_instance = instanceId;
+ s_hwid = &smca_hwid_mcatypes[i];
+ if (hwid_mcatype == s_hwid->hwid_mcatype) {
+
+ WARN(smca_banks[bank].hwid,
+ "Bank %s already initialized!\n",
+ smca_get_name(s_hwid->bank_type));
+
+ smca_banks[bank].hwid = s_hwid;
+ smca_banks[bank].id = instance_id;
+ smca_banks[bank].sysfs_id = s_hwid->count++;
break;
}
}
@@ -533,6 +555,206 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
deferred_error_interrupt_enable(c);
}
+int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
+{
+ u64 dram_base_addr, dram_limit_addr, dram_hole_base;
+ /* We start from the normalized address */
+ u64 ret_addr = norm_addr;
+
+ u32 tmp;
+
+ u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
+ u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
+ u8 intlv_addr_sel, intlv_addr_bit;
+ u8 num_intlv_bits, hashed_bit;
+ u8 lgcy_mmio_hole_en, base = 0;
+ u8 cs_mask, cs_id = 0;
+ bool hash_enabled = false;
+
+ /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
+ if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
+ goto out_err;
+
+ /* Remove HiAddrOffset from normalized address, if enabled: */
+ if (tmp & BIT(0)) {
+ u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
+
+ if (norm_addr >= hi_addr_offset) {
+ ret_addr -= hi_addr_offset;
+ base = 1;
+ }
+ }
+
+ /* Read D18F0x110 (DramBaseAddress). */
+ if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
+ goto out_err;
+
+ /* Check if address range is valid. */
+ if (!(tmp & BIT(0))) {
+ pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
+ __func__, tmp);
+ goto out_err;
+ }
+
+ lgcy_mmio_hole_en = tmp & BIT(1);
+ intlv_num_chan = (tmp >> 4) & 0xF;
+ intlv_addr_sel = (tmp >> 8) & 0x7;
+ dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16;
+
+ /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
+ if (intlv_addr_sel > 3) {
+ pr_err("%s: Invalid interleave address select %d.\n",
+ __func__, intlv_addr_sel);
+ goto out_err;
+ }
+
+ /* Read D18F0x114 (DramLimitAddress). */
+ if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
+ goto out_err;
+
+ intlv_num_sockets = (tmp >> 8) & 0x1;
+ intlv_num_dies = (tmp >> 10) & 0x3;
+ dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
+
+ intlv_addr_bit = intlv_addr_sel + 8;
+
+ /* Re-use intlv_num_chan by setting it equal to log2(#channels) */
+ switch (intlv_num_chan) {
+ case 0: intlv_num_chan = 0; break;
+ case 1: intlv_num_chan = 1; break;
+ case 3: intlv_num_chan = 2; break;
+ case 5: intlv_num_chan = 3; break;
+ case 7: intlv_num_chan = 4; break;
+
+ case 8: intlv_num_chan = 1;
+ hash_enabled = true;
+ break;
+ default:
+ pr_err("%s: Invalid number of interleaved channels %d.\n",
+ __func__, intlv_num_chan);
+ goto out_err;
+ }
+
+ num_intlv_bits = intlv_num_chan;
+
+ if (intlv_num_dies > 2) {
+ pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
+ __func__, intlv_num_dies);
+ goto out_err;
+ }
+
+ num_intlv_bits += intlv_num_dies;
+
+ /* Add a bit if sockets are interleaved. */
+ num_intlv_bits += intlv_num_sockets;
+
+ /* Assert num_intlv_bits <= 4 */
+ if (num_intlv_bits > 4) {
+ pr_err("%s: Invalid interleave bits %d.\n",
+ __func__, num_intlv_bits);
+ goto out_err;
+ }
+
+ if (num_intlv_bits > 0) {
+ u64 temp_addr_x, temp_addr_i, temp_addr_y;
+ u8 die_id_bit, sock_id_bit, cs_fabric_id;
+
+ /*
+ * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
+ * This is the fabric id for this coherent slave. Use
+ * umc/channel# as instance id of the coherent slave
+ * for FICAA.
+ */
+ if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
+ goto out_err;
+
+ cs_fabric_id = (tmp >> 8) & 0xFF;
+ die_id_bit = 0;
+
+ /* If interleaved over more than 1 channel: */
+ if (intlv_num_chan) {
+ die_id_bit = intlv_num_chan;
+ cs_mask = (1 << die_id_bit) - 1;
+ cs_id = cs_fabric_id & cs_mask;
+ }
+
+ sock_id_bit = die_id_bit;
+
+ /* Read D18F1x208 (SystemFabricIdMask). */
+ if (intlv_num_dies || intlv_num_sockets)
+ if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
+ goto out_err;
+
+ /* If interleaved over more than 1 die. */
+ if (intlv_num_dies) {
+ sock_id_bit = die_id_bit + intlv_num_dies;
+ die_id_shift = (tmp >> 24) & 0xF;
+ die_id_mask = (tmp >> 8) & 0xFF;
+
+ cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
+ }
+
+ /* If interleaved over more than 1 socket. */
+ if (intlv_num_sockets) {
+ socket_id_shift = (tmp >> 28) & 0xF;
+ socket_id_mask = (tmp >> 16) & 0xFF;
+
+ cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
+ }
+
+ /*
+ * The pre-interleaved address consists of XXXXXXIIIYYYYY
+ * where III is the ID for this CS, and XXXXXXYYYYY are the
+ * address bits from the post-interleaved address.
+ * "num_intlv_bits" has been calculated to tell us how many "I"
+ * bits there are. "intlv_addr_bit" tells us how many "Y" bits
+ * there are (where "I" starts).
+ */
+ temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
+ temp_addr_i = (cs_id << intlv_addr_bit);
+ temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
+ ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
+ }
+
+ /* Add dram base address */
+ ret_addr += dram_base_addr;
+
+ /* If legacy MMIO hole enabled */
+ if (lgcy_mmio_hole_en) {
+ if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
+ goto out_err;
+
+ dram_hole_base = tmp & GENMASK(31, 24);
+ if (ret_addr >= dram_hole_base)
+ ret_addr += (BIT_ULL(32) - dram_hole_base);
+ }
+
+ if (hash_enabled) {
+ /* Save some parentheses and grab ls-bit at the end. */
+ hashed_bit = (ret_addr >> 12) ^
+ (ret_addr >> 18) ^
+ (ret_addr >> 21) ^
+ (ret_addr >> 30) ^
+ cs_id;
+
+ hashed_bit &= BIT(0);
+
+ if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
+ ret_addr ^= BIT(intlv_addr_bit);
+ }
+
+ /* Is calculated system address is above DRAM limit address? */
+ if (ret_addr > dram_limit_addr)
+ goto out_err;
+
+ *sys_addr = ret_addr;
+ return 0;
+
+out_err:
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
+
static void
__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
{
@@ -556,7 +778,8 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
mce_setup(&m);
m.status = status;
- m.bank = bank;
+ m.bank = bank;
+ m.tsc = rdtsc();
if (threshold_err)
m.misc = misc;
@@ -593,14 +816,14 @@ static inline void __smp_deferred_error_interrupt(void)
deferred_error_int_vector();
}
-asmlinkage __visible void smp_deferred_error_interrupt(void)
+asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(void)
{
entering_irq();
__smp_deferred_error_interrupt();
exiting_ack_irq();
}
-asmlinkage __visible void smp_trace_deferred_error_interrupt(void)
+asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
{
entering_irq();
trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
@@ -645,6 +868,7 @@ static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
unsigned int bank, block, cpu = smp_processor_id();
+ struct thresh_restart tr;
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
@@ -681,6 +905,11 @@ static void amd_threshold_interrupt(void)
log:
__log_error(bank, false, true, ((u64)high << 32) | low);
+
+ /* Reset threshold block after logging error. */
+ memset(&tr, 0, sizeof(tr));
+ tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
+ threshold_restart_bank(&tr);
}
/*
@@ -826,10 +1055,10 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return th_names[bank];
}
- if (!smca_banks[bank].type)
+ if (!smca_banks[bank].hwid)
return NULL;
- bank_type = smca_banks[bank].type->bank_type;
+ bank_type = smca_banks[bank].hwid->bank_type;
if (b && bank_type == SMCA_UMC) {
if (b->block < ARRAY_SIZE(smca_umc_block_names))
@@ -837,9 +1066,12 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return NULL;
}
+ if (smca_banks[bank].hwid->count == 1)
+ return smca_get_name(bank_type);
+
snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
- "%s_%x", smca_bank_names[bank_type].name,
- smca_banks[bank].type_instance);
+ "%s_%x", smca_get_name(bank_type),
+ smca_banks[bank].sysfs_id);
return buf_mcatype;
}
@@ -955,6 +1187,9 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
const char *name = get_name(bank, NULL);
int err = 0;
+ if (!dev)
+ return -ENODEV;
+
if (is_shared_bank(bank)) {
nb = node_to_amd_nb(amd_get_nb_id(cpu));
@@ -1010,31 +1245,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
return err;
}
-/* create dir/files for all valid threshold banks */
-static int threshold_create_device(unsigned int cpu)
-{
- unsigned int bank;
- struct threshold_bank **bp;
- int err = 0;
-
- bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
- GFP_KERNEL);
- if (!bp)
- return -ENOMEM;
-
- per_cpu(threshold_banks, cpu) = bp;
-
- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- err = threshold_create_bank(cpu, bank);
- if (err)
- return err;
- }
-
- return err;
-}
-
static void deallocate_threshold_block(unsigned int cpu,
unsigned int bank)
{
@@ -1102,48 +1312,71 @@ free_out:
per_cpu(threshold_banks, cpu)[bank] = NULL;
}
-static void threshold_remove_device(unsigned int cpu)
+int mce_threshold_remove_device(unsigned int cpu)
{
unsigned int bank;
+ if (!thresholding_en)
+ return 0;
+
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
threshold_remove_bank(cpu, bank);
}
kfree(per_cpu(threshold_banks, cpu));
+ per_cpu(threshold_banks, cpu) = NULL;
+ return 0;
}
-/* get notified when a cpu comes on/off */
-static void
-amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
+/* create dir/files for all valid threshold banks */
+int mce_threshold_create_device(unsigned int cpu)
{
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- threshold_create_device(cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- threshold_remove_device(cpu);
- break;
- default:
- break;
+ unsigned int bank;
+ struct threshold_bank **bp;
+ int err = 0;
+
+ if (!thresholding_en)
+ return 0;
+
+ bp = per_cpu(threshold_banks, cpu);
+ if (bp)
+ return 0;
+
+ bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
+ GFP_KERNEL);
+ if (!bp)
+ return -ENOMEM;
+
+ per_cpu(threshold_banks, cpu) = bp;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ continue;
+ err = threshold_create_bank(cpu, bank);
+ if (err)
+ goto err;
}
+ return err;
+err:
+ mce_threshold_remove_device(cpu);
+ return err;
}
static __init int threshold_init_device(void)
{
unsigned lcpu = 0;
+ if (mce_threshold_vector == amd_threshold_interrupt)
+ thresholding_en = true;
+
/* to hit CPUs online before the notifier is up */
for_each_online_cpu(lcpu) {
- int err = threshold_create_device(lcpu);
+ int err = mce_threshold_create_device(lcpu);
if (err)
return err;
}
- threshold_cpu_callback = amd_64_threshold_cpu_callback;
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 1defb8ea882c..190b3e6cef4d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -11,6 +11,8 @@
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -130,7 +132,7 @@ bool mce_intel_cmci_poll(void)
* Reset the counter if we've logged an error in the last poll
* during the storm.
*/
- if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
+ if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
else
this_cpu_dec(cmci_backoff_cnt);
@@ -342,7 +344,7 @@ void cmci_recheck(void)
return;
local_irq_save(flags);
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+ machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
local_irq_restore(flags);
}
@@ -464,11 +466,46 @@ static void intel_clear_lmce(void)
wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
}
+static void intel_ppin_init(struct cpuinfo_x86 *c)
+{
+ unsigned long long val;
+
+ /*
+ * Even if testing the presence of the MSR would be enough, we don't
+ * want to risk the situation where other models reuse this MSR for
+ * other purposes.
+ */
+ switch (c->x86_model) {
+ case INTEL_FAM6_IVYBRIDGE_X:
+ case INTEL_FAM6_HASWELL_X:
+ case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_X:
+ case INTEL_FAM6_SKYLAKE_X:
+ if (rdmsrl_safe(MSR_PPIN_CTL, &val))
+ return;
+
+ if ((val & 3UL) == 1UL) {
+ /* PPIN available but disabled: */
+ return;
+ }
+
+ /* If PPIN is disabled, but not locked, try to enable: */
+ if (!(val & 3UL)) {
+ wrmsrl_safe(MSR_PPIN_CTL, val | 2UL);
+ rdmsrl_safe(MSR_PPIN_CTL, &val);
+ }
+
+ if ((val & 3UL) == 2UL)
+ set_cpu_cap(c, X86_FEATURE_INTEL_PPIN);
+ }
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
intel_init_lmce();
+ intel_ppin_init(c);
}
void mce_intel_feature_clear(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 6b9dc4d18ccc..d7cc190ae457 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -6,7 +6,7 @@
*
* Maintains a counter in /sys that keeps track of the number of thermal
* events, such that the user knows how bad the thermal problem might be
- * (since the logging to syslog and mcelog is rate limited).
+ * (since the logging to syslog is rate limited).
*
* Author: Dmitriy Zavin (dmitriyz@google.com)
*
@@ -26,7 +26,6 @@
#include <asm/processor.h>
#include <asm/apic.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/trace/irq_vectors.h>
@@ -142,13 +141,8 @@ static struct attribute_group thermal_attr_group = {
* IRQ has been acknowledged.
*
* It will take care of rate limiting and printing messages to the syslog.
- *
- * Returns: 0 : Event should NOT be further logged, i.e. still in
- * "timeout" from previous log message.
- * 1 : Event should be logged further, and a message has been
- * printed to the syslog.
*/
-static int therm_throt_process(bool new_event, int event, int level)
+static void therm_throt_process(bool new_event, int event, int level)
{
struct _thermal_state *state;
unsigned int this_cpu = smp_processor_id();
@@ -163,16 +157,16 @@ static int therm_throt_process(bool new_event, int event, int level)
else if (event == POWER_LIMIT_EVENT)
state = &pstate->core_power_limit;
else
- return 0;
+ return;
} else if (level == PACKAGE_LEVEL) {
if (event == THERMAL_THROTTLING_EVENT)
state = &pstate->package_throttle;
else if (event == POWER_LIMIT_EVENT)
state = &pstate->package_power_limit;
else
- return 0;
+ return;
} else
- return 0;
+ return;
old_event = state->new_event;
state->new_event = new_event;
@@ -182,7 +176,7 @@ static int therm_throt_process(bool new_event, int event, int level)
if (time_before64(now, state->next_check) &&
state->count != state->last_count)
- return 0;
+ return;
state->next_check = now + CHECK_INTERVAL;
state->last_count = state->count;
@@ -194,16 +188,14 @@ static int therm_throt_process(bool new_event, int event, int level)
this_cpu,
level == CORE_LEVEL ? "Core" : "Package",
state->count);
- return 1;
+ return;
}
if (old_event) {
if (event == THERMAL_THROTTLING_EVENT)
pr_info("CPU%d: %s temperature/speed normal\n", this_cpu,
level == CORE_LEVEL ? "Core" : "Package");
- return 1;
+ return;
}
-
- return 0;
}
static int thresh_event_valid(int level, int event)
@@ -271,58 +263,32 @@ static void thermal_throttle_remove_dev(struct device *dev)
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
-thermal_throttle_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int thermal_throttle_online(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
- struct device *dev;
- int err = 0;
-
- dev = get_cpu_device(cpu);
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- err = thermal_throttle_add_dev(dev, cpu);
- WARN_ON(err);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- thermal_throttle_remove_dev(dev);
- break;
- }
- return notifier_from_errno(err);
+ struct device *dev = get_cpu_device(cpu);
+
+ return thermal_throttle_add_dev(dev, cpu);
}
-static struct notifier_block thermal_throttle_cpu_notifier =
+static int thermal_throttle_offline(unsigned int cpu)
{
- .notifier_call = thermal_throttle_cpu_callback,
-};
+ struct device *dev = get_cpu_device(cpu);
+
+ thermal_throttle_remove_dev(dev);
+ return 0;
+}
static __init int thermal_throttle_init_device(void)
{
- unsigned int cpu = 0;
- int err;
+ int ret;
if (!atomic_read(&therm_throt_en))
return 0;
- cpu_notifier_register_begin();
-
- /* connect live CPUs to sysfs */
- for_each_online_cpu(cpu) {
- err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
- WARN_ON(err);
- }
-
- __register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
- cpu_notifier_register_done();
-
- return 0;
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
+ thermal_throttle_online,
+ thermal_throttle_offline);
+ return ret < 0 ? ret : 0;
}
device_initcall(thermal_throttle_init_device);
@@ -392,10 +358,9 @@ static void intel_thermal_interrupt(void)
/* Check for violation of core thermal thresholds*/
notify_thresholds(msr_val);
- if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
- THERMAL_THROTTLING_EVENT,
- CORE_LEVEL) != 0)
- mce_log_therm_throt_event(msr_val);
+ therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
+ CORE_LEVEL);
if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
@@ -431,14 +396,16 @@ static inline void __smp_thermal_interrupt(void)
smp_thermal_vector();
}
-asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry
+smp_thermal_interrupt(struct pt_regs *regs)
{
entering_irq();
__smp_thermal_interrupt();
exiting_ack_irq();
}
-asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void __irq_entry
+smp_trace_thermal_interrupt(struct pt_regs *regs)
{
entering_irq();
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index fcf9ae9384f4..bb0e75eed10a 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -6,7 +6,6 @@
#include <asm/irq_vectors.h>
#include <asm/apic.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
@@ -24,14 +23,14 @@ static inline void __smp_threshold_interrupt(void)
mce_threshold_vector();
}
-asmlinkage __visible void smp_threshold_interrupt(void)
+asmlinkage __visible void __irq_entry smp_threshold_interrupt(void)
{
entering_irq();
__smp_threshold_interrupt();
exiting_ack_irq();
}
-asmlinkage __visible void smp_trace_threshold_interrupt(void)
+asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void)
{
entering_irq();
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
index 220b1a508513..ba12e8aa4a45 100644
--- a/arch/x86/kernel/cpu/microcode/Makefile
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -1,4 +1,4 @@
microcode-y := core.o
obj-$(CONFIG_MICROCODE) += microcode.o
-microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
+microcode-$(CONFIG_MICROCODE_INTEL) += intel.o
microcode-$(CONFIG_MICROCODE_AMD) += amd.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 017bda12caae..7889ae492af0 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -5,6 +5,7 @@
* CPUs and later.
*
* Copyright (C) 2008-2011 Advanced Micro Devices Inc.
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
*
* Author: Peter Oruba <peter.oruba@amd.com>
*
@@ -39,100 +40,149 @@
static struct equiv_cpu_entry *equiv_cpu_table;
-struct ucode_patch {
- struct list_head plist;
- void *data;
- u32 patch_id;
- u16 equiv_cpu;
-};
-
-static LIST_HEAD(pcache);
-
/*
* This points to the current valid container of microcode patches which we will
- * save from the initrd before jettisoning its contents.
+ * save from the initrd/builtin before jettisoning its contents. @mc is the
+ * microcode patch we found to match.
*/
-static u8 *container;
-static size_t container_size;
-static bool ucode_builtin;
+struct cont_desc {
+ struct microcode_amd *mc;
+ u32 cpuid_1_eax;
+ u32 psize;
+ u8 *data;
+ size_t size;
+};
static u32 ucode_new_rev;
static u8 amd_ucode_patch[PATCH_MAX_SIZE];
-static u16 this_equiv_id;
-static struct cpio_data ucode_cpio;
+/*
+ * Microcode patch container file is prepended to the initrd in cpio
+ * format. See Documentation/x86/early-microcode.txt
+ */
+static const char
+ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
-static struct cpio_data __init find_ucode_in_initrd(void)
+static u16 find_equiv_id(struct equiv_cpu_entry *equiv_table, u32 sig)
{
-#ifdef CONFIG_BLK_DEV_INITRD
- char *path;
- void *start;
- size_t size;
+ for (; equiv_table && equiv_table->installed_cpu; equiv_table++) {
+ if (sig == equiv_table->installed_cpu)
+ return equiv_table->equiv_cpu;
+ }
- /*
- * Microcode patch container file is prepended to the initrd in cpio
- * format. See Documentation/x86/early-microcode.txt
- */
- static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
+ return 0;
+}
-#ifdef CONFIG_X86_32
- struct boot_params *p;
+/*
+ * This scans the ucode blob for the proper container as we can have multiple
+ * containers glued together. Returns the equivalence ID from the equivalence
+ * table or 0 if none found.
+ * Returns the amount of bytes consumed while scanning. @desc contains all the
+ * data we're going to use in later stages of the application.
+ */
+static ssize_t parse_container(u8 *ucode, ssize_t size, struct cont_desc *desc)
+{
+ struct equiv_cpu_entry *eq;
+ ssize_t orig_size = size;
+ u32 *hdr = (u32 *)ucode;
+ u16 eq_id;
+ u8 *buf;
- /*
- * On 32-bit, early load occurs before paging is turned on so we need
- * to use physical addresses.
- */
- p = (struct boot_params *)__pa_nodebug(&boot_params);
- path = (char *)__pa_nodebug(ucode_path);
- start = (void *)p->hdr.ramdisk_image;
- size = p->hdr.ramdisk_size;
-#else
- path = ucode_path;
- start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
- size = boot_params.hdr.ramdisk_size;
-#endif /* !CONFIG_X86_32 */
+ /* Am I looking at an equivalence table header? */
+ if (hdr[0] != UCODE_MAGIC ||
+ hdr[1] != UCODE_EQUIV_CPU_TABLE_TYPE ||
+ hdr[2] == 0)
+ return CONTAINER_HDR_SZ;
- return find_cpio_data(path, start, size, NULL);
-#else
- return (struct cpio_data){ NULL, 0, "" };
-#endif
-}
+ buf = ucode;
-static size_t compute_container_size(u8 *data, u32 total_size)
-{
- size_t size = 0;
- u32 *header = (u32 *)data;
+ eq = (struct equiv_cpu_entry *)(buf + CONTAINER_HDR_SZ);
- if (header[0] != UCODE_MAGIC ||
- header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
- header[2] == 0) /* size */
- return size;
+ /* Find the equivalence ID of our CPU in this table: */
+ eq_id = find_equiv_id(eq, desc->cpuid_1_eax);
- size = header[2] + CONTAINER_HDR_SZ;
- total_size -= size;
- data += size;
+ buf += hdr[2] + CONTAINER_HDR_SZ;
+ size -= hdr[2] + CONTAINER_HDR_SZ;
- while (total_size) {
- u16 patch_size;
+ /*
+ * Scan through the rest of the container to find where it ends. We do
+ * some basic sanity-checking too.
+ */
+ while (size > 0) {
+ struct microcode_amd *mc;
+ u32 patch_size;
- header = (u32 *)data;
+ hdr = (u32 *)buf;
- if (header[0] != UCODE_UCODE_TYPE)
+ if (hdr[0] != UCODE_UCODE_TYPE)
break;
- /*
- * Sanity-check patch size.
- */
- patch_size = header[1];
+ /* Sanity-check patch size. */
+ patch_size = hdr[1];
if (patch_size > PATCH_MAX_SIZE)
break;
- size += patch_size + SECTION_HDR_SIZE;
- data += patch_size + SECTION_HDR_SIZE;
- total_size -= patch_size + SECTION_HDR_SIZE;
+ /* Skip patch section header: */
+ buf += SECTION_HDR_SIZE;
+ size -= SECTION_HDR_SIZE;
+
+ mc = (struct microcode_amd *)buf;
+ if (eq_id == mc->hdr.processor_rev_id) {
+ desc->psize = patch_size;
+ desc->mc = mc;
+ }
+
+ buf += patch_size;
+ size -= patch_size;
}
- return size;
+ /*
+ * If we have found a patch (desc->mc), it means we're looking at the
+ * container which has a patch for this CPU so return 0 to mean, @ucode
+ * already points to the proper container. Otherwise, we return the size
+ * we scanned so that we can advance to the next container in the
+ * buffer.
+ */
+ if (desc->mc) {
+ desc->data = ucode;
+ desc->size = orig_size - size;
+
+ return 0;
+ }
+
+ return orig_size - size;
+}
+
+/*
+ * Scan the ucode blob for the proper container as we can have multiple
+ * containers glued together.
+ */
+static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc)
+{
+ ssize_t rem = size;
+
+ while (rem >= 0) {
+ ssize_t s = parse_container(ucode, rem, desc);
+ if (!s)
+ return;
+
+ ucode += s;
+ rem -= s;
+ }
+}
+
+static int __apply_microcode_amd(struct microcode_amd *mc)
+{
+ u32 rev, dummy;
+
+ native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code);
+
+ /* verify patch application was successful */
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+ if (rev != mc->hdr.patch_id)
+ return -1;
+
+ return 0;
}
/*
@@ -143,128 +193,50 @@ static size_t compute_container_size(u8 *data, u32 total_size)
* and on 32-bit during save_microcode_in_initrd_amd() -- we can call
* load_microcode_amd() to save equivalent cpu table and microcode patches in
* kernel heap memory.
+ *
+ * Returns true if container found (sets @desc), false otherwise.
*/
-static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
+static bool
+apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_patch)
{
- struct equiv_cpu_entry *eq;
- size_t *cont_sz;
- u32 *header;
- u8 *data, **cont;
+ struct cont_desc desc = { 0 };
u8 (*patch)[PATCH_MAX_SIZE];
- u16 eq_id = 0;
- int offset, left;
- u32 rev, eax, ebx, ecx, edx;
- u32 *new_rev;
+ struct microcode_amd *mc;
+ u32 rev, dummy, *new_rev;
+ bool ret = false;
#ifdef CONFIG_X86_32
new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
- cont_sz = (size_t *)__pa_nodebug(&container_size);
- cont = (u8 **)__pa_nodebug(&container);
patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
#else
new_rev = &ucode_new_rev;
- cont_sz = &container_size;
- cont = &container;
patch = &amd_ucode_patch;
#endif
- data = ucode;
- left = size;
- header = (u32 *)data;
-
- /* find equiv cpu table */
- if (header[0] != UCODE_MAGIC ||
- header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
- header[2] == 0) /* size */
- return;
+ desc.cpuid_1_eax = cpuid_1_eax;
- eax = 0x00000001;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
+ scan_containers(ucode, size, &desc);
- while (left > 0) {
- eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
-
- *cont = data;
-
- /* Advance past the container header */
- offset = header[2] + CONTAINER_HDR_SZ;
- data += offset;
- left -= offset;
-
- eq_id = find_equiv_id(eq, eax);
- if (eq_id) {
- this_equiv_id = eq_id;
- *cont_sz = compute_container_size(*cont, left + offset);
-
- /*
- * truncate how much we need to iterate over in the
- * ucode update loop below
- */
- left = *cont_sz - offset;
- break;
- }
+ mc = desc.mc;
+ if (!mc)
+ return ret;
- /*
- * support multiple container files appended together. if this
- * one does not have a matching equivalent cpu entry, we fast
- * forward to the next container file.
- */
- while (left > 0) {
- header = (u32 *)data;
- if (header[0] == UCODE_MAGIC &&
- header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
- break;
-
- offset = header[1] + SECTION_HDR_SIZE;
- data += offset;
- left -= offset;
- }
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+ if (rev >= mc->hdr.patch_id)
+ return ret;
- /* mark where the next microcode container file starts */
- offset = data - (u8 *)ucode;
- ucode = data;
- }
+ if (!__apply_microcode_amd(mc)) {
+ *new_rev = mc->hdr.patch_id;
+ ret = true;
- if (!eq_id) {
- *cont = NULL;
- *cont_sz = 0;
- return;
+ if (save_patch)
+ memcpy(patch, mc, min_t(u32, desc.psize, PATCH_MAX_SIZE));
}
- if (check_current_patch_level(&rev, true))
- return;
-
- while (left > 0) {
- struct microcode_amd *mc;
-
- header = (u32 *)data;
- if (header[0] != UCODE_UCODE_TYPE || /* type */
- header[1] == 0) /* size */
- break;
-
- mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
-
- if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
-
- if (!__apply_microcode_amd(mc)) {
- rev = mc->hdr.patch_id;
- *new_rev = rev;
-
- if (save_patch)
- memcpy(patch, mc,
- min_t(u32, header[1], PATCH_MAX_SIZE));
- }
- }
-
- offset = header[1] + SECTION_HDR_SIZE;
- data += offset;
- left -= offset;
- }
+ return ret;
}
-static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
- unsigned int family)
+static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
{
#ifdef CONFIG_X86_64
char fw_name[36] = "amd-ucode/microcode_amd.bin";
@@ -279,207 +251,113 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
#endif
}
-void __init load_ucode_amd_bsp(unsigned int family)
+void __load_ucode_amd(unsigned int cpuid_1_eax, struct cpio_data *ret)
{
+ struct ucode_cpu_info *uci;
struct cpio_data cp;
- bool *builtin;
- void **data;
- size_t *size;
-
-#ifdef CONFIG_X86_32
- data = (void **)__pa_nodebug(&ucode_cpio.data);
- size = (size_t *)__pa_nodebug(&ucode_cpio.size);
- builtin = (bool *)__pa_nodebug(&ucode_builtin);
-#else
- data = &ucode_cpio.data;
- size = &ucode_cpio.size;
- builtin = &ucode_builtin;
-#endif
+ const char *path;
+ bool use_pa;
- *builtin = load_builtin_amd_microcode(&cp, family);
- if (!*builtin)
- cp = find_ucode_in_initrd();
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info);
+ path = (const char *)__pa_nodebug(ucode_path);
+ use_pa = true;
+ } else {
+ uci = ucode_cpu_info;
+ path = ucode_path;
+ use_pa = false;
+ }
- if (!(cp.data && cp.size))
- return;
+ if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)))
+ cp = find_microcode_in_initrd(path, use_pa);
- *data = cp.data;
- *size = cp.size;
+ /* Needed in load_microcode_amd() */
+ uci->cpu_sig.sig = cpuid_1_eax;
- apply_ucode_in_initrd(cp.data, cp.size, true);
+ *ret = cp;
}
-#ifdef CONFIG_X86_32
-/*
- * On 32-bit, since AP's early load occurs before paging is turned on, we
- * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
- * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
- * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
- * which is used upon resume from suspend.
- */
-void load_ucode_amd_ap(void)
+void __init load_ucode_amd_bsp(unsigned int cpuid_1_eax)
{
- struct microcode_amd *mc;
- size_t *usize;
- void **ucode;
-
- mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
- if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
- __apply_microcode_amd(mc);
- return;
- }
+ struct cpio_data cp = { };
- ucode = (void *)__pa_nodebug(&container);
- usize = (size_t *)__pa_nodebug(&container_size);
-
- if (!*ucode || !*usize)
+ __load_ucode_amd(cpuid_1_eax, &cp);
+ if (!(cp.data && cp.size))
return;
- apply_ucode_in_initrd(*ucode, *usize, false);
+ apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, true);
}
-static void __init collect_cpu_sig_on_bsp(void *arg)
+void load_ucode_amd_ap(unsigned int cpuid_1_eax)
{
- unsigned int cpu = smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- uci->cpu_sig.sig = cpuid_eax(0x00000001);
-}
-
-static void __init get_bsp_sig(void)
-{
- unsigned int bsp = boot_cpu_data.cpu_index;
- struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
-
- if (!uci->cpu_sig.sig)
- smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
-}
-#else
-void load_ucode_amd_ap(void)
-{
- unsigned int cpu = smp_processor_id();
- struct equiv_cpu_entry *eq;
struct microcode_amd *mc;
- u8 *cont = container;
- u32 rev, eax;
- u16 eq_id;
-
- /* Exit if called on the BSP. */
- if (!cpu)
- return;
-
- if (!container)
- return;
-
- /*
- * 64-bit runs with paging enabled, thus early==false.
- */
- if (check_current_patch_level(&rev, false))
- return;
-
- /* Add CONFIG_RANDOMIZE_MEMORY offset. */
- if (!ucode_builtin)
- cont += PAGE_OFFSET - __PAGE_OFFSET_BASE;
-
- eax = cpuid_eax(0x00000001);
- eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ);
+ struct cpio_data cp;
+ u32 *new_rev, rev, dummy;
- eq_id = find_equiv_id(eq, eax);
- if (!eq_id)
- return;
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
+ new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+ } else {
+ mc = (struct microcode_amd *)amd_ucode_patch;
+ new_rev = &ucode_new_rev;
+ }
- if (eq_id == this_equiv_id) {
- mc = (struct microcode_amd *)amd_ucode_patch;
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- if (mc && rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc))
- ucode_new_rev = mc->hdr.patch_id;
+ /* Check whether we have saved a new patch already: */
+ if (*new_rev && rev < mc->hdr.patch_id) {
+ if (!__apply_microcode_amd(mc)) {
+ *new_rev = mc->hdr.patch_id;
+ return;
}
+ }
- } else {
- if (!ucode_cpio.data)
- return;
+ __load_ucode_amd(cpuid_1_eax, &cp);
+ if (!(cp.data && cp.size))
+ return;
- /*
- * AP has a different equivalence ID than BSP, looks like
- * mixed-steppings silicon so go through the ucode blob anew.
- */
- apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
- }
+ apply_microcode_early_amd(cpuid_1_eax, cp.data, cp.size, false);
}
-#endif
-int __init save_microcode_in_initrd_amd(void)
+static enum ucode_state
+load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size);
+
+int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
{
- unsigned long cont;
- int retval = 0;
+ struct cont_desc desc = { 0 };
enum ucode_state ret;
- u8 *cont_va;
- u32 eax;
+ struct cpio_data cp;
- if (!container)
+ cp = find_microcode_in_initrd(ucode_path, false);
+ if (!(cp.data && cp.size))
return -EINVAL;
-#ifdef CONFIG_X86_32
- get_bsp_sig();
- cont = (unsigned long)container;
- cont_va = __va(container);
-#else
- /*
- * We need the physical address of the container for both bitness since
- * boot_params.hdr.ramdisk_image is a physical address.
- */
- cont = __pa_nodebug(container);
- cont_va = container;
-#endif
-
- /*
- * Take into account the fact that the ramdisk might get relocated and
- * therefore we need to recompute the container's position in virtual
- * memory space.
- */
- if (relocated_ramdisk)
- container = (u8 *)(__va(relocated_ramdisk) +
- (cont - boot_params.hdr.ramdisk_image));
- else
- container = cont_va;
-
- /* Add CONFIG_RANDOMIZE_MEMORY offset. */
- if (!ucode_builtin)
- container += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+ desc.cpuid_1_eax = cpuid_1_eax;
- eax = cpuid_eax(0x00000001);
- eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+ scan_containers(cp.data, cp.size, &desc);
+ if (!desc.mc)
+ return -EINVAL;
- ret = load_microcode_amd(smp_processor_id(), eax, container, container_size);
+ ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax),
+ desc.data, desc.size);
if (ret != UCODE_OK)
- retval = -EINVAL;
-
- /*
- * This will be freed any msec now, stash patches for the current
- * family and switch to patch cache for cpu hotplug, etc later.
- */
- container = NULL;
- container_size = 0;
+ return -EINVAL;
- return retval;
+ return 0;
}
void reload_ucode_amd(void)
{
struct microcode_amd *mc;
- u32 rev;
+ u32 rev, dummy;
- /*
- * early==false because this is a syscore ->resume path and by
- * that time paging is long enabled.
- */
- if (check_current_patch_level(&rev, false))
+ mc = (struct microcode_amd *)amd_ucode_patch;
+ if (!mc)
return;
- mc = (struct microcode_amd *)amd_ucode_patch;
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- if (mc && rev < mc->hdr.patch_id) {
+ if (rev < mc->hdr.patch_id) {
if (!__apply_microcode_amd(mc)) {
ucode_new_rev = mc->hdr.patch_id;
pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
@@ -513,7 +391,7 @@ static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
{
struct ucode_patch *p;
- list_for_each_entry(p, &pcache, plist)
+ list_for_each_entry(p, &microcode_cache, plist)
if (p->equiv_cpu == equiv_cpu)
return p;
return NULL;
@@ -523,7 +401,7 @@ static void update_cache(struct ucode_patch *new_patch)
{
struct ucode_patch *p;
- list_for_each_entry(p, &pcache, plist) {
+ list_for_each_entry(p, &microcode_cache, plist) {
if (p->equiv_cpu == new_patch->equiv_cpu) {
if (p->patch_id >= new_patch->patch_id)
/* we already have the latest patch */
@@ -536,14 +414,14 @@ static void update_cache(struct ucode_patch *new_patch)
}
}
/* no patch found, add it */
- list_add_tail(&new_patch->plist, &pcache);
+ list_add_tail(&new_patch->plist, &microcode_cache);
}
static void free_cache(void)
{
struct ucode_patch *p, *tmp;
- list_for_each_entry_safe(p, tmp, &pcache, plist) {
+ list_for_each_entry_safe(p, tmp, &microcode_cache, plist) {
__list_del(p->plist.prev, p->plist.next);
kfree(p->data);
kfree(p);
@@ -616,74 +494,13 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size,
return patch_size;
}
-/*
- * Those patch levels cannot be updated to newer ones and thus should be final.
- */
-static u32 final_levels[] = {
- 0x01000098,
- 0x0100009f,
- 0x010000af,
- 0, /* T-101 terminator */
-};
-
-/*
- * Check the current patch level on this CPU.
- *
- * @rev: Use it to return the patch level. It is set to 0 in the case of
- * error.
- *
- * Returns:
- * - true: if update should stop
- * - false: otherwise
- */
-bool check_current_patch_level(u32 *rev, bool early)
-{
- u32 lvl, dummy, i;
- bool ret = false;
- u32 *levels;
-
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
-
- if (IS_ENABLED(CONFIG_X86_32) && early)
- levels = (u32 *)__pa_nodebug(&final_levels);
- else
- levels = final_levels;
-
- for (i = 0; levels[i]; i++) {
- if (lvl == levels[i]) {
- lvl = 0;
- ret = true;
- break;
- }
- }
-
- if (rev)
- *rev = lvl;
-
- return ret;
-}
-
-int __apply_microcode_amd(struct microcode_amd *mc_amd)
-{
- u32 rev, dummy;
-
- native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
-
- /* verify patch application was successful */
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- if (rev != mc_amd->hdr.patch_id)
- return -1;
-
- return 0;
-}
-
-int apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct microcode_amd *mc_amd;
struct ucode_cpu_info *uci;
struct ucode_patch *p;
- u32 rev;
+ u32 rev, dummy;
BUG_ON(raw_smp_processor_id() != cpu);
@@ -696,8 +513,7 @@ int apply_microcode_amd(int cpu)
mc_amd = p->data;
uci->mc = p->data;
- if (check_current_patch_level(&rev, false))
- return -1;
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
/* need to apply patch? */
if (rev >= mc_amd->hdr.patch_id) {
@@ -860,7 +676,8 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
return UCODE_OK;
}
-enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
+static enum ucode_state
+load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
{
enum ucode_state ret;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 5ce5155f0695..b4a4cd39b358 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -3,7 +3,7 @@
*
* Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* 2006 Shaohua Li <shaohua.li@intel.com>
- * 2013-2015 Borislav Petkov <bp@alien8.de>
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
*
* X86 CPU microcode early update for Linux:
*
@@ -39,11 +39,16 @@
#include <asm/microcode.h>
#include <asm/processor.h>
#include <asm/cmdline.h>
+#include <asm/setup.h>
-#define MICROCODE_VERSION "2.01"
+#define DRIVER_VERSION "2.2"
static struct microcode_ops *microcode_ops;
-static bool dis_ucode_ldr;
+static bool dis_ucode_ldr = true;
+
+bool initrd_gone;
+
+LIST_HEAD(microcode_cache);
/*
* Synchronization.
@@ -61,15 +66,47 @@ static DEFINE_MUTEX(microcode_mutex);
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
-/*
- * Operations that are run on a target cpu:
- */
-
struct cpu_info_ctx {
struct cpu_signature *cpu_sig;
int err;
};
+/*
+ * Those patch levels cannot be updated to newer ones and thus should be final.
+ */
+static u32 final_levels[] = {
+ 0x01000098,
+ 0x0100009f,
+ 0x010000af,
+ 0, /* T-101 terminator */
+};
+
+/*
+ * Check the current patch level on this CPU.
+ *
+ * Returns:
+ * - true: if update should stop
+ * - false: otherwise
+ */
+static bool amd_check_current_patch_level(void)
+{
+ u32 lvl, dummy, i;
+ u32 *levels;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy);
+
+ if (IS_ENABLED(CONFIG_X86_32))
+ levels = (u32 *)__pa_nodebug(&final_levels);
+ else
+ levels = final_levels;
+
+ for (i = 0; levels[i]; i++) {
+ if (lvl == levels[i])
+ return true;
+ }
+ return false;
+}
+
static bool __init check_loader_disabled_bsp(void)
{
static const char *__dis_opt_str = "dis_ucode_ldr";
@@ -85,8 +122,24 @@ static bool __init check_loader_disabled_bsp(void)
bool *res = &dis_ucode_ldr;
#endif
- if (cmdline_find_option_bool(cmdline, option))
- *res = true;
+ if (!have_cpuid_p())
+ return *res;
+
+ /*
+ * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
+ * completely accurate as xen pv guests don't see that CPUID bit set but
+ * that's good enough as they don't land on the BSP path anyway.
+ */
+ if (native_cpuid_ecx(1) & BIT(31))
+ return *res;
+
+ if (x86_cpuid_vendor() == X86_VENDOR_AMD) {
+ if (amd_check_current_patch_level())
+ return *res;
+ }
+
+ if (cmdline_find_option_bool(cmdline, option) <= 0)
+ *res = false;
return *res;
}
@@ -112,26 +165,21 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name)
void __init load_ucode_bsp(void)
{
- int vendor;
- unsigned int family;
+ unsigned int cpuid_1_eax;
if (check_loader_disabled_bsp())
return;
- if (!have_cpuid_p())
- return;
+ cpuid_1_eax = native_cpuid_eax(1);
- vendor = x86_cpuid_vendor();
- family = x86_cpuid_family();
-
- switch (vendor) {
+ switch (x86_cpuid_vendor()) {
case X86_VENDOR_INTEL:
- if (family >= 6)
+ if (x86_family(cpuid_1_eax) >= 6)
load_ucode_intel_bsp();
break;
case X86_VENDOR_AMD:
- if (family >= 0x10)
- load_ucode_amd_bsp(family);
+ if (x86_family(cpuid_1_eax) >= 0x10)
+ load_ucode_amd_bsp(cpuid_1_eax);
break;
default:
break;
@@ -149,25 +197,21 @@ static bool check_loader_disabled_ap(void)
void load_ucode_ap(void)
{
- int vendor, family;
+ unsigned int cpuid_1_eax;
if (check_loader_disabled_ap())
return;
- if (!have_cpuid_p())
- return;
+ cpuid_1_eax = native_cpuid_eax(1);
- vendor = x86_cpuid_vendor();
- family = x86_cpuid_family();
-
- switch (vendor) {
+ switch (x86_cpuid_vendor()) {
case X86_VENDOR_INTEL:
- if (family >= 6)
+ if (x86_family(cpuid_1_eax) >= 6)
load_ucode_intel_ap();
break;
case X86_VENDOR_AMD:
- if (family >= 0x10)
- load_ucode_amd_ap();
+ if (x86_family(cpuid_1_eax) >= 0x10)
+ load_ucode_amd_ap(cpuid_1_eax);
break;
default:
break;
@@ -177,21 +221,81 @@ void load_ucode_ap(void)
static int __init save_microcode_in_initrd(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
+ int ret = -EINVAL;
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
if (c->x86 >= 6)
- return save_microcode_in_initrd_intel();
+ ret = save_microcode_in_initrd_intel();
break;
case X86_VENDOR_AMD:
if (c->x86 >= 0x10)
- return save_microcode_in_initrd_amd();
+ return save_microcode_in_initrd_amd(cpuid_eax(1));
break;
default:
break;
}
- return -EINVAL;
+ initrd_gone = true;
+
+ return ret;
+}
+
+struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+ unsigned long start = 0;
+ size_t size;
+
+#ifdef CONFIG_X86_32
+ struct boot_params *params;
+
+ if (use_pa)
+ params = (struct boot_params *)__pa_nodebug(&boot_params);
+ else
+ params = &boot_params;
+
+ size = params->hdr.ramdisk_size;
+
+ /*
+ * Set start only if we have an initrd image. We cannot use initrd_start
+ * because it is not set that early yet.
+ */
+ if (size)
+ start = params->hdr.ramdisk_image;
+
+# else /* CONFIG_X86_64 */
+ size = (unsigned long)boot_params.ext_ramdisk_size << 32;
+ size |= boot_params.hdr.ramdisk_size;
+
+ if (size) {
+ start = (unsigned long)boot_params.ext_ramdisk_image << 32;
+ start |= boot_params.hdr.ramdisk_image;
+
+ start += PAGE_OFFSET;
+ }
+# endif
+
+ /*
+ * Fixup the start address: after reserve_initrd() runs, initrd_start
+ * has the virtual address of the beginning of the initrd. It also
+ * possibly relocates the ramdisk. In either case, initrd_start contains
+ * the updated address so use that instead.
+ *
+ * initrd_gone is for the hotplug case where we've thrown out initrd
+ * already.
+ */
+ if (!use_pa) {
+ if (initrd_gone)
+ return (struct cpio_data){ NULL, 0, "" };
+ if (initrd_start)
+ start = initrd_start;
+ }
+
+ return find_cpio_data(path, (void *)start, size, NULL);
+#else /* !CONFIG_BLK_DEV_INITRD */
+ return (struct cpio_data){ NULL, 0, "" };
+#endif
}
void reload_early_microcode(void)
@@ -453,16 +557,17 @@ static struct attribute_group mc_attr_group = {
static void microcode_fini_cpu(int cpu)
{
- microcode_ops->microcode_fini_cpu(cpu);
+ if (microcode_ops->microcode_fini_cpu)
+ microcode_ops->microcode_fini_cpu(cpu);
}
static enum ucode_state microcode_resume_cpu(int cpu)
{
- pr_debug("CPU%d updated upon resume\n", cpu);
-
if (apply_microcode_on_target(cpu))
return UCODE_ERROR;
+ pr_debug("CPU%d updated upon resume\n", cpu);
+
return UCODE_OK;
}
@@ -496,6 +601,9 @@ static enum ucode_state microcode_update_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ /* Refresh CPU microcode revision after resume. */
+ collect_cpu_info(cpu);
+
if (uci->valid)
return microcode_resume_cpu(cpu);
@@ -579,12 +687,7 @@ static int mc_cpu_down_prep(unsigned int cpu)
/* Suspend is in progress, only remove the interface */
sysfs_remove_group(&dev->kobj, &mc_attr_group);
pr_debug("CPU%d removed\n", cpu);
- /*
- * When a CPU goes offline, don't free up or invalidate the copy of
- * the microcode in kernel memory, so that we can reuse it when the
- * CPU comes back online without unnecessarily requesting the userspace
- * for it again.
- */
+
return 0;
}
@@ -649,8 +752,7 @@ int __init microcode_init(void)
cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
mc_cpu_online, mc_cpu_down_prep);
- pr_info("Microcode Update Driver: v" MICROCODE_VERSION
- " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
+ pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);
return 0;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index cdc0deab00c9..8325d8a09ab0 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -39,125 +39,83 @@
#include <asm/setup.h>
#include <asm/msr.h>
-/*
- * Temporary microcode blobs pointers storage. We note here during early load
- * the pointers to microcode blobs we've got from whatever storage (detached
- * initrd, builtin). Later on, we put those into final storage
- * mc_saved_data.mc_saved.
- *
- * Important: those are offsets from the beginning of initrd or absolute
- * addresses within the kernel image when built-in.
- */
-static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT];
-
-static struct mc_saved_data {
- unsigned int num_saved;
- struct microcode_intel **mc_saved;
-} mc_saved_data;
+static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
-/* Microcode blobs within the initrd. 0 if builtin. */
-static struct ucode_blobs {
- unsigned long start;
- bool valid;
-} blobs;
+/* Current microcode patch used in early patching on the APs. */
+struct microcode_intel *intel_ucode_patch;
-/* Go through saved patches and find the one suitable for the current CPU. */
-static enum ucode_state
-find_microcode_patch(struct microcode_intel **saved,
- unsigned int num_saved, struct ucode_cpu_info *uci)
+static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
+ unsigned int s2, unsigned int p2)
{
- struct microcode_intel *ucode_ptr, *new_mc = NULL;
- struct microcode_header_intel *mc_hdr;
- int new_rev, ret, i;
-
- new_rev = uci->cpu_sig.rev;
-
- for (i = 0; i < num_saved; i++) {
- ucode_ptr = saved[i];
- mc_hdr = (struct microcode_header_intel *)ucode_ptr;
-
- ret = has_newer_microcode(ucode_ptr,
- uci->cpu_sig.sig,
- uci->cpu_sig.pf,
- new_rev);
- if (!ret)
- continue;
-
- new_rev = mc_hdr->rev;
- new_mc = ucode_ptr;
- }
+ if (s1 != s2)
+ return false;
- if (!new_mc)
- return UCODE_NFOUND;
+ /* Processor flags are either both 0 ... */
+ if (!p1 && !p2)
+ return true;
- uci->mc = (struct microcode_intel *)new_mc;
- return UCODE_OK;
+ /* ... or they intersect. */
+ return p1 & p2;
}
-static inline void
-copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs,
- unsigned long off, int num_saved)
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int find_matching_signature(void *mc, unsigned int csig, int cpf)
{
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_sigtable *ext_hdr;
+ struct extended_signature *ext_sig;
int i;
- for (i = 0; i < num_saved; i++)
- mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off);
-}
-
-#ifdef CONFIG_X86_32
-static void
-microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs)
-{
- int i;
- struct microcode_intel ***mc_saved;
+ if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+ return 1;
- mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved);
+ /* Look for ext. headers: */
+ if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
+ return 0;
- for (i = 0; i < mcs->num_saved; i++) {
- struct microcode_intel *p;
+ ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
- p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i);
- mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+ return 1;
+ ext_sig++;
}
+ return 0;
}
-#endif
-static enum ucode_state
-load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- unsigned long offset, struct ucode_cpu_info *uci)
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
{
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int count = mcs->num_saved;
+ struct microcode_header_intel *mc_hdr = mc;
- if (!mcs->mc_saved) {
- copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count);
+ if (mc_hdr->rev <= new_rev)
+ return 0;
- return find_microcode_patch(mc_saved_tmp, count, uci);
- } else {
-#ifdef CONFIG_X86_32
- microcode_phys(mc_saved_tmp, mcs);
- return find_microcode_patch(mc_saved_tmp, count, uci);
-#else
- return find_microcode_patch(mcs->mc_saved, count, uci);
-#endif
- }
+ return find_matching_signature(mc, csig, cpf);
}
/*
* Given CPU signature and a microcode patch, this function finds if the
* microcode patch has matching family and model with the CPU.
+ *
+ * %true - if there's a match
+ * %false - otherwise
*/
-static enum ucode_state
-matching_model_microcode(struct microcode_header_intel *mc_header,
- unsigned long sig)
+static bool microcode_matches(struct microcode_header_intel *mc_header,
+ unsigned long sig)
{
- unsigned int fam, model;
- unsigned int fam_ucode, model_ucode;
- struct extended_sigtable *ext_header;
unsigned long total_size = get_totalsize(mc_header);
unsigned long data_size = get_datasize(mc_header);
- int ext_sigcount, i;
+ struct extended_sigtable *ext_header;
+ unsigned int fam_ucode, model_ucode;
struct extended_signature *ext_sig;
+ unsigned int fam, model;
+ int ext_sigcount, i;
fam = x86_family(sig);
model = x86_model(sig);
@@ -166,11 +124,11 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
model_ucode = x86_model(mc_header->sig);
if (fam == fam_ucode && model == model_ucode)
- return UCODE_OK;
+ return true;
/* Look for ext. headers: */
if (total_size <= data_size + MC_HEADER_SIZE)
- return UCODE_NFOUND;
+ return false;
ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
@@ -181,192 +139,242 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
model_ucode = x86_model(ext_sig->sig);
if (fam == fam_ucode && model == model_ucode)
- return UCODE_OK;
+ return true;
ext_sig++;
}
- return UCODE_NFOUND;
+ return false;
}
-static int
-save_microcode(struct mc_saved_data *mcs,
- struct microcode_intel **mc_saved_src,
- unsigned int num_saved)
+static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
{
- int i, j;
- struct microcode_intel **saved_ptr;
- int ret;
+ struct ucode_patch *p;
- if (!num_saved)
- return -EINVAL;
+ p = kzalloc(sizeof(struct ucode_patch), GFP_KERNEL);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
- /*
- * Copy new microcode data.
- */
- saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL);
- if (!saved_ptr)
- return -ENOMEM;
-
- for (i = 0; i < num_saved; i++) {
- struct microcode_header_intel *mc_hdr;
- struct microcode_intel *mc;
- unsigned long size;
-
- if (!mc_saved_src[i]) {
- ret = -EINVAL;
- goto err;
- }
+ p->data = kmemdup(data, size, GFP_KERNEL);
+ if (!p->data) {
+ kfree(p);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return p;
+}
+
+static void save_microcode_patch(void *data, unsigned int size)
+{
+ struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+ struct ucode_patch *iter, *tmp, *p;
+ bool prev_found = false;
+ unsigned int sig, pf;
+
+ mc_hdr = (struct microcode_header_intel *)data;
+
+ list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
+ mc_saved_hdr = (struct microcode_header_intel *)iter->data;
+ sig = mc_saved_hdr->sig;
+ pf = mc_saved_hdr->pf;
+
+ if (find_matching_signature(data, sig, pf)) {
+ prev_found = true;
- mc = mc_saved_src[i];
- mc_hdr = &mc->hdr;
- size = get_totalsize(mc_hdr);
+ if (mc_hdr->rev <= mc_saved_hdr->rev)
+ continue;
- saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL);
- if (!saved_ptr[i]) {
- ret = -ENOMEM;
- goto err;
+ p = __alloc_microcode_buf(data, size);
+ if (IS_ERR(p))
+ pr_err("Error allocating buffer %p\n", data);
+ else
+ list_replace(&iter->plist, &p->plist);
}
}
/*
- * Point to newly saved microcode.
+ * There weren't any previous patches found in the list cache; save the
+ * newly found.
*/
- mcs->mc_saved = saved_ptr;
- mcs->num_saved = num_saved;
-
- return 0;
-
-err:
- for (j = 0; j <= i; j++)
- kfree(saved_ptr[j]);
- kfree(saved_ptr);
-
- return ret;
+ if (!prev_found) {
+ p = __alloc_microcode_buf(data, size);
+ if (IS_ERR(p))
+ pr_err("Error allocating buffer for %p\n", data);
+ else
+ list_add_tail(&p->plist, &microcode_cache);
+ }
}
-/*
- * A microcode patch in ucode_ptr is saved into mc_saved
- * - if it has matching signature and newer revision compared to an existing
- * patch mc_saved.
- * - or if it is a newly discovered microcode patch.
- *
- * The microcode patch should have matching model with CPU.
- *
- * Returns: The updated number @num_saved of saved microcode patches.
- */
-static unsigned int _save_mc(struct microcode_intel **mc_saved,
- u8 *ucode_ptr, unsigned int num_saved)
+static int microcode_sanity_check(void *mc, int print_err)
{
- struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
- unsigned int sig, pf;
- int found = 0, i;
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ u32 sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
- mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
- for (i = 0; i < num_saved; i++) {
- mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
- sig = mc_saved_hdr->sig;
- pf = mc_saved_hdr->pf;
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("Error: bad microcode data file size.\n");
+ return -EINVAL;
+ }
- if (!find_matching_signature(ucode_ptr, sig, pf))
- continue;
+ if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+ if (print_err)
+ pr_err("Error: invalid/unknown microcode update format.\n");
+ return -EINVAL;
+ }
- found = 1;
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ u32 ext_table_sum = 0;
+ u32 *ext_tablep;
- if (mc_hdr->rev <= mc_saved_hdr->rev)
- continue;
+ if ((ext_table_size < EXT_HEADER_SIZE)
+ || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("Error: truncated extended signature table.\n");
+ return -EINVAL;
+ }
+
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("Error: extended signature table size mismatch.\n");
+ return -EFAULT;
+ }
+
+ ext_sigcount = ext_header->count;
/*
- * Found an older ucode saved earlier. Replace it with
- * this newer one.
+ * Check extended table checksum: the sum of all dwords that
+ * comprise a valid table must be 0.
*/
- mc_saved[i] = (struct microcode_intel *)ucode_ptr;
- break;
+ ext_tablep = (u32 *)ext_header;
+
+ i = ext_table_size / sizeof(u32);
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("Bad extended signature table checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Calculate the checksum of update data and header. The checksum of
+ * valid update data and header including the extended signature table
+ * must be 0.
+ */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+ while (i--)
+ orig_sum += ((u32 *)mc)[i];
+
+ if (orig_sum) {
+ if (print_err)
+ pr_err("Bad microcode data checksum, aborting.\n");
+ return -EINVAL;
}
- /* Newly detected microcode, save it to memory. */
- if (i >= num_saved && !found)
- mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
+ if (!ext_table_size)
+ return 0;
- return num_saved;
+ /*
+ * Check extended signature checksum: 0 => valid.
+ */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+
+ sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("Bad extended signature checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
}
/*
* Get microcode matching with BSP's model. Only CPUs with the same model as
* BSP can stay in the platform.
*/
-static enum ucode_state __init
-get_matching_model_microcode(unsigned long start, void *data, size_t size,
- struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- struct ucode_cpu_info *uci)
+static struct microcode_intel *
+scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
{
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
struct microcode_header_intel *mc_header;
- unsigned int num_saved = mcs->num_saved;
- enum ucode_state state = UCODE_OK;
- unsigned int leftover = size;
- u8 *ucode_ptr = data;
+ struct microcode_intel *patch = NULL;
unsigned int mc_size;
- int i;
-
- while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) {
- if (leftover < sizeof(mc_header))
+ while (size) {
+ if (size < sizeof(struct microcode_header_intel))
break;
- mc_header = (struct microcode_header_intel *)ucode_ptr;
+ mc_header = (struct microcode_header_intel *)data;
mc_size = get_totalsize(mc_header);
- if (!mc_size || mc_size > leftover ||
- microcode_sanity_check(ucode_ptr, 0) < 0)
+ if (!mc_size ||
+ mc_size > size ||
+ microcode_sanity_check(data, 0) < 0)
break;
- leftover -= mc_size;
+ size -= mc_size;
- /*
- * Since APs with same family and model as the BSP may boot in
- * the platform, we need to find and save microcode patches
- * with the same family and model as the BSP.
- */
- if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) {
- ucode_ptr += mc_size;
+ if (!microcode_matches(mc_header, uci->cpu_sig.sig)) {
+ data += mc_size;
continue;
}
- num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved);
+ if (save) {
+ save_microcode_patch(data, mc_size);
+ goto next;
+ }
- ucode_ptr += mc_size;
- }
- if (leftover) {
- state = UCODE_ERROR;
- return state;
- }
+ if (!patch) {
+ if (!has_newer_microcode(data,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf,
+ uci->cpu_sig.rev))
+ goto next;
- if (!num_saved) {
- state = UCODE_NFOUND;
- return state;
- }
+ } else {
+ struct microcode_header_intel *phdr = &patch->hdr;
+
+ if (!has_newer_microcode(data,
+ phdr->sig,
+ phdr->pf,
+ phdr->rev))
+ goto next;
+ }
- for (i = 0; i < num_saved; i++)
- mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start;
+ /* We have a newer patch, save it. */
+ patch = data;
- mcs->num_saved = num_saved;
+next:
+ data += mc_size;
+ }
+
+ if (size)
+ return NULL;
- return state;
+ return patch;
}
static int collect_cpu_info_early(struct ucode_cpu_info *uci)
{
unsigned int val[2];
unsigned int family, model;
- struct cpu_signature csig;
+ struct cpu_signature csig = { 0 };
unsigned int eax, ebx, ecx, edx;
- csig.sig = 0;
- csig.pf = 0;
- csig.rev = 0;
-
memset(uci, 0, sizeof(*uci));
eax = 0x00000001;
@@ -374,23 +382,16 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
native_cpuid(&eax, &ebx, &ecx, &edx);
csig.sig = eax;
- family = x86_family(csig.sig);
- model = x86_model(csig.sig);
+ family = x86_family(eax);
+ model = x86_model(eax);
if ((model >= 5) || (family > 6)) {
/* get processor flags from MSR 0x17 */
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
csig.pf = 1 << ((val[1] >> 18) & 7);
}
- native_wrmsrl(MSR_IA32_UCODE_REV, 0);
- /* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
-
- /* get the current revision from MSR 0x8B */
- native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
- csig.rev = val[1];
+ csig.rev = intel_get_microcode_revision();
uci->cpu_sig = csig;
uci->valid = 1;
@@ -401,40 +402,41 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
static void show_saved_mc(void)
{
#ifdef DEBUG
- int i, j;
+ int i = 0, j;
unsigned int sig, pf, rev, total_size, data_size, date;
struct ucode_cpu_info uci;
+ struct ucode_patch *p;
- if (!mc_saved_data.num_saved) {
+ if (list_empty(&microcode_cache)) {
pr_debug("no microcode data saved.\n");
return;
}
- pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved);
collect_cpu_info_early(&uci);
- sig = uci.cpu_sig.sig;
- pf = uci.cpu_sig.pf;
- rev = uci.cpu_sig.rev;
+ sig = uci.cpu_sig.sig;
+ pf = uci.cpu_sig.pf;
+ rev = uci.cpu_sig.rev;
pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
- for (i = 0; i < mc_saved_data.num_saved; i++) {
+ list_for_each_entry(p, &microcode_cache, plist) {
struct microcode_header_intel *mc_saved_header;
struct extended_sigtable *ext_header;
- int ext_sigcount;
struct extended_signature *ext_sig;
+ int ext_sigcount;
- mc_saved_header = (struct microcode_header_intel *)
- mc_saved_data.mc_saved[i];
- sig = mc_saved_header->sig;
- pf = mc_saved_header->pf;
- rev = mc_saved_header->rev;
- total_size = get_totalsize(mc_saved_header);
- data_size = get_datasize(mc_saved_header);
- date = mc_saved_header->date;
+ mc_saved_header = (struct microcode_header_intel *)p->data;
+
+ sig = mc_saved_header->sig;
+ pf = mc_saved_header->pf;
+ rev = mc_saved_header->rev;
+ date = mc_saved_header->date;
+
+ total_size = get_totalsize(mc_saved_header);
+ data_size = get_datasize(mc_saved_header);
pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n",
- i, sig, pf, rev, total_size,
+ i++, sig, pf, rev, total_size,
date & 0xffff,
date >> 24,
(date >> 16) & 0xff);
@@ -443,7 +445,7 @@ static void show_saved_mc(void)
if (total_size <= data_size + MC_HEADER_SIZE)
continue;
- ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
@@ -456,85 +458,43 @@ static void show_saved_mc(void)
ext_sig++;
}
-
}
#endif
}
/*
- * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
- * hot added or resumes.
- *
- * Please make sure this mc should be a valid microcode patch before calling
- * this function.
+ * Save this microcode patch. It will be loaded early when a CPU is
+ * hot-added or resumes.
*/
-static void save_mc_for_early(u8 *mc)
+static void save_mc_for_early(u8 *mc, unsigned int size)
{
#ifdef CONFIG_HOTPLUG_CPU
/* Synchronization during CPU hotplug. */
static DEFINE_MUTEX(x86_cpu_microcode_mutex);
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int mc_saved_count_init;
- unsigned int num_saved;
- struct microcode_intel **mc_saved;
- int ret, i;
-
mutex_lock(&x86_cpu_microcode_mutex);
- mc_saved_count_init = mc_saved_data.num_saved;
- num_saved = mc_saved_data.num_saved;
- mc_saved = mc_saved_data.mc_saved;
-
- if (mc_saved && num_saved)
- memcpy(mc_saved_tmp, mc_saved,
- num_saved * sizeof(struct microcode_intel *));
- /*
- * Save the microcode patch mc in mc_save_tmp structure if it's a newer
- * version.
- */
- num_saved = _save_mc(mc_saved_tmp, mc, num_saved);
-
- /*
- * Save the mc_save_tmp in global mc_saved_data.
- */
- ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved);
- if (ret) {
- pr_err("Cannot save microcode patch.\n");
- goto out;
- }
-
+ save_microcode_patch(mc, size);
show_saved_mc();
- /*
- * Free old saved microcode data.
- */
- if (mc_saved) {
- for (i = 0; i < mc_saved_count_init; i++)
- kfree(mc_saved[i]);
- kfree(mc_saved);
- }
-
-out:
mutex_unlock(&x86_cpu_microcode_mutex);
#endif
}
-static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
+static bool load_builtin_intel_microcode(struct cpio_data *cp)
{
-#ifdef CONFIG_X86_64
- unsigned int eax = 0x00000001, ebx, ecx = 0, edx;
+ unsigned int eax = 1, ebx, ecx = 0, edx;
char name[30];
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
native_cpuid(&eax, &ebx, &ecx, &edx);
sprintf(name, "intel-ucode/%02x-%02x-%02x",
x86_family(eax), x86_model(eax), x86_stepping(eax));
return get_builtin_firmware(cp, name);
-#else
- return false;
-#endif
}
/*
@@ -570,8 +530,7 @@ void show_ucode_info_early(void)
}
/*
- * At this point, we can not call printk() yet. Keep microcode patch number in
- * mc_saved_data.mc_saved and delay printing microcode info in
+ * At this point, we can not call printk() yet. Delay printing microcode info in
* show_ucode_info_early() until printk() works.
*/
static void print_ucode(struct ucode_cpu_info *uci)
@@ -616,7 +575,7 @@ static inline void print_ucode(struct ucode_cpu_info *uci)
static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
{
struct microcode_intel *mc;
- unsigned int val[2];
+ u32 rev;
mc = uci->mc;
if (!mc)
@@ -624,21 +583,16 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
/* write microcode via MSR 0x79 */
native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
- native_wrmsrl(MSR_IA32_UCODE_REV, 0);
-
- /* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
- /* get the current revision from MSR 0x8B */
- native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
- if (val[1] != mc->hdr.rev)
+ rev = intel_get_microcode_revision();
+ if (rev != mc->hdr.rev)
return -1;
#ifdef CONFIG_X86_64
/* Flush global tlb. This is precaution. */
flush_tlb_early();
#endif
- uci->cpu_sig.rev = val[1];
+ uci->cpu_sig.rev = rev;
if (early)
print_ucode(uci);
@@ -648,206 +602,133 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
return 0;
}
-/*
- * This function converts microcode patch offsets previously stored in
- * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data.
- */
int __init save_microcode_in_initrd_intel(void)
{
- struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
- unsigned int count = mc_saved_data.num_saved;
- unsigned long offset = 0;
- int ret;
+ struct ucode_cpu_info uci;
+ struct cpio_data cp;
+
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(ucode_path, false);
- if (!count)
+ if (!(cp.data && cp.size))
return 0;
- /*
- * We have found a valid initrd but it might've been relocated in the
- * meantime so get its updated address.
- */
- if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid)
- offset = initrd_start;
+ collect_cpu_info_early(&uci);
- copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count);
+ scan_microcode(cp.data, cp.size, &uci, true);
- ret = save_microcode(&mc_saved_data, mc_saved, count);
- if (ret)
- pr_err("Cannot save microcode patches from initrd.\n");
- else
- show_saved_mc();
+ show_saved_mc();
- return ret;
+ return 0;
}
-static __init enum ucode_state
-__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp)
+/*
+ * @res_patch, output: a pointer to the patch we found.
+ */
+static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
{
-#ifdef CONFIG_BLK_DEV_INITRD
- static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
- char *p = IS_ENABLED(CONFIG_X86_32) ? (char *)__pa_nodebug(ucode_name)
- : ucode_name;
-# ifdef CONFIG_X86_32
- unsigned long start = 0, size;
- struct boot_params *params;
-
- params = (struct boot_params *)__pa_nodebug(&boot_params);
- size = params->hdr.ramdisk_size;
-
- /*
- * Set start only if we have an initrd image. We cannot use initrd_start
- * because it is not set that early yet.
- */
- start = (size ? params->hdr.ramdisk_image : 0);
-
-# else /* CONFIG_X86_64 */
- unsigned long start = 0, size;
-
- size = (u64)boot_params.ext_ramdisk_size << 32;
- size |= boot_params.hdr.ramdisk_size;
-
- if (size) {
- start = (u64)boot_params.ext_ramdisk_image << 32;
- start |= boot_params.hdr.ramdisk_image;
+ static const char *path;
+ struct cpio_data cp;
+ bool use_pa;
- start += PAGE_OFFSET;
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ path = (const char *)__pa_nodebug(ucode_path);
+ use_pa = true;
+ } else {
+ path = ucode_path;
+ use_pa = false;
}
-# endif
-
- *cd = find_cpio_data(p, (void *)start, size, NULL);
- if (cd->data) {
- blbp->start = start;
- blbp->valid = true;
- return UCODE_OK;
- } else
-#endif /* CONFIG_BLK_DEV_INITRD */
- return UCODE_ERROR;
-}
+ /* try built-in microcode first */
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(path, use_pa);
-static __init enum ucode_state
-scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- struct ucode_cpu_info *uci, struct ucode_blobs *blbp)
-{
- struct cpio_data cd = { NULL, 0, "" };
- enum ucode_state ret;
+ if (!(cp.data && cp.size))
+ return NULL;
- /* try built-in microcode first */
- if (load_builtin_intel_microcode(&cd))
- /*
- * Invalidate blobs as we might've gotten an initrd too,
- * supplied by the boot loader, by mistake or simply forgotten
- * there. That's fine, we ignore it since we've found builtin
- * microcode already.
- */
- blbp->valid = false;
- else {
- ret = __scan_microcode_initrd(&cd, blbp);
- if (ret != UCODE_OK)
- return ret;
- }
+ collect_cpu_info_early(uci);
- return get_matching_model_microcode(blbp->start, cd.data, cd.size,
- mcs, mc_ptrs, uci);
+ return scan_microcode(cp.data, cp.size, uci, false);
}
-static void __init
-_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- struct ucode_blobs *blbp)
+void __init load_ucode_intel_bsp(void)
{
+ struct microcode_intel *patch;
struct ucode_cpu_info uci;
- enum ucode_state ret;
-
- collect_cpu_info_early(&uci);
- ret = scan_microcode(mcs, mc_ptrs, &uci, blbp);
- if (ret != UCODE_OK)
+ patch = __load_ucode_intel(&uci);
+ if (!patch)
return;
- ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci);
- if (ret != UCODE_OK)
- return;
+ uci.mc = patch;
apply_microcode_early(&uci, true);
}
-void __init load_ucode_intel_bsp(void)
+void load_ucode_intel_ap(void)
{
- struct ucode_blobs *blobs_p;
- struct mc_saved_data *mcs;
- unsigned long *ptrs;
+ struct microcode_intel *patch, **iup;
+ struct ucode_cpu_info uci;
-#ifdef CONFIG_X86_32
- mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
- ptrs = (unsigned long *)__pa_nodebug(&mc_tmp_ptrs);
- blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs);
-#else
- mcs = &mc_saved_data;
- ptrs = mc_tmp_ptrs;
- blobs_p = &blobs;
-#endif
+ if (IS_ENABLED(CONFIG_X86_32))
+ iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch);
+ else
+ iup = &intel_ucode_patch;
+
+reget:
+ if (!*iup) {
+ patch = __load_ucode_intel(&uci);
+ if (!patch)
+ return;
+
+ *iup = patch;
+ }
+
+ uci.mc = *iup;
- _load_ucode_intel_bsp(mcs, ptrs, blobs_p);
+ if (apply_microcode_early(&uci, true)) {
+ /* Mixed-silicon system? Try to refetch the proper patch: */
+ *iup = NULL;
+
+ goto reget;
+ }
}
-void load_ucode_intel_ap(void)
+static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
{
- struct ucode_blobs *blobs_p;
- unsigned long *ptrs, start = 0;
- struct mc_saved_data *mcs;
- struct ucode_cpu_info uci;
- enum ucode_state ret;
-
-#ifdef CONFIG_X86_32
- mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
- ptrs = (unsigned long *)__pa_nodebug(mc_tmp_ptrs);
- blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs);
-#else
- mcs = &mc_saved_data;
- ptrs = mc_tmp_ptrs;
- blobs_p = &blobs;
-#endif
+ struct microcode_header_intel *phdr;
+ struct ucode_patch *iter, *tmp;
- /*
- * If there is no valid ucode previously saved in memory, no need to
- * update ucode on this AP.
- */
- if (!mcs->num_saved)
- return;
+ list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
- if (blobs_p->valid) {
- start = blobs_p->start;
+ phdr = (struct microcode_header_intel *)iter->data;
- /*
- * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles
- * physmem mapping too and there we have the initrd.
- */
- start += PAGE_OFFSET - __PAGE_OFFSET_BASE;
- }
+ if (phdr->rev <= uci->cpu_sig.rev)
+ continue;
- collect_cpu_info_early(&uci);
- ret = load_microcode(mcs, ptrs, start, &uci);
- if (ret != UCODE_OK)
- return;
+ if (!find_matching_signature(phdr,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf))
+ continue;
- apply_microcode_early(&uci, true);
+ return iter->data;
+ }
+ return NULL;
}
void reload_ucode_intel(void)
{
+ struct microcode_intel *p;
struct ucode_cpu_info uci;
- enum ucode_state ret;
-
- if (!mc_saved_data.num_saved)
- return;
collect_cpu_info_early(&uci);
- ret = find_microcode_patch(mc_saved_data.mc_saved,
- mc_saved_data.num_saved, &uci);
- if (ret != UCODE_OK)
+ p = find_patch(&uci);
+ if (!p)
return;
+ uci.mc = p;
+
apply_microcode_early(&uci, false);
}
@@ -879,31 +760,13 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
return 0;
}
-/*
- * return 0 - no update found
- * return 1 - found update
- */
-static int get_matching_mc(struct microcode_intel *mc, int cpu)
-{
- struct cpu_signature cpu_sig;
- unsigned int csig, cpf, crev;
-
- collect_cpu_info(cpu, &cpu_sig);
-
- csig = cpu_sig.sig;
- cpf = cpu_sig.pf;
- crev = cpu_sig.rev;
-
- return has_newer_microcode(mc, csig, cpf, crev);
-}
-
static int apply_microcode_intel(int cpu)
{
struct microcode_intel *mc;
struct ucode_cpu_info *uci;
struct cpuinfo_x86 *c;
- unsigned int val[2];
static int prev_rev;
+ u32 rev;
/* We should bind the task to the CPU */
if (WARN_ON(raw_smp_processor_id() != cpu))
@@ -911,46 +774,37 @@ static int apply_microcode_intel(int cpu)
uci = ucode_cpu_info + cpu;
mc = uci->mc;
- if (!mc)
- return 0;
-
- /*
- * Microcode on this CPU could be updated earlier. Only apply the
- * microcode patch in mc when it is newer than the one on this
- * CPU.
- */
- if (!get_matching_mc(mc, cpu))
- return 0;
+ if (!mc) {
+ /* Look for a newer patch in our cache: */
+ mc = find_patch(uci);
+ if (!mc)
+ return 0;
+ }
/* write microcode via MSR 0x79 */
wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
- wrmsrl(MSR_IA32_UCODE_REV, 0);
-
- /* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+ rev = intel_get_microcode_revision();
- if (val[1] != mc->hdr.rev) {
+ if (rev != mc->hdr.rev) {
pr_err("CPU%d update to revision 0x%x failed\n",
cpu, mc->hdr.rev);
return -1;
}
- if (val[1] != prev_rev) {
+ if (rev != prev_rev) {
pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
- val[1],
+ rev,
mc->hdr.date & 0xffff,
mc->hdr.date >> 24,
(mc->hdr.date >> 16) & 0xff);
- prev_rev = val[1];
+ prev_rev = rev;
}
c = &cpu_data(cpu);
- uci->cpu_sig.rev = val[1];
- c->microcode = val[1];
+ uci->cpu_sig.rev = rev;
+ c->microcode = rev;
return 0;
}
@@ -962,8 +816,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
int new_rev = uci->cpu_sig.rev;
unsigned int leftover = size;
- enum ucode_state state = UCODE_OK;
- unsigned int curr_mc_size = 0;
+ unsigned int curr_mc_size = 0, new_mc_size = 0;
unsigned int csig, cpf;
while (leftover) {
@@ -1004,6 +857,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
vfree(new_mc);
new_rev = mc_header.rev;
new_mc = mc;
+ new_mc_size = mc_size;
mc = NULL; /* trigger new vmalloc */
}
@@ -1015,14 +869,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
if (leftover) {
vfree(new_mc);
- state = UCODE_ERROR;
- goto out;
+ return UCODE_ERROR;
}
- if (!new_mc) {
- state = UCODE_NFOUND;
- goto out;
- }
+ if (!new_mc)
+ return UCODE_NFOUND;
vfree(uci->mc);
uci->mc = (struct microcode_intel *)new_mc;
@@ -1032,12 +883,12 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
* permanent memory. So it will be loaded early when a CPU is hot added
* or resumes.
*/
- save_mc_for_early(new_mc);
+ save_mc_for_early(new_mc, new_mc_size);
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
-out:
- return state;
+
+ return UCODE_OK;
}
static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -1081,20 +932,11 @@ request_microcode_user(int cpu, const void __user *buf, size_t size)
return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
}
-static void microcode_fini_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- vfree(uci->mc);
- uci->mc = NULL;
-}
-
static struct microcode_ops microcode_intel_ops = {
.request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_fw,
.collect_cpu_info = collect_cpu_info,
.apply_microcode = apply_microcode_intel,
- .microcode_fini_cpu = microcode_fini_cpu,
};
struct microcode_ops * __init init_intel_microcode(void)
@@ -1109,4 +951,3 @@ struct microcode_ops * __init init_intel_microcode(void)
return &microcode_intel_ops;
}
-
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
deleted file mode 100644
index 406cb6c0d9dd..000000000000
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Intel CPU Microcode Update Driver for Linux
- *
- * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
- * H Peter Anvin" <hpa@zytor.com>
- *
- * This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- * Software Developer's Manual
- * Order Number 253668 or free download from:
- *
- * http://developer.intel.com/Assets/PDF/manual/253668.pdf
- *
- * For more information, go to http://www.urbanmyth.org/microcode
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-#include <linux/firmware.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-
-#include <asm/microcode_intel.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-
-static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
- unsigned int s2, unsigned int p2)
-{
- if (s1 != s2)
- return false;
-
- /* Processor flags are either both 0 ... */
- if (!p1 && !p2)
- return true;
-
- /* ... or they intersect. */
- return p1 & p2;
-}
-
-int microcode_sanity_check(void *mc, int print_err)
-{
- unsigned long total_size, data_size, ext_table_size;
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header = NULL;
- u32 sum, orig_sum, ext_sigcount = 0, i;
- struct extended_signature *ext_sig;
-
- total_size = get_totalsize(mc_header);
- data_size = get_datasize(mc_header);
-
- if (data_size + MC_HEADER_SIZE > total_size) {
- if (print_err)
- pr_err("Error: bad microcode data file size.\n");
- return -EINVAL;
- }
-
- if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
- if (print_err)
- pr_err("Error: invalid/unknown microcode update format.\n");
- return -EINVAL;
- }
-
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- u32 ext_table_sum = 0;
- u32 *ext_tablep;
-
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- if (print_err)
- pr_err("Error: truncated extended signature table.\n");
- return -EINVAL;
- }
-
- ext_header = mc + MC_HEADER_SIZE + data_size;
- if (ext_table_size != exttable_size(ext_header)) {
- if (print_err)
- pr_err("Error: extended signature table size mismatch.\n");
- return -EFAULT;
- }
-
- ext_sigcount = ext_header->count;
-
- /*
- * Check extended table checksum: the sum of all dwords that
- * comprise a valid table must be 0.
- */
- ext_tablep = (u32 *)ext_header;
-
- i = ext_table_size / sizeof(u32);
- while (i--)
- ext_table_sum += ext_tablep[i];
-
- if (ext_table_sum) {
- if (print_err)
- pr_warn("Bad extended signature table checksum, aborting.\n");
- return -EINVAL;
- }
- }
-
- /*
- * Calculate the checksum of update data and header. The checksum of
- * valid update data and header including the extended signature table
- * must be 0.
- */
- orig_sum = 0;
- i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
- while (i--)
- orig_sum += ((u32 *)mc)[i];
-
- if (orig_sum) {
- if (print_err)
- pr_err("Bad microcode data checksum, aborting.\n");
- return -EINVAL;
- }
-
- if (!ext_table_size)
- return 0;
-
- /*
- * Check extended signature checksum: 0 => valid.
- */
- for (i = 0; i < ext_sigcount; i++) {
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
- EXT_SIGNATURE_SIZE * i;
-
- sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
- (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
- if (sum) {
- if (print_err)
- pr_err("Bad extended signature checksum, aborting.\n");
- return -EINVAL;
- }
- }
- return 0;
-}
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int find_matching_signature(void *mc, unsigned int csig, int cpf)
-{
- struct microcode_header_intel *mc_hdr = mc;
- struct extended_sigtable *ext_hdr;
- struct extended_signature *ext_sig;
- int i;
-
- if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
- return 1;
-
- /* Look for ext. headers: */
- if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
- return 0;
-
- ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
- ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
-
- for (i = 0; i < ext_hdr->count; i++) {
- if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
- return 1;
- ext_sig++;
- }
- return 0;
-}
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
-{
- struct microcode_header_intel *mc_hdr = mc;
-
- if (mc_hdr->rev <= new_rev)
- return 0;
-
- return find_matching_signature(mc, csig, cpf);
-}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 8f44c5a50ab8..b5375b9497b3 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -25,12 +25,12 @@
#include <asm/hyperv.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
-#include <asm/idle.h>
#include <asm/irq_regs.h>
#include <asm/i8259.h>
#include <asm/apic.h>
#include <asm/timer.h>
#include <asm/reboot.h>
+#include <asm/nmi.h>
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -133,33 +133,38 @@ static uint32_t __init ms_hyperv_platform(void)
return 0;
}
-static cycle_t read_hv_clock(struct clocksource *arg)
+static unsigned char hv_get_nmi_reason(void)
{
- cycle_t current_tick;
- /*
- * Read the partition counter to get the current tick count. This count
- * is set to 0 when the partition is created and is incremented in
- * 100 nanosecond units.
- */
- rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
- return current_tick;
+ return 0;
}
-static struct clocksource hyperv_cs = {
- .name = "hyperv_clocksource",
- .rating = 400, /* use this when running on Hyperv*/
- .read = read_hv_clock,
- .mask = CLOCKSOURCE_MASK(64),
- .flags = CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-static unsigned char hv_get_nmi_reason(void)
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
+ * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle
+ * unknown NMI on the first CPU which gets it.
+ */
+static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
{
- return 0;
+ static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+
+ if (!unknown_nmi_panic)
+ return NMI_DONE;
+
+ if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1)
+ return NMI_HANDLED;
+
+ return NMI_DONE;
}
+#endif
static void __init ms_hyperv_init_platform(void)
{
+ int hv_host_info_eax;
+ int hv_host_info_ebx;
+ int hv_host_info_ecx;
+ int hv_host_info_edx;
+
/*
* Extract the features and hints
*/
@@ -170,6 +175,21 @@ static void __init ms_hyperv_init_platform(void)
pr_info("HyperV: features 0x%x, hints 0x%x\n",
ms_hyperv.features, ms_hyperv.hints);
+ /*
+ * Extract host information.
+ */
+ if (cpuid_eax(HVCPUID_VENDOR_MAXFUNCTION) >= HVCPUID_VERSION) {
+ hv_host_info_eax = cpuid_eax(HVCPUID_VERSION);
+ hv_host_info_ebx = cpuid_ebx(HVCPUID_VERSION);
+ hv_host_info_ecx = cpuid_ecx(HVCPUID_VERSION);
+ hv_host_info_edx = cpuid_edx(HVCPUID_VERSION);
+
+ pr_info("Hyper-V Host Build:%d-%d.%d-%d-%d.%d\n",
+ hv_host_info_eax, hv_host_info_ebx >> 16,
+ hv_host_info_ebx & 0xFFFF, hv_host_info_ecx,
+ hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF);
+ }
+
#ifdef CONFIG_X86_LOCAL_APIC
if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
/*
@@ -183,10 +203,10 @@ static void __init ms_hyperv_init_platform(void)
pr_info("HyperV: LAPIC Timer Frequency: %#x\n",
lapic_timer_frequency);
}
-#endif
- if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
- clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
+ register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
+ "hv_nmi_unknown");
+#endif
#ifdef CONFIG_X86_IO_APIC
no_timer_check = 1;
@@ -204,6 +224,13 @@ static void __init ms_hyperv_init_platform(void)
*/
if (efi_enabled(EFI_BOOT))
x86_platform.get_nmi_reason = hv_get_nmi_reason;
+
+#if IS_ENABLED(CONFIG_HYPERV)
+ /*
+ * Setup the hook to get control post apic initialization.
+ */
+ x86_platform.apic_post_init = hyperv_init;
+#endif
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 1db8dc490b66..d9794060fe22 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -17,11 +17,20 @@ struct cpuid_bit {
u32 sub_leaf;
};
-enum cpuid_regs {
- CR_EAX = 0,
- CR_ECX,
- CR_EDX,
- CR_EBX
+/* Please keep the leaf sorted by cpuid_bit.level for faster search. */
+static const struct cpuid_bit cpuid_bits[] = {
+ { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 },
+ { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 },
+ { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 },
+ { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
+ { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
+ { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
+ { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { 0, 0, 0, 0, 0 }
};
void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
@@ -30,18 +39,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
u32 regs[4];
const struct cpuid_bit *cb;
- static const struct cpuid_bit cpuid_bits[] = {
- { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 },
- { X86_FEATURE_AVX512_4VNNIW, CR_EDX, 2, 0x00000007, 0 },
- { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 },
- { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
- { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
- { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
- { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
- { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 },
- { 0, 0, 0, 0, 0 }
- };
-
for (cb = cpuid_bits; cb->feature; cb++) {
/* Verify that the level is valid */
@@ -50,10 +47,35 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
max_level > (cb->level | 0xffff))
continue;
- cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
- &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
+ cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX],
+ &regs[CPUID_EBX], &regs[CPUID_ECX],
+ &regs[CPUID_EDX]);
if (regs[cb->reg] & (1 << cb->bit))
set_cpu_cap(c, cb->feature);
}
}
+
+u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf,
+ enum cpuid_regs_idx reg)
+{
+ const struct cpuid_bit *cb;
+ u32 cpuid_val = 0;
+
+ for (cb = cpuid_bits; cb->feature; cb++) {
+
+ if (level > cb->level)
+ continue;
+
+ if (level < cb->level)
+ break;
+
+ if (reg == cb->reg && sub_leaf == cb->sub_leaf) {
+ if (cpu_has(&boot_cpu_data, cb->feature))
+ cpuid_val |= BIT(cb->bit);
+ }
+ }
+
+ return cpuid_val;
+}
+EXPORT_SYMBOL_GPL(get_scattered_cpuid_leaf);
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 34178564be2a..d77d07ab310b 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,4 +1,6 @@
#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/mm.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 5130985b758b..22403a28caf5 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,12 +24,16 @@
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/export.h>
+#include <linux/clocksource.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
#include <asm/timer.h>
#include <asm/apic.h>
+#undef pr_fmt
+#define pr_fmt(fmt) "vmware: " fmt
+
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
#define VMWARE_HYPERVISOR_PORT 0x5658
@@ -48,6 +52,8 @@
"2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
"memory");
+static unsigned long vmware_tsc_khz __ro_after_init;
+
static inline int __vmware_platform(void)
{
uint32_t eax, ebx, ecx, edx;
@@ -57,35 +63,80 @@ static inline int __vmware_platform(void)
static unsigned long vmware_get_tsc_khz(void)
{
- uint64_t tsc_hz, lpj;
- uint32_t eax, ebx, ecx, edx;
+ return vmware_tsc_khz;
+}
- VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+#ifdef CONFIG_PARAVIRT
+static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
+static int vmw_sched_clock __initdata = 1;
- tsc_hz = eax | (((uint64_t)ebx) << 32);
- do_div(tsc_hz, 1000);
- BUG_ON(tsc_hz >> 32);
- pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
- (unsigned long) tsc_hz / 1000,
- (unsigned long) tsc_hz % 1000);
-
- if (!preset_lpj) {
- lpj = ((u64)tsc_hz * 1000);
- do_div(lpj, HZ);
- preset_lpj = lpj;
- }
+static __init int setup_vmw_sched_clock(char *s)
+{
+ vmw_sched_clock = 0;
+ return 0;
+}
+early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
- return tsc_hz;
+static unsigned long long vmware_sched_clock(void)
+{
+ unsigned long long ns;
+
+ ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
+ vmware_cyc2ns.cyc2ns_shift);
+ ns -= vmware_cyc2ns.cyc2ns_offset;
+ return ns;
}
+static void __init vmware_sched_clock_setup(void)
+{
+ struct cyc2ns_data *d = &vmware_cyc2ns;
+ unsigned long long tsc_now = rdtsc();
+
+ clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift,
+ vmware_tsc_khz, NSEC_PER_MSEC, 0);
+ d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
+ d->cyc2ns_shift);
+
+ pv_time_ops.sched_clock = vmware_sched_clock;
+ pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset);
+}
+
+static void __init vmware_paravirt_ops_setup(void)
+{
+ pv_info.name = "VMware hypervisor";
+ pv_cpu_ops.io_delay = paravirt_nop;
+
+ if (vmware_tsc_khz && vmw_sched_clock)
+ vmware_sched_clock_setup();
+}
+#else
+#define vmware_paravirt_ops_setup() do {} while (0)
+#endif
+
static void __init vmware_platform_setup(void)
{
uint32_t eax, ebx, ecx, edx;
+ uint64_t lpj, tsc_khz;
VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
if (ebx != UINT_MAX) {
+ lpj = tsc_khz = eax | (((uint64_t)ebx) << 32);
+ do_div(tsc_khz, 1000);
+ WARN_ON(tsc_khz >> 32);
+ pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
+ (unsigned long) tsc_khz / 1000,
+ (unsigned long) tsc_khz % 1000);
+
+ if (!preset_lpj) {
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+ }
+
+ vmware_tsc_khz = tsc_khz;
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
+ x86_platform.calibrate_cpu = vmware_get_tsc_khz;
+
#ifdef CONFIG_X86_LOCAL_APIC
/* Skip lapic calibration since we know the bus frequency. */
lapic_timer_frequency = ecx / HZ;
@@ -96,6 +147,8 @@ static void __init vmware_platform_setup(void)
pr_warn("Failed to get TSC freq from the hypervisor\n");
}
+ vmware_paravirt_ops_setup();
+
#ifdef CONFIG_X86_IO_APIC
no_timer_check = 1;
#endif
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2836de390f95..0931a105ffe1 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -45,10 +45,7 @@
#include <asm/msr.h>
static struct class *cpuid_class;
-
-struct cpuid_regs {
- u32 eax, ebx, ecx, edx;
-};
+static enum cpuhp_state cpuhp_cpuid_state;
static void cpuid_smp_cpuid(void *cmd_block)
{
@@ -115,7 +112,7 @@ static const struct file_operations cpuid_fops = {
.open = cpuid_open,
};
-static int cpuid_device_create(int cpu)
+static int cpuid_device_create(unsigned int cpu)
{
struct device *dev;
@@ -124,35 +121,12 @@ static int cpuid_device_create(int cpu)
return PTR_ERR_OR_ZERO(dev);
}
-static void cpuid_device_destroy(int cpu)
+static int cpuid_device_destroy(unsigned int cpu)
{
device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+ return 0;
}
-static int cpuid_class_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- int err = 0;
-
- switch (action) {
- case CPU_UP_PREPARE:
- err = cpuid_device_create(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- cpuid_device_destroy(cpu);
- break;
- }
- return notifier_from_errno(err);
-}
-
-static struct notifier_block cpuid_class_cpu_notifier =
-{
- .notifier_call = cpuid_class_cpu_callback,
-};
-
static char *cpuid_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
@@ -160,15 +134,13 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode)
static int __init cpuid_init(void)
{
- int i, err = 0;
- i = 0;
+ int err;
if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
"cpu/cpuid", &cpuid_fops)) {
printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
CPUID_MAJOR);
- err = -EBUSY;
- goto out;
+ return -EBUSY;
}
cpuid_class = class_create(THIS_MODULE, "cpuid");
if (IS_ERR(cpuid_class)) {
@@ -177,45 +149,28 @@ static int __init cpuid_init(void)
}
cpuid_class->devnode = cpuid_devnode;
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- err = cpuid_device_create(i);
- if (err != 0)
- goto out_class;
- }
- __register_hotcpu_notifier(&cpuid_class_cpu_notifier);
- cpu_notifier_register_done();
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online",
+ cpuid_device_create, cpuid_device_destroy);
+ if (err < 0)
+ goto out_class;
- err = 0;
- goto out;
+ cpuhp_cpuid_state = err;
+ return 0;
out_class:
- i = 0;
- for_each_online_cpu(i) {
- cpuid_device_destroy(i);
- }
- cpu_notifier_register_done();
class_destroy(cpuid_class);
out_chrdev:
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
-out:
return err;
}
+module_init(cpuid_init);
static void __exit cpuid_exit(void)
{
- int cpu = 0;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- cpuid_device_destroy(cpu);
+ cpuhp_remove_state(cpuhp_cpuid_state);
class_destroy(cpuid_class);
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
- __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
- cpu_notifier_register_done();
}
-
-module_init(cpuid_init);
module_exit(cpuid_exit);
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 650830e39e3a..3741461c63a0 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -631,9 +631,9 @@ static int determine_backup_region(u64 start, u64 end, void *arg)
int crash_load_segments(struct kimage *image)
{
- unsigned long src_start, src_sz, elf_sz;
- void *elf_addr;
int ret;
+ struct kexec_buf kbuf = { .image = image, .buf_min = 0,
+ .buf_max = ULONG_MAX, .top_down = false };
/*
* Determine and load a segment for backup area. First 640K RAM
@@ -647,43 +647,44 @@ int crash_load_segments(struct kimage *image)
if (ret < 0)
return ret;
- src_start = image->arch.backup_src_start;
- src_sz = image->arch.backup_src_sz;
-
/* Add backup segment. */
- if (src_sz) {
+ if (image->arch.backup_src_sz) {
+ kbuf.buffer = &crash_zero_bytes;
+ kbuf.bufsz = sizeof(crash_zero_bytes);
+ kbuf.memsz = image->arch.backup_src_sz;
+ kbuf.buf_align = PAGE_SIZE;
/*
* Ideally there is no source for backup segment. This is
* copied in purgatory after crash. Just add a zero filled
* segment for now to make sure checksum logic works fine.
*/
- ret = kexec_add_buffer(image, (char *)&crash_zero_bytes,
- sizeof(crash_zero_bytes), src_sz,
- PAGE_SIZE, 0, -1, 0,
- &image->arch.backup_load_addr);
+ ret = kexec_add_buffer(&kbuf);
if (ret)
return ret;
+ image->arch.backup_load_addr = kbuf.mem;
pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
- image->arch.backup_load_addr, src_start, src_sz);
+ image->arch.backup_load_addr,
+ image->arch.backup_src_start, kbuf.memsz);
}
/* Prepare elf headers and add a segment */
- ret = prepare_elf_headers(image, &elf_addr, &elf_sz);
+ ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz);
if (ret)
return ret;
- image->arch.elf_headers = elf_addr;
- image->arch.elf_headers_sz = elf_sz;
+ image->arch.elf_headers = kbuf.buffer;
+ image->arch.elf_headers_sz = kbuf.bufsz;
- ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz,
- ELF_CORE_HEADER_ALIGN, 0, -1, 0,
- &image->arch.elf_load_addr);
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ ret = kexec_add_buffer(&kbuf);
if (ret) {
vfree((void *)image->arch.elf_headers);
return ret;
}
+ image->arch.elf_load_addr = kbuf.mem;
pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- image->arch.elf_load_addr, elf_sz, elf_sz);
+ image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz);
return ret;
}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 11891ca7b716..538fedea9b3f 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -10,7 +10,7 @@
#include <linux/highmem.h>
#include <linux/crash_dump.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
static void *kdump_buf_page;
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index f6dfd9334b67..f9c324e08d85 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -1,9 +1,10 @@
#include <linux/mm.h>
#include <linux/sched.h>
+#include <linux/sched/debug.h>
#include <linux/init_task.h>
#include <linux/fs.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 85f854b98a9d..09d4ac0d2661 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -10,6 +10,8 @@
#include <linux/kdebug.h>
#include <linux/module.h>
#include <linux/ptrace.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
#include <linux/ftrace.h>
#include <linux/kexec.h>
#include <linux/bug.h>
@@ -22,7 +24,6 @@
int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
unsigned int code_bytes = 64;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
bool in_task_stack(unsigned long *stack, struct task_struct *task,
@@ -46,14 +47,7 @@ static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl)
{
touch_nmi_watchdog();
- printk("%s [<%p>] %s%pB\n",
- log_lvl, (void *)address, reliable ? "" : "? ",
- (void *)address);
-}
-
-void printk_address(unsigned long address)
-{
- pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address);
+ printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
@@ -67,6 +61,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%sCall Trace:\n", log_lvl);
unwind_start(&state, task, regs, stack);
+ stack = stack ? : get_stack_pointer(task, regs);
/*
* Iterate through the stacks, starting with the current stack pointer.
@@ -82,8 +77,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - softirq stack
* - hardirq stack
*/
- for (; stack; stack = stack_info.next_sp) {
- const char *str_begin, *str_end;
+ for (regs = NULL; stack; stack = stack_info.next_sp) {
+ const char *stack_name;
/*
* If we overflowed the task stack into a guard page, jump back
@@ -95,9 +90,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
- stack_type_str(stack_info.type, &str_begin, &str_end);
- if (str_begin)
- printk("%s <%s> ", log_lvl, str_begin);
+ stack_name = stack_type_name(stack_info.type);
+ if (stack_name)
+ printk("%s <%s>\n", log_lvl, stack_name);
/*
* Scan the stack, printing any text addresses we find. At the
@@ -119,6 +114,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (!__kernel_text_address(addr))
continue;
+ /*
+ * Don't print regs->ip again if it was already printed
+ * by __show_regs() below.
+ */
+ if (regs && stack == &regs->ip) {
+ unwind_next_frame(&state);
+ continue;
+ }
+
if (stack == ret_addr_p)
reliable = 1;
@@ -146,10 +150,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* of the addresses will just be printed as unreliable.
*/
unwind_next_frame(&state);
+
+ /* if the frame has entry regs, print them */
+ regs = unwind_get_entry_regs(&state);
+ if (regs)
+ __show_regs(regs, 0);
}
- if (str_end)
- printk("%s <%s> ", log_lvl, str_end);
+ if (stack_name)
+ printk("%s </%s>\n", log_lvl, stack_name);
}
}
@@ -164,12 +173,12 @@ void show_stack(struct task_struct *task, unsigned long *sp)
if (!sp && task == current)
sp = get_stack_pointer(current, NULL);
- show_stack_log_lvl(task, NULL, sp, "");
+ show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT);
}
void show_stack_regs(struct pt_regs *regs)
{
- show_stack_log_lvl(current, regs, NULL, "");
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
}
static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -261,14 +270,11 @@ int __die(const char *str, struct pt_regs *regs, long err)
sp = kernel_stack_pointer(regs);
savesegment(ss, ss);
}
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
+ printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n",
+ (void *)regs->ip, ss, sp);
#else
/* Executive summary in case the oops scrolled away */
- printk(KERN_ALERT "RIP ");
- printk_address(regs->ip);
- printk(" RSP <%016lx>\n", regs->sp);
+ printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp);
#endif
return 0;
}
@@ -291,22 +297,6 @@ void die(const char *str, struct pt_regs *regs, long err)
oops_end(flags, regs, sig);
}
-static int __init kstack_setup(char *s)
-{
- ssize_t ret;
- unsigned long val;
-
- if (!s)
- return -EINVAL;
-
- ret = kstrtoul(s, 0, &val);
- if (ret)
- return ret;
- kstack_depth_to_print = val;
- return 0;
-}
-early_param("kstack", kstack_setup);
-
static int __init code_bytes_setup(char *s)
{
ssize_t ret;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 06eb322b5f9f..b0b3a3df7c20 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -2,6 +2,7 @@
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
*/
+#include <linux/sched/debug.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
@@ -16,18 +17,15 @@
#include <asm/stacktrace.h>
-void stack_type_str(enum stack_type type, const char **begin, const char **end)
+const char *stack_type_name(enum stack_type type)
{
- switch (type) {
- case STACK_TYPE_IRQ:
- case STACK_TYPE_SOFTIRQ:
- *begin = "IRQ";
- *end = "EOI";
- break;
- default:
- *begin = NULL;
- *end = NULL;
- }
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
+
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
+
+ return NULL;
}
static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
@@ -109,8 +107,10 @@ recursion_check:
* just break out and report an unknown stack type.
*/
if (visit_mask) {
- if (*visit_mask & (1UL << info->type))
+ if (*visit_mask & (1UL << info->type)) {
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
goto unknown;
+ }
*visit_mask |= 1UL << info->type;
}
@@ -121,36 +121,6 @@ unknown:
return -EINVAL;
}
-void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, char *log_lvl)
-{
- unsigned long *stack;
- int i;
-
- if (!try_get_task_stack(task))
- return;
-
- sp = sp ? : get_stack_pointer(task, regs);
-
- stack = sp;
- for (i = 0; i < kstack_depth_to_print; i++) {
- if (kstack_end(stack))
- break;
- if ((i % STACKSLOTS_PER_LINE) == 0) {
- if (i != 0)
- pr_cont("\n");
- printk("%s %08lx", log_lvl, *stack++);
- } else
- pr_cont(" %08lx", *stack++);
- touch_nmi_watchdog();
- }
- pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, log_lvl);
-
- put_task_stack(task);
-}
-
-
void show_regs(struct pt_regs *regs)
{
int i;
@@ -168,8 +138,7 @@ void show_regs(struct pt_regs *regs)
unsigned char c;
u8 *ip;
- pr_emerg("Stack:\n");
- show_stack_log_lvl(current, regs, NULL, KERN_EMERG);
+ show_trace_log_lvl(current, regs, NULL, KERN_EMERG);
pr_emerg("Code:");
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 36cf1a498227..a8b117e93b46 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -2,6 +2,7 @@
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
*/
+#include <linux/sched/debug.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
@@ -28,23 +29,17 @@ static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = {
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-void stack_type_str(enum stack_type type, const char **begin, const char **end)
+const char *stack_type_name(enum stack_type type)
{
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
- switch (type) {
- case STACK_TYPE_IRQ:
- *begin = "IRQ";
- *end = "EOI";
- break;
- case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST:
- *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION];
- *end = "EOE";
- break;
- default:
- *begin = NULL;
- *end = NULL;
- }
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
+
+ if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
+ return exception_stack_names[type - STACK_TYPE_EXCEPTION];
+
+ return NULL;
}
static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
@@ -128,8 +123,10 @@ recursion_check:
* just break out and report an unknown stack type.
*/
if (visit_mask) {
- if (*visit_mask & (1UL << info->type))
+ if (*visit_mask & (1UL << info->type)) {
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
goto unknown;
+ }
*visit_mask |= 1UL << info->type;
}
@@ -140,56 +137,6 @@ unknown:
return -EINVAL;
}
-void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, char *log_lvl)
-{
- unsigned long *irq_stack_end;
- unsigned long *irq_stack;
- unsigned long *stack;
- int i;
-
- if (!try_get_task_stack(task))
- return;
-
- irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr);
- irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long));
-
- sp = sp ? : get_stack_pointer(task, regs);
-
- stack = sp;
- for (i = 0; i < kstack_depth_to_print; i++) {
- unsigned long word;
-
- if (stack >= irq_stack && stack <= irq_stack_end) {
- if (stack == irq_stack_end) {
- stack = (unsigned long *) (irq_stack_end[-1]);
- pr_cont(" <EOI> ");
- }
- } else {
- if (kstack_end(stack))
- break;
- }
-
- if (probe_kernel_address(stack, word))
- break;
-
- if ((i % STACKSLOTS_PER_LINE) == 0) {
- if (i != 0)
- pr_cont("\n");
- printk("%s %016lx", log_lvl, word);
- } else
- pr_cont(" %016lx", word);
-
- stack++;
- touch_nmi_watchdog();
- }
-
- pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, log_lvl);
-
- put_task_stack(task);
-}
-
void show_regs(struct pt_regs *regs)
{
int i;
@@ -207,8 +154,7 @@ void show_regs(struct pt_regs *regs)
unsigned char c;
u8 *ip;
- printk(KERN_DEFAULT "Stack:\n");
- show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT);
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
printk(KERN_DEFAULT "Code: ");
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 90e8dde3ec26..b2bbad6ebe4d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -580,24 +580,19 @@ static void __init update_e820_saved(void)
}
#define MAX_GAP_END 0x100000000ull
/*
- * Search for a gap in the e820 memory space from start_addr to end_addr.
+ * Search for a gap in the e820 memory space from 0 to MAX_GAP_END.
*/
-__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
- unsigned long start_addr, unsigned long long end_addr)
+static int __init e820_search_gap(unsigned long *gapstart,
+ unsigned long *gapsize)
{
- unsigned long long last;
+ unsigned long long last = MAX_GAP_END;
int i = e820->nr_map;
int found = 0;
- last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
-
while (--i >= 0) {
unsigned long long start = e820->map[i].addr;
unsigned long long end = start + e820->map[i].size;
- if (end < start_addr)
- continue;
-
/*
* Since "last" is at most 4GB, we know we'll
* fit in 32 bits if this condition is true
@@ -628,18 +623,19 @@ __init void e820_setup_gap(void)
unsigned long gapstart, gapsize;
int found;
- gapstart = 0x10000000;
gapsize = 0x400000;
- found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
+ found = e820_search_gap(&gapstart, &gapsize);
-#ifdef CONFIG_X86_64
if (!found) {
+#ifdef CONFIG_X86_64
gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
printk(KERN_ERR
"e820: cannot find a gap in the 32bit address range\n"
"e820: PCI devices with unassigned 32bit BARs may break!\n");
- }
+#else
+ gapstart = 0x10000000;
#endif
+ }
/*
* e820_reserve_resources_late protect stolen RAM already
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
index aad34aafc0e0..d913047f832c 100644
--- a/arch/x86/kernel/fpu/bugs.c
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -23,17 +23,12 @@ static double __initdata y = 3145727.0;
*/
void __init fpu__init_check_bugs(void)
{
- u32 cr0_saved;
s32 fdiv_bug;
/* kernel_fpu_begin/end() relies on patched alternative instructions. */
if (!boot_cpu_has(X86_FEATURE_FPU))
return;
- /* We might have CR0::TS set already, clear it: */
- cr0_saved = read_cr0();
- write_cr0(cr0_saved & ~X86_CR0_TS);
-
kernel_fpu_begin();
/*
@@ -56,8 +51,6 @@ void __init fpu__init_check_bugs(void)
kernel_fpu_end();
- write_cr0(cr0_saved);
-
if (fdiv_bug) {
set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
pr_warn("Hmm, FPU with FDIV bug\n");
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index ebb4e95fbd74..e1114f070c2d 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -58,27 +58,9 @@ static bool kernel_fpu_disabled(void)
return this_cpu_read(in_kernel_fpu);
}
-/*
- * Were we in an interrupt that interrupted kernel mode?
- *
- * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that
- * pair does nothing at all: the thread must not have fpu (so
- * that we don't try to save the FPU state), and TS must
- * be set (so that the clts/stts pair does nothing that is
- * visible in the interrupted kernel thread).
- *
- * Except for the eagerfpu case when we return true; in the likely case
- * the thread has FPU but we are not going to set/clear TS.
- */
static bool interrupted_kernel_fpu_idle(void)
{
- if (kernel_fpu_disabled())
- return false;
-
- if (use_eager_fpu())
- return true;
-
- return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS);
+ return !kernel_fpu_disabled();
}
/*
@@ -125,8 +107,7 @@ void __kernel_fpu_begin(void)
*/
copy_fpregs_to_fpstate(fpu);
} else {
- this_cpu_write(fpu_fpregs_owner_ctx, NULL);
- __fpregs_activate_hw();
+ __cpu_invalidate_fpregs_state();
}
}
EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -137,8 +118,6 @@ void __kernel_fpu_end(void)
if (fpu->fpregs_active)
copy_kernel_to_fpregs(&fpu->state);
- else
- __fpregs_deactivate_hw();
kernel_fpu_enable();
}
@@ -159,35 +138,6 @@ void kernel_fpu_end(void)
EXPORT_SYMBOL_GPL(kernel_fpu_end);
/*
- * CR0::TS save/restore functions:
- */
-int irq_ts_save(void)
-{
- /*
- * If in process context and not atomic, we can take a spurious DNA fault.
- * Otherwise, doing clts() in process context requires disabling preemption
- * or some heavy lifting like kernel_fpu_begin()
- */
- if (!in_atomic())
- return 0;
-
- if (read_cr0() & X86_CR0_TS) {
- clts();
- return 1;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(irq_ts_save);
-
-void irq_ts_restore(int TS_state)
-{
- if (TS_state)
- stts();
-}
-EXPORT_SYMBOL_GPL(irq_ts_restore);
-
-/*
* Save the FPU state (mark it for reload if necessary):
*
* This only ever gets called for the current task.
@@ -200,10 +150,7 @@ void fpu__save(struct fpu *fpu)
trace_x86_fpu_before_save(fpu);
if (fpu->fpregs_active) {
if (!copy_fpregs_to_fpstate(fpu)) {
- if (use_eager_fpu())
- copy_kernel_to_fpregs(&fpu->state);
- else
- fpregs_deactivate(fpu);
+ copy_kernel_to_fpregs(&fpu->state);
}
}
trace_x86_fpu_after_save(fpu);
@@ -231,13 +178,8 @@ void fpstate_init(union fpregs_state *state)
memset(state, 0, fpu_kernel_xstate_size);
- /*
- * XRSTORS requires that this bit is set in xcomp_bv, or
- * it will #GP. Make sure it is replaced after the memset().
- */
if (static_cpu_has(X86_FEATURE_XSAVES))
- state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT;
-
+ fpstate_init_xstate(&state->xsave);
if (static_cpu_has(X86_FEATURE_FXSR))
fpstate_init_fxstate(&state->fxsave);
else
@@ -247,7 +189,6 @@ EXPORT_SYMBOL_GPL(fpstate_init);
int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
{
- dst_fpu->counter = 0;
dst_fpu->fpregs_active = 0;
dst_fpu->last_cpu = -1;
@@ -260,8 +201,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
* Don't let 'init optimized' areas of the XSAVE area
* leak into the child task:
*/
- if (use_eager_fpu())
- memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+ memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
/*
* Save current FPU registers directly into the child
@@ -283,10 +223,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
memcpy(&src_fpu->state, &dst_fpu->state,
fpu_kernel_xstate_size);
- if (use_eager_fpu())
- copy_kernel_to_fpregs(&src_fpu->state);
- else
- fpregs_deactivate(src_fpu);
+ copy_kernel_to_fpregs(&src_fpu->state);
}
preempt_enable();
@@ -366,7 +303,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
if (fpu->fpstate_active) {
/* Invalidate any lazy state: */
- fpu->last_cpu = -1;
+ __fpu_invalidate_fpregs_state(fpu);
} else {
fpstate_init(&fpu->state);
trace_x86_fpu_init_state(fpu);
@@ -409,7 +346,7 @@ void fpu__current_fpstate_write_begin(void)
* ensures we will not be lazy and skip a XRSTOR in the
* future.
*/
- fpu->last_cpu = -1;
+ __fpu_invalidate_fpregs_state(fpu);
}
/*
@@ -459,7 +396,6 @@ void fpu__restore(struct fpu *fpu)
trace_x86_fpu_before_restore(fpu);
fpregs_activate(fpu);
copy_kernel_to_fpregs(&fpu->state);
- fpu->counter++;
trace_x86_fpu_after_restore(fpu);
kernel_fpu_enable();
}
@@ -477,7 +413,6 @@ EXPORT_SYMBOL_GPL(fpu__restore);
void fpu__drop(struct fpu *fpu)
{
preempt_disable();
- fpu->counter = 0;
if (fpu->fpregs_active) {
/* Ignore delayed exceptions from user space */
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 2f2b8c7ccb85..c2f8dde3255c 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -7,21 +7,10 @@
#include <asm/cmdline.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
#include <linux/init.h>
/*
- * Initialize the TS bit in CR0 according to the style of context-switches
- * we are using:
- */
-static void fpu__init_cpu_ctx_switch(void)
-{
- if (!boot_cpu_has(X86_FEATURE_EAGER_FPU))
- stts();
- else
- clts();
-}
-
-/*
* Initialize the registers found in all CPUs, CR0 and CR4:
*/
static void fpu__init_cpu_generic(void)
@@ -58,16 +47,9 @@ void fpu__init_cpu(void)
{
fpu__init_cpu_generic();
fpu__init_cpu_xstate();
- fpu__init_cpu_ctx_switch();
}
-/*
- * The earliest FPU detection code.
- *
- * Set the X86_FEATURE_FPU CPU-capability bit based on
- * trying to execute an actual sequence of FPU instructions:
- */
-static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+static bool fpu__probe_without_cpuid(void)
{
unsigned long cr0;
u16 fsw, fcw;
@@ -78,18 +60,25 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
write_cr0(cr0);
- if (!test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
- asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
- : "+m" (fsw), "+m" (fcw));
+ asm volatile("fninit ; fnstsw %0 ; fnstcw %1" : "+m" (fsw), "+m" (fcw));
+
+ pr_info("x86/fpu: Probing for FPU: FSW=0x%04hx FCW=0x%04hx\n", fsw, fcw);
- if (fsw == 0 && (fcw & 0x103f) == 0x003f)
- set_cpu_cap(c, X86_FEATURE_FPU);
+ return fsw == 0 && (fcw & 0x103f) == 0x003f;
+}
+
+static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+{
+ if (!boot_cpu_has(X86_FEATURE_CPUID) &&
+ !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) {
+ if (fpu__probe_without_cpuid())
+ setup_force_cpu_cap(X86_FEATURE_FPU);
else
- clear_cpu_cap(c, X86_FEATURE_FPU);
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
}
#ifndef CONFIG_MATH_EMULATION
- if (!boot_cpu_has(X86_FEATURE_FPU)) {
+ if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_FPU)) {
pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n");
for (;;)
asm volatile("hlt");
@@ -233,82 +222,16 @@ static void __init fpu__init_system_xstate_size_legacy(void)
}
/*
- * FPU context switching strategies:
- *
- * Against popular belief, we don't do lazy FPU saves, due to the
- * task migration complications it brings on SMP - we only do
- * lazy FPU restores.
- *
- * 'lazy' is the traditional strategy, which is based on setting
- * CR0::TS to 1 during context-switch (instead of doing a full
- * restore of the FPU state), which causes the first FPU instruction
- * after the context switch (whenever it is executed) to fault - at
- * which point we lazily restore the FPU state into FPU registers.
- *
- * Tasks are of course under no obligation to execute FPU instructions,
- * so it can easily happen that another context-switch occurs without
- * a single FPU instruction being executed. If we eventually switch
- * back to the original task (that still owns the FPU) then we have
- * not only saved the restores along the way, but we also have the
- * FPU ready to be used for the original task.
- *
- * 'lazy' is deprecated because it's almost never a performance win
- * and it's much more complicated than 'eager'.
- *
- * 'eager' switching is by default on all CPUs, there we switch the FPU
- * state during every context switch, regardless of whether the task
- * has used FPU instructions in that time slice or not. This is done
- * because modern FPU context saving instructions are able to optimize
- * state saving and restoration in hardware: they can detect both
- * unused and untouched FPU state and optimize accordingly.
- *
- * [ Note that even in 'lazy' mode we might optimize context switches
- * to use 'eager' restores, if we detect that a task is using the FPU
- * frequently. See the fpu->counter logic in fpu/internal.h for that. ]
- */
-static enum { ENABLE, DISABLE } eagerfpu = ENABLE;
-
-/*
* Find supported xfeatures based on cpu features and command-line input.
* This must be called after fpu__init_parse_early_param() is called and
* xfeatures_mask is enumerated.
*/
u64 __init fpu__get_supported_xfeatures_mask(void)
{
- /* Support all xfeatures known to us */
- if (eagerfpu != DISABLE)
- return XCNTXT_MASK;
-
- /* Warning of xfeatures being disabled for no eagerfpu mode */
- if (xfeatures_mask & XFEATURE_MASK_EAGER) {
- pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
- xfeatures_mask & XFEATURE_MASK_EAGER);
- }
-
- /* Return a mask that masks out all features requiring eagerfpu mode */
- return ~XFEATURE_MASK_EAGER;
+ return XCNTXT_MASK;
}
-/*
- * Disable features dependent on eagerfpu.
- */
-static void __init fpu__clear_eager_fpu_features(void)
-{
- setup_clear_cpu_cap(X86_FEATURE_MPX);
-}
-
-/*
- * Pick the FPU context switching strategy:
- *
- * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
- * the following is true:
- *
- * (1) the cpu has xsaveopt, as it has the optimization and doing eager
- * FPU switching has a relatively low cost compared to a plain xsave;
- * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
- * switching. Should the kernel boot with noxsaveopt, we support MPX
- * with eager FPU switching at a higher cost.
- */
+/* Legacy code to initialize eager fpu mode. */
static void __init fpu__init_system_ctx_switch(void)
{
static bool on_boot_cpu __initdata = 1;
@@ -317,17 +240,6 @@ static void __init fpu__init_system_ctx_switch(void)
on_boot_cpu = 0;
WARN_ON_FPU(current->thread.fpu.fpstate_active);
-
- if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
- eagerfpu = ENABLE;
-
- if (xfeatures_mask & XFEATURE_MASK_EAGER)
- eagerfpu = ENABLE;
-
- if (eagerfpu == ENABLE)
- setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
-
- printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy");
}
/*
@@ -336,11 +248,6 @@ static void __init fpu__init_system_ctx_switch(void)
*/
static void __init fpu__init_parse_early_param(void)
{
- if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
- eagerfpu = DISABLE;
- fpu__clear_eager_fpu_features();
- }
-
if (cmdline_find_option_bool(boot_command_line, "no387"))
setup_clear_cpu_cap(X86_FEATURE_FPU);
@@ -375,14 +282,6 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
*/
fpu__init_cpu();
- /*
- * But don't leave CR0::TS set yet, as some of the FPU setup
- * methods depend on being able to execute FPU instructions
- * that will fault on a set TS, such as the FXSAVE in
- * fpu__init_system_mxcsr().
- */
- clts();
-
fpu__init_system_generic();
fpu__init_system_xstate_size_legacy();
fpu__init_system_xstate();
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index c114b132d121..b188b16841e3 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -5,6 +5,7 @@
#include <asm/fpu/signal.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/xstate.h>
+#include <linux/sched/task_stack.h>
/*
* The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a184c210efba..83c23c230b4c 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -340,11 +340,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
}
fpu->fpstate_active = 1;
- if (use_eager_fpu()) {
- preempt_disable();
- fpu__restore(fpu);
- preempt_enable();
- }
+ preempt_disable();
+ fpu__restore(fpu);
+ preempt_enable();
return err;
} else {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 095ef7ddd6ae..c24ac1efb12d 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -65,6 +65,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
@@ -73,9 +74,11 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
setup_clear_cpu_cap(X86_FEATURE_PKU);
setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ);
}
/*
@@ -703,8 +706,14 @@ void __init fpu__init_system_xstate(void)
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
+ if (!boot_cpu_has(X86_FEATURE_FPU)) {
+ pr_info("x86/fpu: No FPU detected\n");
+ return;
+ }
+
if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
- pr_info("x86/fpu: Legacy x87 FPU detected.\n");
+ pr_info("x86/fpu: x87 FPU will use %s\n",
+ boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE");
return;
}
@@ -890,15 +899,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
*/
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return -EINVAL;
- /*
- * For most XSAVE components, this would be an arduous task:
- * brining fpstate up to date with fpregs, updating fpstate,
- * then re-populating fpregs. But, for components that are
- * never lazily managed, we can just access the fpregs
- * directly. PKRU is never managed lazily, so we can just
- * manipulate it directly. Make sure it stays that way.
- */
- WARN_ON_ONCE(!use_eager_fpu());
/* Set the bits we need in PKRU: */
if (init_val & PKEY_DISABLE_ACCESS)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8639bb2ae058..cbd73eb42170 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -29,6 +29,12 @@
#include <asm/ftrace.h>
#include <asm/nops.h>
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && \
+ !defined(CC_USING_FENTRY) && \
+ !defined(CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE)
+# error The following combination is not supported: ((compiler missing -mfentry) || (CONFIG_X86_32 and !CONFIG_DYNAMIC_FTRACE)) && CONFIG_FUNCTION_GRAPH_TRACER && CONFIG_CC_OPTIMIZE_FOR_SIZE
+#endif
+
#ifdef CONFIG_DYNAMIC_FTRACE
int ftrace_arch_code_modify_prepare(void)
@@ -535,7 +541,7 @@ static void run_sync(void)
{
int enable_irqs = irqs_disabled();
- /* We may be called with interrupts disbled (on bootup). */
+ /* We may be called with interrupts disabled (on bootup). */
if (enable_irqs)
local_irq_enable();
on_each_cpu(do_sync_core, NULL, 1);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index f16c55bfc090..e5fb436a6548 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -49,3 +49,65 @@ asmlinkage __visible void __init i386_start_kernel(void)
start_kernel();
}
+
+/*
+ * Initialize page tables. This creates a PDE and a set of page
+ * tables, which are located immediately beyond __brk_base. The variable
+ * _brk_end is set up to point to the first "safe" location.
+ * Mappings are created both at virtual address 0 (identity mapping)
+ * and PAGE_OFFSET for up to _end.
+ *
+ * In PAE mode initial_page_table is statically defined to contain
+ * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
+ * entries). The identity mapping is handled by pointing two PGD entries
+ * to the first kernel PMD. Note the upper half of each PMD or PTE are
+ * always zero at this stage.
+ */
+void __init mk_early_pgtbl_32(void)
+{
+#ifdef __pa
+#undef __pa
+#endif
+#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET)
+ pte_t pte, *ptep;
+ int i;
+ unsigned long *ptr;
+ /* Enough space to fit pagetables for the low memory linear map */
+ const unsigned long limit = __pa(_end) +
+ (PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT);
+#ifdef CONFIG_X86_PAE
+ pmd_t pl2, *pl2p = (pmd_t *)__pa(initial_pg_pmd);
+#define SET_PL2(pl2, val) { (pl2).pmd = (val); }
+#else
+ pgd_t pl2, *pl2p = (pgd_t *)__pa(initial_page_table);
+#define SET_PL2(pl2, val) { (pl2).pgd = (val); }
+#endif
+
+ ptep = (pte_t *)__pa(__brk_base);
+ pte.pte = PTE_IDENT_ATTR;
+
+ while ((pte.pte & PTE_PFN_MASK) < limit) {
+
+ SET_PL2(pl2, (unsigned long)ptep | PDE_IDENT_ATTR);
+ *pl2p = pl2;
+#ifndef CONFIG_X86_PAE
+ /* Kernel PDE entry */
+ *(pl2p + ((PAGE_OFFSET >> PGDIR_SHIFT))) = pl2;
+#endif
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ *ptep = pte;
+ pte.pte += PAGE_SIZE;
+ ptep++;
+ }
+
+ pl2p++;
+ }
+
+ ptr = (unsigned long *)__pa(&max_pfn_mapped);
+ /* Can't use pte_pfn() since it's a call with CONFIG_PARAVIRT */
+ *ptr = (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+
+ ptr = (unsigned long *)__pa(&_brk_end);
+ *ptr = (unsigned long)ptep + PAGE_OFFSET;
+}
+
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 54a2372f5dbb..b5785c197e53 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -4,6 +4,7 @@
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
*/
+#define DISABLE_BRANCH_PROFILING
#include <linux/init.h>
#include <linux/linkage.h>
#include <linux/types.h>
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 2dabea46f039..1f85ee8f9439 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -24,6 +24,7 @@
#include <asm/nops.h>
#include <asm/bootparam.h>
#include <asm/export.h>
+#include <asm/pgtable_32.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -41,40 +42,8 @@
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
-/*
- * This is how much memory in addition to the memory covered up to
- * and including _end we need mapped initially.
- * We need:
- * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
- * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
- *
- * Modulo rounding, each megabyte assigned here requires a kilobyte of
- * memory, which is currently unreclaimed.
- *
- * This should be a multiple of a page.
- *
- * KERNEL_IMAGE_SIZE should be greater than pa(_end)
- * and small than max_low_pfn, otherwise will waste some page table entries
- */
-#if PTRS_PER_PMD > 1
-#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD)
-#else
-#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
-#endif
-
-/*
- * Number of possible pages in the lowmem region.
- *
- * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
- * gas warning about overflowing shift count when gas has been compiled
- * with only a host target support using a 32-bit type for internal
- * representation.
- */
-LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
-
-/* Enough space to fit pagetables for the low memory linear map */
-MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
+#define SIZEOF_PTREGS 17*4
/*
* Worst-case size of the kernel mapping we need to make:
@@ -158,109 +127,34 @@ ENTRY(startup_32)
call load_ucode_bsp
#endif
-/*
- * Initialize page tables. This creates a PDE and a set of page
- * tables, which are located immediately beyond __brk_base. The variable
- * _brk_end is set up to point to the first "safe" location.
- * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end.
- */
-#ifdef CONFIG_X86_PAE
-
- /*
- * In PAE mode initial_page_table is statically defined to contain
- * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
- * entries). The identity mapping is handled by pointing two PGD entries
- * to the first kernel PMD.
- *
- * Note the upper half of each PMD or PTE are always zero at this stage.
- */
-
-#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
-
- xorl %ebx,%ebx /* %ebx is kept at zero */
-
- movl $pa(__brk_base), %edi
- movl $pa(initial_pg_pmd), %edx
- movl $PTE_IDENT_ATTR, %eax
-10:
- leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
- movl %ecx,(%edx) /* Store PMD entry */
- /* Upper half already zero */
- addl $8,%edx
- movl $512,%ecx
-11:
- stosl
- xchgl %eax,%ebx
- stosl
- xchgl %eax,%ebx
- addl $0x1000,%eax
- loop 11b
-
- /*
- * End condition: we must map up to the end + MAPPING_BEYOND_END.
- */
- movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
- cmpl %ebp,%eax
- jb 10b
-1:
- addl $__PAGE_OFFSET, %edi
- movl %edi, pa(_brk_end)
- shrl $12, %eax
- movl %eax, pa(max_pfn_mapped)
+ /* Create early pagetables. */
+ call mk_early_pgtbl_32
/* Do early initialization of the fixmap area */
movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+#ifdef CONFIG_X86_PAE
+#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
-#else /* Not PAE */
-
-page_pde_offset = (__PAGE_OFFSET >> 20);
-
- movl $pa(__brk_base), %edi
- movl $pa(initial_page_table), %edx
- movl $PTE_IDENT_ATTR, %eax
-10:
- leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
- movl %ecx,(%edx) /* Store identity PDE entry */
- movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
- addl $4,%edx
- movl $1024, %ecx
-11:
- stosl
- addl $0x1000,%eax
- loop 11b
- /*
- * End condition: we must map up to the end + MAPPING_BEYOND_END.
- */
- movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
- cmpl %ebp,%eax
- jb 10b
- addl $__PAGE_OFFSET, %edi
- movl %edi, pa(_brk_end)
- shrl $12, %eax
- movl %eax, pa(max_pfn_mapped)
-
- /* Do early initialization of the fixmap area */
- movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+#else
movl %eax,pa(initial_page_table+0xffc)
#endif
#ifdef CONFIG_PARAVIRT
/* This is can only trip for a broken bootloader... */
cmpw $0x207, pa(boot_params + BP_version)
- jb default_entry
+ jb .Ldefault_entry
/* Paravirt-compatible boot parameters. Look to see what architecture
we're booting under. */
movl pa(boot_params + BP_hardware_subarch), %eax
cmpl $num_subarch_entries, %eax
- jae bad_subarch
+ jae .Lbad_subarch
movl pa(subarch_entries)(,%eax,4), %eax
subl $__PAGE_OFFSET, %eax
jmp *%eax
-bad_subarch:
+.Lbad_subarch:
WEAK(lguest_entry)
WEAK(xen_entry)
/* Unknown implementation; there's really
@@ -270,14 +164,14 @@ WEAK(xen_entry)
__INITDATA
subarch_entries:
- .long default_entry /* normal x86/PC */
+ .long .Ldefault_entry /* normal x86/PC */
.long lguest_entry /* lguest hypervisor */
.long xen_entry /* Xen hypervisor */
- .long default_entry /* Moorestown MID */
+ .long .Ldefault_entry /* Moorestown MID */
num_subarch_entries = (. - subarch_entries) / 4
.previous
#else
- jmp default_entry
+ jmp .Ldefault_entry
#endif /* CONFIG_PARAVIRT */
#ifdef CONFIG_HOTPLUG_CPU
@@ -289,7 +183,8 @@ num_subarch_entries = (. - subarch_entries) / 4
ENTRY(start_cpu0)
movl initial_stack, %ecx
movl %ecx, %esp
- jmp *(initial_code)
+ call *(initial_code)
+1: jmp 1b
ENDPROC(start_cpu0)
#endif
@@ -317,7 +212,7 @@ ENTRY(startup_32_smp)
call load_ucode_ap
#endif
-default_entry:
+.Ldefault_entry:
#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
X86_CR0_PG)
@@ -347,7 +242,7 @@ default_entry:
pushfl
popl %eax # get EFLAGS
testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set?
- jz enable_paging # hw disallowed setting of ID bit
+ jz .Lenable_paging # hw disallowed setting of ID bit
# which means no CPUID and no CR4
xorl %eax,%eax
@@ -357,13 +252,13 @@ default_entry:
movl $1,%eax
cpuid
andl $~1,%edx # Ignore CPUID.FPU
- jz enable_paging # No flags or only CPUID.FPU = no CR4
+ jz .Lenable_paging # No flags or only CPUID.FPU = no CR4
movl pa(mmu_cr4_features),%eax
movl %eax,%cr4
testb $X86_CR4_PAE, %al # check if PAE is enabled
- jz enable_paging
+ jz .Lenable_paging
/* Check if extended functions are implemented */
movl $0x80000000, %eax
@@ -371,7 +266,7 @@ default_entry:
/* Value must be in the range 0x80000001 to 0x8000ffff */
subl $0x80000001, %eax
cmpl $(0x8000ffff-0x80000001), %eax
- ja enable_paging
+ ja .Lenable_paging
/* Clear bogus XD_DISABLE bits */
call verify_cpu
@@ -380,7 +275,7 @@ default_entry:
cpuid
/* Execute Disable bit supported? */
btl $(X86_FEATURE_NX & 31), %edx
- jnc enable_paging
+ jnc .Lenable_paging
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
@@ -390,7 +285,7 @@ default_entry:
/* Make changes effective */
wrmsr
-enable_paging:
+.Lenable_paging:
/*
* Enable paging
@@ -419,7 +314,7 @@ enable_paging:
*/
movb $4,X86 # at least 486
cmpl $-1,X86_CPUID
- je is486
+ je .Lis486
/* get vendor info */
xorl %eax,%eax # call CPUID with 0 -> return vendor ID
@@ -430,7 +325,7 @@ enable_paging:
movl %ecx,X86_VENDOR_ID+8 # last 4 chars
orl %eax,%eax # do we have processor info as well?
- je is486
+ je .Lis486
movl $1,%eax # Use the CPUID instruction to get CPU type
cpuid
@@ -444,7 +339,7 @@ enable_paging:
movb %cl,X86_MASK
movl %edx,X86_CAPABILITY
-is486:
+.Lis486:
movl $0x50022,%ecx # set AM, WP, NE and MP
movl %cr0,%eax
andl $0x80000011,%eax # Save PG,PE,ET
@@ -470,8 +365,9 @@ is486:
xorl %eax,%eax # Clear LDT
lldt %ax
- pushl $0 # fake return address for unwinder
- jmp *(initial_code)
+ call *(initial_code)
+1: jmp 1b
+ENDPROC(startup_32_smp)
#include "verify_cpu.S"
@@ -662,6 +558,7 @@ ENTRY(setup_once_ref)
__PAGE_ALIGNED_BSS
.align PAGE_SIZE
#ifdef CONFIG_X86_PAE
+.globl initial_pg_pmd
initial_pg_pmd:
.fill 1024*KPMDS,4,0
#else
@@ -709,7 +606,12 @@ ENTRY(initial_page_table)
.data
.balign 4
ENTRY(initial_stack)
- .long init_thread_union+THREAD_SIZE
+ /*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+ * unwinder reliably detect the end of the stack.
+ */
+ .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \
+ TOP_OF_KERNEL_STACK_PADDING;
__INITRODATA
int_msg:
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b4421cc191b0..b467b14b03eb 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -66,13 +66,8 @@ startup_64:
* tables and then reload them.
*/
- /*
- * Setup stack for verify_cpu(). "-8" because initial_stack is defined
- * this way, see below. Our best guess is a NULL ptr for stack
- * termination heuristics and we don't want to break anything which
- * might depend on it (kgdb, ...).
- */
- leaq (__end_init_task - 8)(%rip), %rsp
+ /* Set up the stack for verify_cpu(), similar to initial_stack below */
+ leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
/* Sanitize CPU configuration */
call verify_cpu
@@ -117,20 +112,20 @@ startup_64:
movq %rdi, %rax
shrq $PGDIR_SHIFT, %rax
- leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx
+ leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
movq %rdx, 0(%rbx,%rax,8)
movq %rdx, 8(%rbx,%rax,8)
- addq $4096, %rdx
+ addq $PAGE_SIZE, %rdx
movq %rdi, %rax
shrq $PUD_SHIFT, %rax
andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, 4096(%rbx,%rax,8)
+ movq %rdx, PAGE_SIZE(%rbx,%rax,8)
incl %eax
andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, 4096(%rbx,%rax,8)
+ movq %rdx, PAGE_SIZE(%rbx,%rax,8)
- addq $8192, %rbx
+ addq $PAGE_SIZE * 2, %rbx
movq %rdi, %rax
shrq $PMD_SHIFT, %rdi
addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
@@ -147,6 +142,9 @@ startup_64:
decl %ecx
jnz 1b
+ test %rbp, %rbp
+ jz .Lskip_fixup
+
/*
* Fixup the kernel text+data virtual addresses. Note that
* we might write invalid pmds, when the kernel is relocated
@@ -154,9 +152,9 @@ startup_64:
* beyond _end.
*/
leaq level2_kernel_pgt(%rip), %rdi
- leaq 4096(%rdi), %r8
+ leaq PAGE_SIZE(%rdi), %r8
/* See if it is a valid page table entry */
-1: testb $1, 0(%rdi)
+1: testb $_PAGE_PRESENT, 0(%rdi)
jz 2f
addq %rbp, 0(%rdi)
/* Go to the next page */
@@ -167,6 +165,7 @@ startup_64:
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
+.Lskip_fixup:
movq $(early_level4_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
@@ -265,13 +264,17 @@ ENTRY(secondary_startup_64)
movl $MSR_GS_BASE,%ecx
movl initial_gs(%rip),%eax
movl initial_gs+4(%rip),%edx
- wrmsr
+ wrmsr
/* rsi is pointer to real mode structure with interesting info.
pass it to C */
movq %rsi, %rdi
-
- /* Finally jump to run C code and to be on real kernel address
+ jmp start_cpu
+ENDPROC(secondary_startup_64)
+
+ENTRY(start_cpu)
+ /*
+ * Jump to run C code and to be on a real kernel address.
* Since we are running on identity-mapped space we have to jump
* to the full 64bit address, this is only possible as indirect
* jump. In addition we need to ensure %cs is set so we make this
@@ -295,12 +298,14 @@ ENTRY(secondary_startup_64)
* REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
* address given in m16:64.
*/
- movq initial_code(%rip),%rax
- pushq $0 # fake return address to stop unwinder
+ pushq $.Lafter_lret # put return address on stack for unwinder
+ xorq %rbp, %rbp # clear frame pointer
+ movq initial_code(%rip), %rax
pushq $__KERNEL_CS # set correct cs
pushq %rax # target address in negative space
lretq
-ENDPROC(secondary_startup_64)
+.Lafter_lret:
+ENDPROC(start_cpu)
#include "verify_cpu.S"
@@ -308,15 +313,11 @@ ENDPROC(secondary_startup_64)
/*
* Boot CPU0 entry point. It's called from play_dead(). Everything has been set
* up already except stack. We just set up stack here. Then call
- * start_secondary().
+ * start_secondary() via start_cpu().
*/
ENTRY(start_cpu0)
- movq initial_stack(%rip),%rsp
- movq initial_code(%rip),%rax
- pushq $0 # fake return address to stop unwinder
- pushq $__KERNEL_CS # set correct cs
- pushq %rax # target address in negative space
- lretq
+ movq initial_stack(%rip), %rsp
+ jmp start_cpu
ENDPROC(start_cpu0)
#endif
@@ -328,7 +329,11 @@ ENDPROC(start_cpu0)
GLOBAL(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)
GLOBAL(initial_stack)
- .quad init_thread_union+THREAD_SIZE-8
+ /*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+ * unwinder reliably detect the end of the stack.
+ */
+ .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
__FINITDATA
bad_address:
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 274fab99169d..89ff7af2de50 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -352,8 +352,9 @@ static int hpet_resume(struct clock_event_device *evt, int timer)
} else {
struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+ irq_domain_deactivate_irq(irq_get_irq_data(hdev->irq));
irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
- disable_irq(hdev->irq);
+ disable_hardirq(hdev->irq);
irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
enable_irq(hdev->irq);
}
@@ -791,7 +792,7 @@ static union hpet_lock hpet __cacheline_aligned = {
{ .lock = __ARCH_SPIN_LOCK_UNLOCKED, },
};
-static cycle_t read_hpet(struct clocksource *cs)
+static u64 read_hpet(struct clocksource *cs)
{
unsigned long flags;
union hpet_lock old, new;
@@ -802,7 +803,7 @@ static cycle_t read_hpet(struct clocksource *cs)
* Read HPET directly if in NMI.
*/
if (in_nmi())
- return (cycle_t)hpet_readl(HPET_COUNTER);
+ return (u64)hpet_readl(HPET_COUNTER);
/*
* Read the current state of the lock and HPET value atomically.
@@ -821,7 +822,7 @@ static cycle_t read_hpet(struct clocksource *cs)
WRITE_ONCE(hpet.value, new.value);
arch_spin_unlock(&hpet.lock);
local_irq_restore(flags);
- return (cycle_t)new.value;
+ return (u64)new.value;
}
local_irq_restore(flags);
@@ -843,15 +844,15 @@ contended:
new.lockval = READ_ONCE(hpet.lockval);
} while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
- return (cycle_t)new.value;
+ return (u64)new.value;
}
#else
/*
* For UP or 32-bit.
*/
-static cycle_t read_hpet(struct clocksource *cs)
+static u64 read_hpet(struct clocksource *cs)
{
- return (cycle_t)hpet_readl(HPET_COUNTER);
+ return (u64)hpet_readl(HPET_COUNTER);
}
#endif
@@ -867,7 +868,7 @@ static struct clocksource clocksource_hpet = {
static int hpet_clocksource_register(void)
{
u64 start, now;
- cycle_t t1;
+ u64 t1;
/* Start the counter */
hpet_restart_counter();
@@ -1051,11 +1052,11 @@ static __init int hpet_late_init(void)
return 0;
/* This notifier should be called after workqueue is ready */
- ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "AP_X86_HPET_ONLINE",
+ ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online",
hpet_cpuhp_online, NULL);
if (ret)
return ret;
- ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "X86_HPET_DEAD", NULL,
+ ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "x86/hpet:dead", NULL,
hpet_cpuhp_dead);
if (ret)
goto err_cpuhp;
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 589b3193f102..9c3cf0944bce 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/kernel.h>
#include <linux/capability.h>
#include <linux/errno.h>
@@ -16,6 +17,7 @@
#include <linux/syscalls.h>
#include <linux/bitmap.h>
#include <asm/syscalls.h>
+#include <asm/desc.h>
/*
* this changes the io permissions bitmap in the current task.
@@ -45,6 +47,16 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
memset(bitmap, 0xff, IO_BITMAP_BYTES);
t->io_bitmap_ptr = bitmap;
set_thread_flag(TIF_IO_BITMAP);
+
+ /*
+ * Now that we have an IO bitmap, we need our TSS limit to be
+ * correct. It's fine if we are preempted after doing this:
+ * with TIF_IO_BITMAP set, context switches will keep our TSS
+ * limit correct.
+ */
+ preempt_disable();
+ refresh_tss_limit();
+ preempt_enable();
}
/*
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9f669fdd2010..4d8183b5f113 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -14,7 +14,6 @@
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/irq.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
#include <asm/desc.h>
@@ -265,7 +264,7 @@ void __smp_x86_platform_ipi(void)
x86_platform_ipi_callback();
}
-__visible void smp_x86_platform_ipi(struct pt_regs *regs)
+__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
@@ -316,7 +315,7 @@ __visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
}
#endif
-__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_x86_platform_ipi(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 9ebd0b0e73d9..3be74fbdeff2 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -15,8 +15,8 @@
#include <linux/ftrace.h>
#include <linux/uaccess.h>
#include <linux/smp.h>
+#include <linux/sched/task_stack.h>
#include <asm/io_apic.h>
-#include <asm/idle.h>
#include <asm/apic.h>
int sysctl_panic_on_stackoverflow;
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 3512ba607361..275487872be2 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -9,6 +9,7 @@
#include <linux/hardirq.h>
#include <asm/apic.h>
#include <asm/trace/irq_vectors.h>
+#include <linux/interrupt.h>
static inline void __smp_irq_work_interrupt(void)
{
@@ -16,14 +17,14 @@ static inline void __smp_irq_work_interrupt(void)
irq_work_run();
}
-__visible void smp_irq_work_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
__smp_irq_work_interrupt();
exiting_irq();
}
-__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
trace_irq_work_entry(IRQ_WORK_VECTOR);
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
new file mode 100644
index 000000000000..f73f475d0573
--- /dev/null
+++ b/arch/x86/kernel/itmt.c
@@ -0,0 +1,213 @@
+/*
+ * itmt.c: Support Intel Turbo Boost Max Technology 3.0
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT),
+ * the maximum turbo frequencies of some cores in a CPU package may be
+ * higher than for the other cores in the same package. In that case,
+ * better performance can be achieved by making the scheduler prefer
+ * to run tasks on the CPUs with higher max turbo frequencies.
+ *
+ * This file provides functions and data structures for enabling the
+ * scheduler to favor scheduling on cores can be boosted to a higher
+ * frequency under ITMT.
+ */
+
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sysctl.h>
+#include <linux/nodemask.h>
+
+static DEFINE_MUTEX(itmt_update_mutex);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+
+/* Boolean to track if system has ITMT capabilities */
+static bool __read_mostly sched_itmt_capable;
+
+/*
+ * Boolean to control whether we want to move processes to cpu capable
+ * of higher turbo frequency for cpus supporting Intel Turbo Boost Max
+ * Technology 3.0.
+ *
+ * It can be set via /proc/sys/kernel/sched_itmt_enabled
+ */
+unsigned int __read_mostly sysctl_sched_itmt_enabled;
+
+static int sched_itmt_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ unsigned int old_sysctl;
+ int ret;
+
+ mutex_lock(&itmt_update_mutex);
+
+ if (!sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return -EINVAL;
+ }
+
+ old_sysctl = sysctl_sched_itmt_enabled;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) {
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+
+ mutex_unlock(&itmt_update_mutex);
+
+ return ret;
+}
+
+static unsigned int zero;
+static unsigned int one = 1;
+static struct ctl_table itmt_kern_table[] = {
+ {
+ .procname = "sched_itmt_enabled",
+ .data = &sysctl_sched_itmt_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_itmt_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {}
+};
+
+static struct ctl_table itmt_root_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = itmt_kern_table,
+ },
+ {}
+};
+
+static struct ctl_table_header *itmt_sysctl_header;
+
+/**
+ * sched_set_itmt_support() - Indicate platform supports ITMT
+ *
+ * This function is used by the OS to indicate to scheduler that the platform
+ * is capable of supporting the ITMT feature.
+ *
+ * The current scheme has the pstate driver detects if the system
+ * is ITMT capable and call sched_set_itmt_support.
+ *
+ * This must be done only after sched_set_itmt_core_prio
+ * has been called to set the cpus' priorities.
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ *
+ * Return: 0 on success
+ */
+int sched_set_itmt_support(void)
+{
+ mutex_lock(&itmt_update_mutex);
+
+ if (sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return 0;
+ }
+
+ itmt_sysctl_header = register_sysctl_table(itmt_root_table);
+ if (!itmt_sysctl_header) {
+ mutex_unlock(&itmt_update_mutex);
+ return -ENOMEM;
+ }
+
+ sched_itmt_capable = true;
+
+ sysctl_sched_itmt_enabled = 1;
+
+ x86_topology_update = true;
+ rebuild_sched_domains();
+
+ mutex_unlock(&itmt_update_mutex);
+
+ return 0;
+}
+
+/**
+ * sched_clear_itmt_support() - Revoke platform's support of ITMT
+ *
+ * This function is used by the OS to indicate that it has
+ * revoked the platform's support of ITMT feature.
+ *
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ */
+void sched_clear_itmt_support(void)
+{
+ mutex_lock(&itmt_update_mutex);
+
+ if (!sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return;
+ }
+ sched_itmt_capable = false;
+
+ if (itmt_sysctl_header) {
+ unregister_sysctl_table(itmt_sysctl_header);
+ itmt_sysctl_header = NULL;
+ }
+
+ if (sysctl_sched_itmt_enabled) {
+ /* disable sched_itmt if we are no longer ITMT capable */
+ sysctl_sched_itmt_enabled = 0;
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+
+ mutex_unlock(&itmt_update_mutex);
+}
+
+int arch_asym_cpu_priority(int cpu)
+{
+ return per_cpu(sched_core_priority, cpu);
+}
+
+/**
+ * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
+ * @prio: Priority of cpu core
+ * @core_cpu: The cpu number associated with the core
+ *
+ * The pstate driver will find out the max boost frequency
+ * and call this function to set a priority proportional
+ * to the max boost frequency. CPU with higher boost
+ * frequency will receive higher priority.
+ *
+ * No need to rebuild sched domain after updating
+ * the CPU priorities. The sched domains have no
+ * dependency on CPU priorities.
+ */
+void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+ int cpu, i = 1;
+
+ for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
+ int smt_prio;
+
+ /*
+ * Ensure that the siblings are moved to the end
+ * of the priority chain and only used when
+ * all other high priority cpus are out of capacity.
+ */
+ smt_prio = prio * smp_num_siblings / i;
+ per_cpu(sched_core_priority, cpu) = smt_prio;
+ i++;
+ }
+}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index fc25f698d792..c37bd0f39c70 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -32,8 +32,7 @@ static void bug_at(unsigned char *ip, int line)
* Something went wrong. Crash the box, as something could be
* corrupting the kernel.
*/
- pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n",
- ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line);
+ pr_crit("jump_label: Fatal kernel bug, unexpected op at %pS [%p] (%5ph) %d\n", ip, ip, ip, line);
BUG();
}
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index bdb83e431d89..38b64587b31b 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -167,7 +167,7 @@ static int __init boot_params_kdebugfs_init(void)
struct dentry *dbp, *version, *data;
int error = -ENOMEM;
- dbp = debugfs_create_dir("boot_params", NULL);
+ dbp = debugfs_create_dir("boot_params", arch_debugfs_dir);
if (!dbp)
return -ENOMEM;
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 3407b148c240..d0a814a9d96a 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -331,17 +331,17 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
struct setup_header *header;
int setup_sects, kern16_size, ret = 0;
- unsigned long setup_header_size, params_cmdline_sz, params_misc_sz;
+ unsigned long setup_header_size, params_cmdline_sz;
struct boot_params *params;
unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
unsigned long purgatory_load_addr;
- unsigned long kernel_bufsz, kernel_memsz, kernel_align;
- char *kernel_buf;
struct bzimage64_data *ldata;
struct kexec_entry64_regs regs64;
void *stack;
unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+ struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX,
+ .top_down = true };
header = (struct setup_header *)(kernel + setup_hdr_offset);
setup_sects = header->setup_sects;
@@ -402,11 +402,11 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
MAX_ELFCOREHDR_STR_LEN;
params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
- params_misc_sz = params_cmdline_sz + efi_map_sz +
+ kbuf.bufsz = params_cmdline_sz + efi_map_sz +
sizeof(struct setup_data) +
sizeof(struct efi_setup_data);
- params = kzalloc(params_misc_sz, GFP_KERNEL);
+ params = kzalloc(kbuf.bufsz, GFP_KERNEL);
if (!params)
return ERR_PTR(-ENOMEM);
efi_map_offset = params_cmdline_sz;
@@ -418,37 +418,41 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
/* Is there a limit on setup header size? */
memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
- ret = kexec_add_buffer(image, (char *)params, params_misc_sz,
- params_misc_sz, 16, MIN_BOOTPARAM_ADDR,
- ULONG_MAX, 1, &bootparam_load_addr);
+ kbuf.buffer = params;
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = 16;
+ kbuf.buf_min = MIN_BOOTPARAM_ADDR;
+ ret = kexec_add_buffer(&kbuf);
if (ret)
goto out_free_params;
+ bootparam_load_addr = kbuf.mem;
pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- bootparam_load_addr, params_misc_sz, params_misc_sz);
+ bootparam_load_addr, kbuf.bufsz, kbuf.bufsz);
/* Load kernel */
- kernel_buf = kernel + kern16_size;
- kernel_bufsz = kernel_len - kern16_size;
- kernel_memsz = PAGE_ALIGN(header->init_size);
- kernel_align = header->kernel_alignment;
-
- ret = kexec_add_buffer(image, kernel_buf,
- kernel_bufsz, kernel_memsz, kernel_align,
- MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1,
- &kernel_load_addr);
+ kbuf.buffer = kernel + kern16_size;
+ kbuf.bufsz = kernel_len - kern16_size;
+ kbuf.memsz = PAGE_ALIGN(header->init_size);
+ kbuf.buf_align = header->kernel_alignment;
+ kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ ret = kexec_add_buffer(&kbuf);
if (ret)
goto out_free_params;
+ kernel_load_addr = kbuf.mem;
pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- kernel_load_addr, kernel_memsz, kernel_memsz);
+ kernel_load_addr, kbuf.bufsz, kbuf.memsz);
/* Load initrd high */
if (initrd) {
- ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len,
- PAGE_SIZE, MIN_INITRD_LOAD_ADDR,
- ULONG_MAX, 1, &initrd_load_addr);
+ kbuf.buffer = initrd;
+ kbuf.bufsz = kbuf.memsz = initrd_len;
+ kbuf.buf_align = PAGE_SIZE;
+ kbuf.buf_min = MIN_INITRD_LOAD_ADDR;
+ ret = kexec_add_buffer(&kbuf);
if (ret)
goto out_free_params;
+ initrd_load_addr = kbuf.mem;
pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
initrd_load_addr, initrd_len, initrd_len);
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index c6ee63f927ab..d688826e5736 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -67,7 +67,7 @@
#endif
/* Ensure if the instruction can be boostable */
-extern int can_boost(kprobe_opcode_t *instruction);
+extern int can_boost(kprobe_opcode_t *instruction, void *addr);
/* Recover instruction if given address is probed */
extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
unsigned long addr);
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index d9d8d16b69db..993fa4fe4f68 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -45,6 +45,7 @@
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
+#include <linux/sched/debug.h>
#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
@@ -56,7 +57,7 @@
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/debugreg.h>
@@ -166,12 +167,12 @@ NOKPROBE_SYMBOL(skip_prefixes);
* Returns non-zero if opcode is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
*/
-int can_boost(kprobe_opcode_t *opcodes)
+int can_boost(kprobe_opcode_t *opcodes, void *addr)
{
kprobe_opcode_t opcode;
kprobe_opcode_t *orig_opcodes = opcodes;
- if (search_exception_tables((unsigned long)opcodes))
+ if (search_exception_tables((unsigned long)addr))
return 0; /* Page fault may occur on this address. */
retry:
@@ -416,7 +417,7 @@ static int arch_copy_kprobe(struct kprobe *p)
* __copy_instruction can modify the displacement of the instruction,
* but it doesn't affect boostable check.
*/
- if (can_boost(p->ainsn.insn))
+ if (can_boost(p->ainsn.insn, p->addr))
p->ainsn.boostable = 0;
else
p->ainsn.boostable = -1;
@@ -745,7 +746,7 @@ __visible __used void *trampoline_handler(struct pt_regs *regs)
* will be the real return address, and all the rest will
* point to kretprobe_trampoline.
*/
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
+ hlist_for_each_entry(ri, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 3bb4c5f021f6..3e7c6e5a08ff 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -33,7 +33,7 @@
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/debugreg.h>
@@ -178,7 +178,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src)
while (len < RELATIVEJUMP_SIZE) {
ret = __copy_instruction(dest + len, src + len);
- if (!ret || !can_boost(dest + len))
+ if (!ret || !can_boost(dest + len, src + len))
return -EINVAL;
len += ret;
}
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index edbbfc854e39..14f65a5f938e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -42,7 +42,6 @@
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
-#include <asm/idle.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
@@ -267,13 +266,11 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
prev_state = exception_enter();
- exit_idle();
kvm_async_pf_task_wait((u32)read_cr2());
exception_exit(prev_state);
break;
case KVM_PV_REASON_PAGE_READY:
rcu_irq_enter();
- exit_idle();
kvm_async_pf_task_wake((u32)read_cr2());
rcu_irq_exit();
break;
@@ -308,7 +305,7 @@ static void kvm_register_steal_time(void)
static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
-static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
+static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
/**
* This relies on __test_and_clear_bit to modify the memory
@@ -319,7 +316,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
*/
if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
return;
- apic_write(APIC_EOI, APIC_EOI_ACK);
+ apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
}
static void kvm_guest_cpu_init(void)
@@ -592,6 +589,38 @@ out:
local_irq_restore(flags);
}
+#ifdef CONFIG_X86_32
+__visible bool __kvm_vcpu_is_preempted(long cpu)
+{
+ struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
+
+ return !!src->preempted;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
+#else
+
+#include <asm/asm-offsets.h>
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+
+/*
+ * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
+ * restoring to/from the stack.
+ */
+asm(
+".pushsection .text;"
+".global __raw_callee_save___kvm_vcpu_is_preempted;"
+".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+"__raw_callee_save___kvm_vcpu_is_preempted:"
+"movq __per_cpu_offset(,%rdi,8), %rax;"
+"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+"setne %al;"
+"ret;"
+".popsection");
+
+#endif
+
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -608,20 +637,11 @@ void __init kvm_spinlock_init(void)
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = kvm_wait;
pv_lock_ops.kick = kvm_kick_cpu;
-}
-static __init int kvm_spinlock_init_jump(void)
-{
- if (!kvm_para_available())
- return 0;
- if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
- return 0;
-
- static_key_slow_inc(&paravirt_ticketlocks_enabled);
- printk(KERN_INFO "KVM setup paravirtual spinlock\n");
-
- return 0;
+ if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ pv_lock_ops.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
+ }
}
-early_initcall(kvm_spinlock_init_jump);
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 60b9949f1e65..d88967659098 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -25,14 +25,16 @@
#include <linux/hardirq.h>
#include <linux/memblock.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
+#include <asm/kvmclock.h>
static int kvmclock __ro_after_init = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
-static cycle_t kvm_sched_clock_offset;
+static u64 kvm_sched_clock_offset;
static int parse_no_kvmclock(char *arg)
{
@@ -49,6 +51,7 @@ struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
{
return hv_clock;
}
+EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va);
/*
* The wallclock is the time of day when we booted. Since then, some time may
@@ -79,10 +82,10 @@ static int kvm_set_wallclock(const struct timespec *now)
return -1;
}
-static cycle_t kvm_clock_read(void)
+static u64 kvm_clock_read(void)
{
struct pvclock_vcpu_time_info *src;
- cycle_t ret;
+ u64 ret;
int cpu;
preempt_disable_notrace();
@@ -93,12 +96,12 @@ static cycle_t kvm_clock_read(void)
return ret;
}
-static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
+static u64 kvm_clock_get_cycles(struct clocksource *cs)
{
return kvm_clock_read();
}
-static cycle_t kvm_sched_clock_read(void)
+static u64 kvm_sched_clock_read(void)
{
return kvm_clock_read() - kvm_sched_clock_offset;
}
@@ -107,12 +110,12 @@ static inline void kvm_sched_clock_init(bool stable)
{
if (!stable) {
pv_time_ops.sched_clock = kvm_clock_read;
+ clear_sched_clock_stable();
return;
}
kvm_sched_clock_offset = kvm_clock_read();
pv_time_ops.sched_clock = kvm_sched_clock_read;
- set_sched_clock_stable();
printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
kvm_sched_clock_offset);
@@ -174,13 +177,14 @@ bool kvm_check_and_clear_guest_paused(void)
return ret;
}
-static struct clocksource kvm_clock = {
+struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_get_cycles,
.rating = 400,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
+EXPORT_SYMBOL_GPL(kvm_clock);
int kvm_register_clock(char *txt)
{
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 6707039b9032..d4a15831ac58 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -34,10 +34,10 @@ static void flush_ldt(void *current_mm)
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
-static struct ldt_struct *alloc_ldt_struct(int size)
+static struct ldt_struct *alloc_ldt_struct(unsigned int size)
{
struct ldt_struct *new_ldt;
- int alloc_size;
+ unsigned int alloc_size;
if (size > LDT_ENTRIES)
return NULL;
@@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
paravirt_free_ldt(ldt->entries, ldt->size);
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(ldt->entries);
+ vfree_atomic(ldt->entries);
else
free_page((unsigned long)ldt->entries);
kfree(ldt);
@@ -207,11 +207,11 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount)
static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
+ struct ldt_struct *new_ldt, *old_ldt;
+ unsigned int oldsize, newsize;
+ struct user_desc ldt_info;
struct desc_struct ldt;
int error;
- struct user_desc ldt_info;
- int oldsize, newsize;
- struct ldt_struct *new_ldt, *old_ldt;
error = -EINVAL;
if (bytecount != sizeof(ldt_info))
@@ -249,7 +249,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
old_ldt = mm->context.ldt;
oldsize = old_ldt ? old_ldt->size : 0;
- newsize = max((int)(ldt_info.entry_number + 1), oldsize);
+ newsize = max(ldt_info.entry_number + 1, oldsize);
error = -ENOMEM;
new_ldt = alloc_ldt_struct(newsize);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 8c1f218926d7..857cdbd02867 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -194,19 +194,22 @@ static int arch_update_purgatory(struct kimage *image)
/* Setup copying of backup region */
if (image->type == KEXEC_TYPE_CRASH) {
- ret = kexec_purgatory_get_set_symbol(image, "backup_dest",
+ ret = kexec_purgatory_get_set_symbol(image,
+ "purgatory_backup_dest",
&image->arch.backup_load_addr,
sizeof(image->arch.backup_load_addr), 0);
if (ret)
return ret;
- ret = kexec_purgatory_get_set_symbol(image, "backup_src",
+ ret = kexec_purgatory_get_set_symbol(image,
+ "purgatory_backup_src",
&image->arch.backup_src_start,
sizeof(image->arch.backup_src_start), 0);
if (ret)
return ret;
- ret = kexec_purgatory_get_set_symbol(image, "backup_sz",
+ ret = kexec_purgatory_get_set_symbol(image,
+ "purgatory_backup_sz",
&image->arch.backup_src_sz,
sizeof(image->arch.backup_src_sz), 0);
if (ret)
@@ -328,7 +331,7 @@ void machine_kexec(struct kimage *image)
void arch_crash_save_vmcoreinfo(void)
{
- VMCOREINFO_SYMBOL(phys_base);
+ VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_level4_pgt);
#ifdef CONFIG_NUMA
@@ -337,9 +340,7 @@ void arch_crash_save_vmcoreinfo(void)
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
- VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET);
- VMCOREINFO_VMALLOC_START(VMALLOC_START);
- VMCOREINFO_VMEMMAP_START(VMEMMAP_START);
+ VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7f3550acde1b..ef688804f80d 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -44,6 +44,7 @@
#include <asm/msr.h>
static struct class *msr_class;
+static enum cpuhp_state cpuhp_msr_state;
static ssize_t msr_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -180,7 +181,7 @@ static const struct file_operations msr_fops = {
.compat_ioctl = msr_ioctl,
};
-static int msr_device_create(int cpu)
+static int msr_device_create(unsigned int cpu)
{
struct device *dev;
@@ -189,34 +190,12 @@ static int msr_device_create(int cpu)
return PTR_ERR_OR_ZERO(dev);
}
-static void msr_device_destroy(int cpu)
+static int msr_device_destroy(unsigned int cpu)
{
device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
+ return 0;
}
-static int msr_class_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- int err = 0;
-
- switch (action) {
- case CPU_UP_PREPARE:
- err = msr_device_create(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- msr_device_destroy(cpu);
- break;
- }
- return notifier_from_errno(err);
-}
-
-static struct notifier_block __refdata msr_class_cpu_notifier = {
- .notifier_call = msr_class_cpu_callback,
-};
-
static char *msr_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
@@ -224,13 +203,11 @@ static char *msr_devnode(struct device *dev, umode_t *mode)
static int __init msr_init(void)
{
- int i, err = 0;
- i = 0;
+ int err;
if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
pr_err("unable to get major %d for msr\n", MSR_MAJOR);
- err = -EBUSY;
- goto out;
+ return -EBUSY;
}
msr_class = class_create(THIS_MODULE, "msr");
if (IS_ERR(msr_class)) {
@@ -239,44 +216,27 @@ static int __init msr_init(void)
}
msr_class->devnode = msr_devnode;
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- err = msr_device_create(i);
- if (err != 0)
- goto out_class;
- }
- __register_hotcpu_notifier(&msr_class_cpu_notifier);
- cpu_notifier_register_done();
-
- err = 0;
- goto out;
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online",
+ msr_device_create, msr_device_destroy);
+ if (err < 0)
+ goto out_class;
+ cpuhp_msr_state = err;
+ return 0;
out_class:
- i = 0;
- for_each_online_cpu(i)
- msr_device_destroy(i);
- cpu_notifier_register_done();
class_destroy(msr_class);
out_chrdev:
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
-out:
return err;
}
+module_init(msr_init);
static void __exit msr_exit(void)
{
- int cpu = 0;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- msr_device_destroy(cpu);
+ cpuhp_remove_state(cpuhp_msr_state);
class_destroy(msr_class);
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
- __unregister_hotcpu_notifier(&msr_class_cpu_notifier);
- cpu_notifier_register_done();
}
-
-module_init(msr_init);
module_exit(msr_exit)
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index bfe4d6c96fbd..a723ae9440ab 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -13,6 +13,7 @@
#include <linux/spinlock.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
+#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/debugfs.h>
#include <linux/delay.h>
@@ -20,6 +21,7 @@
#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/sched/clock.h>
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
@@ -164,11 +166,9 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
spin_lock_irqsave(&desc->lock, flags);
/*
- * most handlers of type NMI_UNKNOWN never return because
- * they just assume the NMI is theirs. Just a sanity check
- * to manage expectations
+ * Indicate if there are multiple registrations on the
+ * internal NMI handler call chains (SERR and IO_CHECK).
*/
- WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 2c55a003b793..8f2d1c9d43a8 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -12,7 +12,6 @@ __visible void __native_queued_spin_unlock(struct qspinlock *lock)
{
native_queued_spin_unlock(lock);
}
-
PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
bool pv_is_native_spin_unlock(void)
@@ -21,15 +20,25 @@ bool pv_is_native_spin_unlock(void)
__raw_callee_save___native_queued_spin_unlock;
}
+__visible bool __native_vcpu_is_preempted(long cpu)
+{
+ return false;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
+
+bool pv_is_native_vcpu_is_preempted(void)
+{
+ return pv_lock_ops.vcpu_is_preempted.func ==
+ __raw_callee_save___native_vcpu_is_preempted;
+}
+
struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP
.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
.wait = paravirt_nop,
.kick = paravirt_nop,
+ .vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted),
#endif /* SMP */
};
EXPORT_SYMBOL(pv_lock_ops);
-
-struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE;
-EXPORT_SYMBOL(paravirt_ticketlocks_enabled);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bbf3d5933eaa..4797e87b0fb6 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -328,7 +328,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.cpuid = native_cpuid,
.get_debugreg = native_get_debugreg,
.set_debugreg = native_set_debugreg,
- .clts = native_clts,
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
.read_cr4 = native_read_cr4,
@@ -426,6 +425,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.pmd_clear = native_pmd_clear,
#endif
.set_pud = native_set_pud,
+ .set_pud_at = native_set_pud_at,
.pmd_val = PTE_IDENT,
.make_pmd = PTE_IDENT,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 920c6ae08592..553acbbb4d32 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -8,10 +8,10 @@ DEF_NATIVE(pv_cpu_ops, iret, "iret");
DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
#endif
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
@@ -27,6 +27,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
}
extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
@@ -48,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_cpu_ops, clts);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
@@ -56,9 +56,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
end = end_pv_lock_ops_queued_spin_unlock;
goto patch_site;
}
+ goto patch_default;
+
+ case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+ if (pv_is_native_vcpu_is_preempted()) {
+ start = start_pv_lock_ops_vcpu_is_preempted;
+ end = end_pv_lock_ops_vcpu_is_preempted;
+ goto patch_site;
+ }
+ goto patch_default;
#endif
default:
+patch_default: __maybe_unused
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index bb3840cedb4f..11aaf1eaa0e4 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -21,6 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax");
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax");
#endif
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
@@ -36,6 +36,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
}
extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
@@ -58,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
@@ -68,9 +68,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
end = end_pv_lock_ops_queued_spin_unlock;
goto patch_site;
}
+ goto patch_default;
+
+ case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+ if (pv_is_native_vcpu_is_preempted()) {
+ start = start_pv_lock_ops_vcpu_is_preempted;
+ end = end_pv_lock_ops_vcpu_is_preempted;
+ goto patch_site;
+ }
+ goto patch_default;
#endif
default:
+patch_default: __maybe_unused
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break;
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 5d400ba1349d..0c150c06fa5a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -296,7 +296,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
/* were we called with bad_dma_address? */
badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
- if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
+ if (unlikely(dma_addr < badend)) {
WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
"address 0x%Lx\n", dma_addr);
return;
@@ -478,7 +478,7 @@ static void calgary_free_coherent(struct device *dev, size_t size,
free_pages((unsigned long)vaddr, get_order(size));
}
-static struct dma_map_ops calgary_dma_ops = {
+static const struct dma_map_ops calgary_dma_ops = {
.alloc = calgary_alloc_coherent,
.free = calgary_free_coherent,
.map_sg = calgary_map_sg,
@@ -1177,7 +1177,7 @@ static int __init calgary_init(void)
tbl = find_iommu_table(&dev->dev);
if (translation_enabled(tbl))
- dev->dev.archdata.dma_ops = &calgary_dma_ops;
+ dev->dev.dma_ops = &calgary_dma_ops;
}
return ret;
@@ -1201,7 +1201,7 @@ error:
calgary_disable_translation(dev);
calgary_free_bus(dev);
pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
- dev->dev.archdata.dma_ops = NULL;
+ dev->dev.dma_ops = NULL;
} while (1);
return ret;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d30c37750765..3a216ec869cd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -17,7 +17,7 @@
static int forbid_dac __read_mostly;
-struct dma_map_ops *dma_ops = &nommu_dma_ops;
+const struct dma_map_ops *dma_ops = &nommu_dma_ops;
EXPORT_SYMBOL(dma_ops);
static int iommu_sac_force __read_mostly;
@@ -91,7 +91,8 @@ again:
page = NULL;
/* CMA can be used only in the context which permits sleeping */
if (gfpflags_allow_blocking(flag)) {
- page = dma_alloc_from_contiguous(dev, count, get_order(size));
+ page = dma_alloc_from_contiguous(dev, count, get_order(size),
+ flag);
if (page && page_to_phys(page) + size > dma_mask) {
dma_release_from_contiguous(dev, page, count);
page = NULL;
@@ -214,7 +215,7 @@ early_param("iommu", iommu_setup);
int dma_supported(struct device *dev, u64 mask)
{
- struct dma_map_ops *ops = get_dma_ops(dev);
+ const struct dma_map_ops *ops = get_dma_ops(dev);
#ifdef CONFIG_PCI
if (mask > 0xffffffff && forbid_dac > 0) {
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 00e71ce396a8..a88952ef371c 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -88,7 +88,7 @@ static void nommu_sync_sg_for_device(struct device *dev,
flush_write_buffers();
}
-struct dma_map_ops nommu_dma_ops = {
+const struct dma_map_ops nommu_dma_ops = {
.alloc = dma_generic_alloc_coherent,
.free = dma_generic_free_coherent,
.map_sg = nommu_map_sg,
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index b47edb8f5256..1e23577e17cf 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -45,7 +45,7 @@ void x86_swiotlb_free_coherent(struct device *dev, size_t size,
dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
}
-static struct dma_map_ops swiotlb_dma_ops = {
+static const struct dma_map_ops swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
.alloc = x86_swiotlb_alloc_coherent,
.free = x86_swiotlb_free_coherent,
@@ -68,12 +68,10 @@ static struct dma_map_ops swiotlb_dma_ops = {
*/
int __init pci_swiotlb_detect_override(void)
{
- int use_swiotlb = swiotlb | swiotlb_force;
-
- if (swiotlb_force)
+ if (swiotlb_force == SWIOTLB_FORCE)
swiotlb = 1;
- return use_swiotlb;
+ return swiotlb;
}
IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
pci_xen_swiotlb_detect,
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index da8cb987b973..587d887f7f17 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -1,6 +1,7 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/perf_event.h>
#include <linux/bug.h>
#include <linux/stddef.h>
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index 24a50301f150..91271122f0df 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -6,6 +6,7 @@
void __init x86_early_init_platform_quirks(void)
{
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT;
x86_platform.legacy.rtc = 1;
x86_platform.legacy.reserve_bios_regions = 0;
x86_platform.legacy.devices.pnpbios = 1;
@@ -16,10 +17,14 @@ void __init x86_early_init_platform_quirks(void)
break;
case X86_SUBARCH_XEN:
case X86_SUBARCH_LGUEST:
+ x86_platform.legacy.devices.pnpbios = 0;
+ x86_platform.legacy.rtc = 0;
+ break;
case X86_SUBARCH_INTEL_MID:
case X86_SUBARCH_CE4100:
x86_platform.legacy.devices.pnpbios = 0;
x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
break;
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0888a879120f..f67591561711 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,6 +7,10 @@
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/idle.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pm.h>
@@ -23,8 +27,7 @@
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
-#include <asm/idle.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
@@ -33,6 +36,7 @@
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
+#include <asm/desc.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -65,22 +69,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);
-#ifdef CONFIG_X86_64
-static DEFINE_PER_CPU(unsigned char, is_idle);
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
- atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
- atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
-#endif
+DEFINE_PER_CPU(bool, __tss_limit_invalid);
+EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
/*
* this gets called so that we can store lazy state into memory and copy the
@@ -227,6 +217,12 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
*/
memcpy(tss->io_bitmap, next->io_bitmap_ptr,
max(prev->io_bitmap_max, next->io_bitmap_max));
+
+ /*
+ * Make sure that the TSS limit is correct for the CPU
+ * to notice the IO bitmap.
+ */
+ refresh_tss_limit();
} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
/*
* Clear any possible leftover bits:
@@ -251,39 +247,10 @@ static inline void play_dead(void)
}
#endif
-#ifdef CONFIG_X86_64
-void enter_idle(void)
-{
- this_cpu_write(is_idle, 1);
- atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
-}
-
-static void __exit_idle(void)
-{
- if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
- return;
- atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
-}
-
-/* Called from interrupts to signify idle end */
-void exit_idle(void)
-{
- /* idle loop has pid 0 */
- if (current->pid)
- return;
- __exit_idle();
-}
-#endif
-
void arch_cpu_idle_enter(void)
{
+ tsc_verify_tsc_adjust(false);
local_touch_nmi();
- enter_idle();
-}
-
-void arch_cpu_idle_exit(void)
-{
- __exit_idle();
}
void arch_cpu_idle_dead(void)
@@ -336,59 +303,33 @@ void stop_this_cpu(void *dummy)
halt();
}
-bool amd_e400_c1e_detected;
-EXPORT_SYMBOL(amd_e400_c1e_detected);
-
-static cpumask_var_t amd_e400_c1e_mask;
-
-void amd_e400_remove_cpu(int cpu)
-{
- if (amd_e400_c1e_mask != NULL)
- cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
-}
-
/*
- * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
- * pending message MSR. If we detect C1E, then we handle it the same
- * way as C3 power states (local apic timer and TSC stop)
+ * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
+ * states (local apic timer and TSC stop).
*/
static void amd_e400_idle(void)
{
- if (!amd_e400_c1e_detected) {
- u32 lo, hi;
-
- rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
-
- if (lo & K8_INTP_C1E_ACTIVE_MASK) {
- amd_e400_c1e_detected = true;
- if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
- mark_tsc_unstable("TSC halt in AMD C1E");
- pr_info("System has AMD C1E enabled\n");
- }
+ /*
+ * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
+ * gets set after static_cpu_has() places have been converted via
+ * alternatives.
+ */
+ if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+ default_idle();
+ return;
}
- if (amd_e400_c1e_detected) {
- int cpu = smp_processor_id();
-
- if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
- cpumask_set_cpu(cpu, amd_e400_c1e_mask);
- /* Force broadcast so ACPI can not interfere. */
- tick_broadcast_force();
- pr_info("Switch to broadcast mode on CPU%d\n", cpu);
- }
- tick_broadcast_enter();
+ tick_broadcast_enter();
- default_idle();
+ default_idle();
- /*
- * The switch back from broadcast mode needs to be
- * called with interrupts disabled.
- */
- local_irq_disable();
- tick_broadcast_exit();
- local_irq_enable();
- } else
- default_idle();
+ /*
+ * The switch back from broadcast mode needs to be called with
+ * interrupts disabled.
+ */
+ local_irq_disable();
+ tick_broadcast_exit();
+ local_irq_enable();
}
/*
@@ -448,8 +389,7 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
if (x86_idle || boot_option_idle_override == IDLE_POLL)
return;
- if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
- /* E400: APIC timer interrupt does not wake up CPU from C1e */
+ if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
pr_info("using AMD E400 aware idle routine\n");
x86_idle = amd_e400_idle;
} else if (prefer_mwait_c1_over_halt(c)) {
@@ -459,11 +399,37 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
x86_idle = default_idle;
}
-void __init init_amd_e400_c1e_mask(void)
+void amd_e400_c1e_apic_setup(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+ pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
+ local_irq_disable();
+ tick_broadcast_force();
+ local_irq_enable();
+ }
+}
+
+void __init arch_post_acpi_subsys_init(void)
{
- /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
- if (x86_idle == amd_e400_idle)
- zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
+ u32 lo, hi;
+
+ if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
+ return;
+
+ /*
+ * AMD E400 detection needs to happen after ACPI has been enabled. If
+ * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in
+ * MSR_K8_INT_PENDING_MSG.
+ */
+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+ if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
+ return;
+
+ boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);
+
+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+ mark_tsc_unstable("TSC halt in AMD C1E");
+ pr_info("System has AMD C1E enabled\n");
}
static int __init idle_setup(char *str)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bd7be8efdc4c..4c818f8bc135 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -12,6 +12,8 @@
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
@@ -49,11 +51,11 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
-#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/vm86.h>
+#include <asm/intel_rdt.h>
void __show_regs(struct pt_regs *regs, int all)
{
@@ -72,10 +74,9 @@ void __show_regs(struct pt_regs *regs, int all)
savesegment(gs, gs);
}
- printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
- (u16)regs->cs, regs->ip, regs->flags,
- smp_processor_id());
- print_symbol("EIP is at %s\n", regs->ip);
+ printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
+ printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
+ smp_processor_id());
printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
@@ -232,11 +233,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
- fpu_switch_t fpu_switch;
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
- fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
+ switch_fpu_prepare(prev_fpu, cpu);
/*
* Save away %gs. No need to save %fs, as it was saved on the
@@ -295,9 +295,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (prev->gs | next->gs)
lazy_load_gs(next->gs);
- switch_fpu_finish(next_fpu, fpu_switch);
+ switch_fpu_finish(next_fpu, cpu);
this_cpu_write(current_task, next_p);
+ /* Load the Intel cache allocation PQR MSR. */
+ intel_rdt_sched_in();
+
return prev_p;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3760b3c1ca0..d6b784a5520d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -17,6 +17,8 @@
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
@@ -44,12 +46,12 @@
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
-#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
+#include <asm/intel_rdt.h>
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
@@ -61,10 +63,15 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
- printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
- printk_address(regs->ip);
- printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
- regs->sp, regs->flags);
+ printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
+ (void *)regs->ip);
+ printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
+ regs->sp, regs->flags);
+ if (regs->orig_ax != -1)
+ pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
+ else
+ pr_cont("\n");
+
printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->ax, regs->bx, regs->cx);
printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
@@ -265,9 +272,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
unsigned prev_fsindex, prev_gsindex;
- fpu_switch_t fpu_switch;
- fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
+ switch_fpu_prepare(prev_fpu, cpu);
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
@@ -417,7 +423,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
prev->gsbase = 0;
prev->gsindex = prev_gsindex;
- switch_fpu_finish(next_fpu, fpu_switch);
+ switch_fpu_finish(next_fpu, cpu);
/*
* Switch the PDA and FPU contexts.
@@ -473,6 +479,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
loadsegment(ss, __KERNEL_DS);
}
+ /* Load the Intel cache allocation PQR MSR. */
+ intel_rdt_sched_in();
+
return prev_p;
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0e63c0267f99..2364b23ea3e5 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -6,6 +6,7 @@
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
@@ -24,7 +25,7 @@
#include <linux/export.h>
#include <linux/context_tracking.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/fpu/internal.h>
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 5b2cc889ce34..5c3f6d6a5078 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -21,6 +21,8 @@
#include <linux/sched.h>
#include <linux/gfp.h>
#include <linux/bootmem.h>
+#include <linux/nmi.h>
+
#include <asm/fixmap.h>
#include <asm/pvclock.h>
@@ -71,10 +73,10 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
return flags & valid_flags;
}
-cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
unsigned version;
- cycle_t ret;
+ u64 ret;
u64 last;
u8 flags;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e244c19a2451..067f9813fd2c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -223,6 +223,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
},
},
+ { /* Handle problems with rebooting on ASUS EeeBook X205TA */
+ .callback = set_acpi_reboot,
+ .ident = "ASUS EeeBook X205TA",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "X205TA"),
+ },
+ },
+ { /* Handle problems with rebooting on ASUS EeeBook X205TAW */
+ .callback = set_acpi_reboot,
+ .ident = "ASUS EeeBook X205TAW",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "X205TAW"),
+ },
+ },
/* Certec */
{ /* Handle problems with rebooting on Certec BPC600 */
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 79c6311cd912..5b21cb7d84d6 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -64,6 +64,15 @@ void mach_get_cmos_time(struct timespec *now)
unsigned int status, year, mon, day, hour, min, sec, century = 0;
unsigned long flags;
+ /*
+ * If pm_trace abused the RTC as storage, set the timespec to 0,
+ * which tells the caller that this RTC value is unusable.
+ */
+ if (!pm_trace_rtc_valid()) {
+ now->tv_sec = now->tv_nsec = 0;
+ return;
+ }
+
spin_lock_irqsave(&rtc_lock, flags);
/*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9c337b0e8ba7..4bf0c8926a1c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -575,7 +575,9 @@ static void __init reserve_crashkernel(void)
/* 0 means: find the address automatically */
if (crash_base <= 0) {
/*
- * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
+ * Set CRASH_ADDR_LOW_MAX upper bound for crash memory,
+ * as old kexec-tools loads bzImage below that, unless
+ * "crashkernel=size[KMG],high" is specified.
*/
crash_base = memblock_find_in_range(CRASH_ALIGN,
high ? CRASH_ADDR_HIGH_MAX
@@ -985,6 +987,30 @@ void __init setup_arch(char **cmdline_p)
parse_early_param();
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Memory used by the kernel cannot be hot-removed because Linux
+ * cannot migrate the kernel pages. When memory hotplug is
+ * enabled, we should prevent memblock from allocating memory
+ * for the kernel.
+ *
+ * ACPI SRAT records all hotpluggable memory ranges. But before
+ * SRAT is parsed, we don't know about it.
+ *
+ * The kernel image is loaded into memory at very early time. We
+ * cannot prevent this anyway. So on NUMA system, we set any
+ * node the kernel resides in as un-hotpluggable.
+ *
+ * Since on modern servers, one node could have double-digit
+ * gigabytes memory, we can assume the memory around the kernel
+ * image is also un-hotpluggable. So before SRAT is parsed, just
+ * allocate memory near the kernel image to try the best to keep
+ * the kernel away from hotpluggable memory.
+ */
+ if (movable_node_is_enabled())
+ memblock_set_bottom_up(true);
+#endif
+
x86_report_nx();
/* after early param, so could get panic from serial */
@@ -1152,6 +1178,20 @@ void __init setup_arch(char **cmdline_p)
/* Allocate bigger log buffer */
setup_log_buf(1);
+ if (efi_enabled(EFI_BOOT)) {
+ switch (boot_params.secure_boot) {
+ case efi_secureboot_mode_disabled:
+ pr_info("Secure boot disabled\n");
+ break;
+ case efi_secureboot_mode_enabled:
+ pr_info("Secure boot enabled\n");
+ break;
+ default:
+ pr_info("Secure boot could not be determined\n");
+ break;
+ }
+ }
+
reserve_initrd();
acpi_table_upgrade();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2bbd27f89802..9820d6d977c6 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,7 +1,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/percpu.h>
@@ -12,6 +12,7 @@
#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
+#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 763af1d0de64..cc30a74e4adb 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -10,6 +10,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/kernel.h>
@@ -845,7 +846,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
me->comm, me->pid, where, frame,
regs->ip, regs->sp, regs->orig_ax);
- print_vma_addr(" in ", regs->ip);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
pr_cont("\n");
}
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index ec1f756f9dc9..71beb28600d4 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -151,8 +151,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
if (from->si_signo == SIGSEGV) {
if (from->si_code == SEGV_BNDERR) {
- compat_uptr_t lower = (unsigned long)&to->si_lower;
- compat_uptr_t upper = (unsigned long)&to->si_upper;
+ compat_uptr_t lower = (unsigned long)from->si_lower;
+ compat_uptr_t upper = (unsigned long)from->si_upper;
put_user_ex(lower, &to->si_lower);
put_user_ex(upper, &to->si_upper);
}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index c00cb64bc0a1..d3c66a15bbde 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -259,18 +259,16 @@ static inline void __smp_reschedule_interrupt(void)
scheduler_ipi();
}
-__visible void smp_reschedule_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
{
- irq_enter();
ack_APIC_irq();
__smp_reschedule_interrupt();
- irq_exit();
/*
* KVM uses this interrupt to force a cpu out of guest mode
*/
}
-__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_trace_reschedule_interrupt(struct pt_regs *regs)
{
/*
* Need to call irq_enter() before calling the trace point.
@@ -294,14 +292,15 @@ static inline void __smp_call_function_interrupt(void)
inc_irq_stat(irq_call_count);
}
-__visible void smp_call_function_interrupt(struct pt_regs *regs)
+__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
__smp_call_function_interrupt();
exiting_irq();
}
-__visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_trace_call_function_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
trace_call_function_entry(CALL_FUNCTION_VECTOR);
@@ -316,14 +315,16 @@ static inline void __smp_call_function_single_interrupt(void)
inc_irq_stat(irq_call_count);
}
-__visible void smp_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_call_function_single_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
__smp_call_function_single_interrupt();
exiting_irq();
}
-__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
+__visible void __irq_entry
+smp_trace_call_function_single_interrupt(struct pt_regs *regs)
{
ipi_entering_ack_irq();
trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 42f5eb7b4f6c..bd1f1ad35284 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -45,6 +45,9 @@
#include <linux/smp.h>
#include <linux/export.h>
#include <linux/sched.h>
+#include <linux/sched/topology.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/task_stack.h>
#include <linux/percpu.h>
#include <linux/bootmem.h>
#include <linux/err.h>
@@ -58,7 +61,6 @@
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
-#include <asm/idle.h>
#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
@@ -104,11 +106,21 @@ static unsigned int max_physical_pkg_id __read_mostly;
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
static unsigned int logical_packages __read_mostly;
-static bool logical_packages_frozen __read_mostly;
/* Maximum number of SMT threads on any online core */
int __max_smt_threads __read_mostly;
+/* Flag to indicate if a complete sched domain rebuild is required */
+bool x86_topology_update;
+
+int arch_update_cpu_topology(void)
+{
+ int retval = x86_topology_update;
+
+ x86_topology_update = false;
+ return retval;
+}
+
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
unsigned long flags;
@@ -263,9 +275,14 @@ static void notrace start_secondary(void *unused)
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
-int topology_update_package_map(unsigned int apicid, unsigned int cpu)
+/**
+ * topology_update_package_map - Update the physical to logical package map
+ * @pkg: The physical package id as retrieved via CPUID
+ * @cpu: The cpu for which this is updated
+ */
+int topology_update_package_map(unsigned int pkg, unsigned int cpu)
{
- unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits;
+ unsigned int new;
/* Called from early boot ? */
if (!physical_package_map)
@@ -278,16 +295,17 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu)
if (test_and_set_bit(pkg, physical_package_map))
goto found;
- if (logical_packages_frozen) {
- physical_to_logical_pkg[pkg] = -1;
- pr_warn("APIC(%x) Package %u exceeds logical package max\n",
- apicid, pkg);
+ if (logical_packages >= __max_logical_packages) {
+ pr_warn("Package %u of CPU %u exceeds BIOS package data %u.\n",
+ logical_packages, cpu, __max_logical_packages);
return -ENOSPC;
}
new = logical_packages++;
- pr_info("APIC(%x) Converting physical %u to logical package %u\n",
- apicid, pkg, new);
+ if (new != pkg) {
+ pr_info("CPU %u Converting physical %u to logical package %u\n",
+ cpu, pkg, new);
+ }
physical_to_logical_pkg[pkg] = new;
found:
@@ -308,9 +326,9 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg)
}
EXPORT_SYMBOL(topology_phys_to_logical_pkg);
-static void __init smp_init_package_map(void)
+static void __init smp_init_package_map(struct cpuinfo_x86 *c, unsigned int cpu)
{
- unsigned int ncpus, cpu;
+ unsigned int ncpus;
size_t size;
/*
@@ -355,27 +373,9 @@ static void __init smp_init_package_map(void)
size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long);
physical_package_map = kzalloc(size, GFP_KERNEL);
- for_each_present_cpu(cpu) {
- unsigned int apicid = apic->cpu_present_to_apicid(cpu);
-
- if (apicid == BAD_APICID || !apic->apic_id_valid(apicid))
- continue;
- if (!topology_update_package_map(apicid, cpu))
- continue;
- pr_warn("CPU %u APICId %x disabled\n", cpu, apicid);
- per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID;
- set_cpu_possible(cpu, false);
- set_cpu_present(cpu, false);
- }
-
- if (logical_packages > __max_logical_packages) {
- pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n",
- logical_packages, __max_logical_packages);
- logical_packages_frozen = true;
- __max_logical_packages = logical_packages;
- }
-
pr_info("Max logical packages: %u\n", __max_logical_packages);
+
+ topology_update_package_map(c->phys_proc_id, cpu);
}
void __init smp_store_boot_cpu_info(void)
@@ -385,7 +385,7 @@ void __init smp_store_boot_cpu_info(void)
*c = boot_cpu_data;
c->cpu_index = id;
- smp_init_package_map();
+ smp_init_package_map(c, id);
}
/*
@@ -436,9 +436,15 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
if (c->phys_proc_id == o->phys_proc_id &&
- per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
- c->cpu_core_id == o->cpu_core_id)
- return topology_sane(c, o, "smt");
+ per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
+ if (c->cpu_core_id == o->cpu_core_id)
+ return topology_sane(c, o, "smt");
+
+ if ((c->cu_id != 0xff) &&
+ (o->cu_id != 0xff) &&
+ (c->cu_id == o->cu_id))
+ return topology_sane(c, o, "smt");
+ }
} else if (c->phys_proc_id == o->phys_proc_id &&
c->cpu_core_id == o->cpu_core_id) {
@@ -471,22 +477,42 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+static inline int x86_sched_itmt_flags(void)
+{
+ return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
+}
+
+#ifdef CONFIG_SCHED_MC
+static int x86_core_flags(void)
+{
+ return cpu_core_flags() | x86_sched_itmt_flags();
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+static int x86_smt_flags(void)
+{
+ return cpu_smt_flags() | x86_sched_itmt_flags();
+}
+#endif
+#endif
+
static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
{ NULL, },
};
static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
@@ -821,14 +847,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-void smp_announce(void)
-{
- int num_nodes = num_online_nodes();
-
- printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n",
- num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus());
-}
-
/* reduce the number of lines printed when booting a large cpu count system */
static void announce_cpu(int cpu, int apicid)
{
@@ -964,9 +982,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
int cpu0_nmi_registered = 0;
unsigned long timeout;
- idle->thread.sp = (unsigned long) (((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(idle))) - 1);
-
+ idle->thread.sp = (unsigned long)task_pt_regs(idle);
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp;
@@ -1111,7 +1127,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
return err;
/* the FPU context is blank, nobody can own it */
- __cpu_disable_lazy_restore(cpu);
+ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
common_cpu_up(cpu, tidle);
@@ -1331,11 +1347,10 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
default_setup_apic_routing();
cpu0_logical_apicid = apic_bsp_setup(false);
- pr_info("CPU%d: ", 0);
+ pr_info("CPU0: ");
print_cpu_info(&cpu_data(0));
- if (is_uv_system())
- uv_system_init();
+ uv_system_init();
set_mtrr_aps_delayed_init();
@@ -1456,15 +1471,15 @@ __init void prefill_possible_map(void)
possible = i;
}
+ nr_cpu_ids = possible;
+
pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
possible, max_t(int, possible - num_processors, 0));
+ reset_cpu_possible_mask();
+
for (i = 0; i < possible; i++)
set_cpu_possible(i, true);
- for (; i < NR_CPUS; i++)
- set_cpu_possible(i, false);
-
- nr_cpu_ids = possible;
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1575,7 +1590,6 @@ void play_dead_common(void)
{
idle_task_exit();
reset_lazy_tlbstate();
- amd_e400_remove_cpu(raw_smp_processor_id());
/* Ack it */
(void)cpu_report_death();
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 0653788026e2..8e2b79b88e51 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -4,6 +4,8 @@
* Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*/
#include <linux/sched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/task_stack.h>
#include <linux/stacktrace.h>
#include <linux/export.h>
#include <linux/uaccess.h>
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index a23ce84a3f6c..f07f83b3611b 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -2,6 +2,7 @@
* x86 single-step support code, common to 32-bit and 64-bit.
*/
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/ptrace.h>
#include <asm/desc.h>
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index a55ed63b9f91..50215a4b9347 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -1,5 +1,6 @@
#include <linux/errno.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/fs.h>
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 8402907825b0..b868fa1b812b 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -408,7 +408,7 @@ static __init int tboot_late_init(void)
tboot_create_trampoline();
atomic_set(&ap_wfs_count, 0);
- cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "AP_X86_TBOOT_DYING", NULL,
+ cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "x86/tboot:dying", NULL,
tboot_dying_cpu);
#ifdef CONFIG_DEBUG_FS
debugfs_create_file("tboot_log", S_IRUSR,
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
deleted file mode 100644
index 27538f183c3b..000000000000
--- a/arch/x86/kernel/test_nx.c
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * test_nx.c: functional test for NX functionality
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <linux/module.h>
-#include <linux/sort.h>
-#include <linux/slab.h>
-
-#include <asm/uaccess.h>
-#include <asm/asm.h>
-
-extern int rodata_test_data;
-
-/*
- * This file checks 4 things:
- * 1) Check if the stack is not executable
- * 2) Check if kmalloc memory is not executable
- * 3) Check if the .rodata section is not executable
- * 4) Check if the .data section of a module is not executable
- *
- * To do this, the test code tries to execute memory in stack/kmalloc/etc,
- * and then checks if the expected trap happens.
- *
- * Sadly, this implies having a dynamic exception handling table entry.
- * ... which can be done (and will make Rusty cry)... but it can only
- * be done in a stand-alone module with only 1 entry total.
- * (otherwise we'd have to sort and that's just too messy)
- */
-
-
-
-/*
- * We want to set up an exception handling point on our stack,
- * which means a variable value. This function is rather dirty
- * and walks the exception table of the module, looking for a magic
- * marker and replaces it with a specific function.
- */
-static void fudze_exception_table(void *marker, void *new)
-{
- struct module *mod = THIS_MODULE;
- struct exception_table_entry *extable;
-
- /*
- * Note: This module has only 1 exception table entry,
- * so searching and sorting is not needed. If that changes,
- * this would be the place to search and re-sort the exception
- * table.
- */
- if (mod->num_exentries > 1) {
- printk(KERN_ERR "test_nx: too many exception table entries!\n");
- printk(KERN_ERR "test_nx: test results are not reliable.\n");
- return;
- }
- extable = (struct exception_table_entry *)mod->extable;
- extable[0].insn = (unsigned long)new;
-}
-
-
-/*
- * exception tables get their symbols translated so we need
- * to use a fake function to put in there, which we can then
- * replace at runtime.
- */
-void foo_label(void);
-
-/*
- * returns 0 for not-executable, negative for executable
- *
- * Note: we cannot allow this function to be inlined, because
- * that would give us more than 1 exception table entry.
- * This in turn would break the assumptions above.
- */
-static noinline int test_address(void *address)
-{
- unsigned long result;
-
- /* Set up an exception table entry for our address */
- fudze_exception_table(&foo_label, address);
- result = 1;
- asm volatile(
- "foo_label:\n"
- "0: call *%[fake_code]\n"
- "1:\n"
- ".section .fixup,\"ax\"\n"
- "2: mov %[zero], %[rslt]\n"
- " ret\n"
- ".previous\n"
- _ASM_EXTABLE(0b,2b)
- : [rslt] "=r" (result)
- : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result)
- );
- /* change the exception table back for the next round */
- fudze_exception_table(address, &foo_label);
-
- if (result)
- return -ENODEV;
- return 0;
-}
-
-static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */
-
-static int test_NX(void)
-{
- int ret = 0;
- /* 0xC3 is the opcode for "ret" */
- char stackcode[] = {0xC3, 0x90, 0 };
- char *heap;
-
- test_data = 0xC3;
-
- printk(KERN_INFO "Testing NX protection\n");
-
- /* Test 1: check if the stack is not executable */
- if (test_address(&stackcode)) {
- printk(KERN_ERR "test_nx: stack was executable\n");
- ret = -ENODEV;
- }
-
-
- /* Test 2: Check if the heap is executable */
- heap = kmalloc(64, GFP_KERNEL);
- if (!heap)
- return -ENOMEM;
- heap[0] = 0xC3; /* opcode for "ret" */
-
- if (test_address(heap)) {
- printk(KERN_ERR "test_nx: heap was executable\n");
- ret = -ENODEV;
- }
- kfree(heap);
-
- /*
- * The following 2 tests currently fail, this needs to get fixed
- * Until then, don't run them to avoid too many people getting scared
- * by the error message
- */
-
- /* Test 3: Check if the .rodata section is executable */
- if (rodata_test_data != 0xC3) {
- printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
- ret = -ENODEV;
- } else if (test_address(&rodata_test_data)) {
- printk(KERN_ERR "test_nx: .rodata section is executable\n");
- ret = -ENODEV;
- }
-
-#if 0
- /* Test 4: Check if the .data section of a module is executable */
- if (test_address(&test_data)) {
- printk(KERN_ERR "test_nx: .data section is executable\n");
- ret = -ENODEV;
- }
-
-#endif
- return ret;
-}
-
-static void test_exit(void)
-{
-}
-
-module_init(test_NX);
-module_exit(test_exit);
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Testcase for the NX infrastructure");
-MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
deleted file mode 100644
index 222e84e2432e..000000000000
--- a/arch/x86/kernel/test_rodata.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * test_rodata.c: functional test for mark_rodata_ro function
- *
- * (C) Copyright 2008 Intel Corporation
- * Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-#include <asm/cacheflush.h>
-#include <asm/sections.h>
-#include <asm/asm.h>
-
-int rodata_test(void)
-{
- unsigned long result;
- unsigned long start, end;
-
- /* test 1: read the value */
- /* If this test fails, some previous testrun has clobbered the state */
- if (!rodata_test_data) {
- printk(KERN_ERR "rodata_test: test 1 fails (start data)\n");
- return -ENODEV;
- }
-
- /* test 2: write to the variable; this should fault */
- /*
- * If this test fails, we managed to overwrite the data
- *
- * This is written in assembly to be able to catch the
- * exception that is supposed to happen in the correct
- * case
- */
-
- result = 1;
- asm volatile(
- "0: mov %[zero],(%[rodata_test])\n"
- " mov %[zero], %[rslt]\n"
- "1:\n"
- ".section .fixup,\"ax\"\n"
- "2: jmp 1b\n"
- ".previous\n"
- _ASM_EXTABLE(0b,2b)
- : [rslt] "=r" (result)
- : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
- );
-
-
- if (!result) {
- printk(KERN_ERR "rodata_test: test data was not read only\n");
- return -ENODEV;
- }
-
- /* test 3: check the value hasn't changed */
- /* If this test fails, we managed to overwrite the data */
- if (!rodata_test_data) {
- printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n");
- return -ENODEV;
- }
- /* test 4: check if the rodata section is 4Kb aligned */
- start = (unsigned long)__start_rodata;
- end = (unsigned long)__end_rodata;
- if (start & (PAGE_SIZE - 1)) {
- printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n");
- return -ENODEV;
- }
- if (end & (PAGE_SIZE - 1)) {
- printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n");
- return -ENODEV;
- }
-
- return 0;
-}
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 9692a5e9fdab..6c8934406dc9 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -5,7 +5,7 @@
#include <linux/regset.h>
#include <linux/syscalls.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/desc.h>
#include <asm/ldt.h>
#include <asm/processor.h>
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 1c113db9ed57..15515132bf0d 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -34,7 +34,7 @@ static void switch_idt(void *arg)
local_irq_restore(flags);
}
-void trace_irq_vector_regfunc(void)
+int trace_irq_vector_regfunc(void)
{
mutex_lock(&irq_vector_mutex);
if (!trace_irq_vector_refcount) {
@@ -44,6 +44,7 @@ void trace_irq_vector_regfunc(void)
}
trace_irq_vector_refcount++;
mutex_unlock(&irq_vector_mutex);
+ return 0;
}
void trace_irq_vector_unregfunc(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bd4e3d4d3625..4e496379a871 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -29,6 +29,7 @@
#include <linux/errno.h>
#include <linux/kexec.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/timer.h>
#include <linux/init.h>
#include <linux/bug.h>
@@ -254,7 +255,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx",
tsk->comm, tsk->pid, str,
regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
pr_cont("\n");
}
@@ -518,7 +519,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx",
tsk->comm, task_pid_nr(tsk),
regs->ip, regs->sp, error_code);
- print_vma_addr(" in ", regs->ip);
+ print_vma_addr(KERN_CONT " in ", regs->ip);
pr_cont("\n");
}
@@ -563,11 +564,9 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
* as we may switch to the interrupt stack.
*/
debug_stack_usage_inc();
- preempt_disable();
cond_local_irq_enable(regs);
do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
cond_local_irq_disable(regs);
- preempt_enable_no_resched();
debug_stack_usage_dec();
exit:
ist_exit(regs);
@@ -742,14 +741,12 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_inc();
/* It's safe to allow irq's after DR6 has been saved */
- preempt_disable();
cond_local_irq_enable(regs);
if (v8086_mode(regs)) {
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
X86_TRAP_DB);
cond_local_irq_disable(regs);
- preempt_enable_no_resched();
debug_stack_usage_dec();
goto exit;
}
@@ -769,7 +766,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
send_sigtrap(tsk, regs, error_code, si_code);
cond_local_irq_disable(regs);
- preempt_enable_no_resched();
debug_stack_usage_dec();
exit:
@@ -853,6 +849,8 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
+ unsigned long cr0;
+
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
#ifdef CONFIG_MATH_EMULATION
@@ -866,10 +864,20 @@ do_device_not_available(struct pt_regs *regs, long error_code)
return;
}
#endif
- fpu__restore(&current->thread.fpu); /* interrupts still off */
-#ifdef CONFIG_X86_32
- cond_local_irq_enable(regs);
-#endif
+
+ /* This should not happen. */
+ cr0 = read_cr0();
+ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
+ /* Try to fix it up and carry on. */
+ write_cr0(cr0 & ~X86_CR0_TS);
+ } else {
+ /*
+ * Something terrible happened, and we're better off trying
+ * to kill the task than getting stuck in a never-ending
+ * loop of #NM faults.
+ */
+ die("unexpected #NM exception", regs, error_code);
+ }
}
NOKPROBE_SYMBOL(do_device_not_available);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 46b2f41f8b05..714dfba6a1e7 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -2,6 +2,7 @@
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/clock.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/timer.h>
@@ -326,9 +327,16 @@ unsigned long long sched_clock(void)
{
return paravirt_sched_clock();
}
+
+bool using_native_sched_clock(void)
+{
+ return pv_time_ops.sched_clock == native_sched_clock;
+}
#else
unsigned long long
sched_clock(void) __attribute__((alias("native_sched_clock")));
+
+bool using_native_sched_clock(void) { return true; }
#endif
int check_tsc_unstable(void)
@@ -694,6 +702,7 @@ unsigned long native_calibrate_tsc(void)
crystal_khz = 24000; /* 24.0 MHz */
break;
case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_FAM6_ATOM_DENVERTON:
crystal_khz = 25000; /* 25.0 MHz */
break;
case INTEL_FAM6_ATOM_GOLDMONT:
@@ -702,6 +711,20 @@ unsigned long native_calibrate_tsc(void)
}
}
+ /*
+ * TSC frequency determined by CPUID is a "hardware reported"
+ * frequency and is the most accurate one so far we have. This
+ * is considered a known frequency.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ /*
+ * For Atom SoCs TSC is the only reliable clocksource.
+ * Mark TSC reliable so no watchdog on it.
+ */
+ if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
return crystal_khz * ebx_numerator / eax_denominator;
}
@@ -1043,18 +1066,20 @@ static void detect_art(void)
if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
return;
- cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
- &art_to_tsc_numerator, unused, unused+1);
-
- /* Don't enable ART in a VM, non-stop TSC required */
+ /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */
if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
- art_to_tsc_denominator < ART_MIN_DENOMINATOR)
+ !boot_cpu_has(X86_FEATURE_TSC_ADJUST))
return;
- if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset))
+ cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
+ &art_to_tsc_numerator, unused, unused+1);
+
+ if (art_to_tsc_denominator < ART_MIN_DENOMINATOR)
return;
+ rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset);
+
/* Make this sticky over multiple CPU init calls */
setup_force_cpu_cap(X86_FEATURE_ART);
}
@@ -1064,6 +1089,11 @@ static void detect_art(void)
static struct clocksource clocksource_tsc;
+static void tsc_resume(struct clocksource *cs)
+{
+ tsc_verify_tsc_adjust(true);
+}
+
/*
* We used to compare the TSC to the cycle_last value in the clocksource
* structure to avoid a nasty time-warp. This can be observed in a
@@ -1080,9 +1110,21 @@ static struct clocksource clocksource_tsc;
* checking the result of read_tsc() - cycle_last for being negative.
* That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
*/
-static cycle_t read_tsc(struct clocksource *cs)
+static u64 read_tsc(struct clocksource *cs)
{
- return (cycle_t)rdtsc_ordered();
+ return (u64)rdtsc_ordered();
+}
+
+static void tsc_cs_mark_unstable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ tsc_unstable = 1;
+ if (using_native_sched_clock())
+ clear_sched_clock_stable();
+ disable_sched_clock_irqtime();
+ pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
/*
@@ -1096,22 +1138,26 @@ static struct clocksource clocksource_tsc = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
.archdata = { .vclock_mode = VCLOCK_TSC },
+ .resume = tsc_resume,
+ .mark_unstable = tsc_cs_mark_unstable,
};
void mark_tsc_unstable(char *reason)
{
- if (!tsc_unstable) {
- tsc_unstable = 1;
+ if (tsc_unstable)
+ return;
+
+ tsc_unstable = 1;
+ if (using_native_sched_clock())
clear_sched_clock_stable();
- disable_sched_clock_irqtime();
- pr_info("Marking TSC unstable due to %s\n", reason);
- /* Change only the rating, when not registered */
- if (clocksource_tsc.mult)
- clocksource_mark_unstable(&clocksource_tsc);
- else {
- clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
- clocksource_tsc.rating = 0;
- }
+ disable_sched_clock_irqtime();
+ pr_info("Marking TSC unstable due to %s\n", reason);
+ /* Change only the rating, when not registered */
+ if (clocksource_tsc.mult) {
+ clocksource_mark_unstable(&clocksource_tsc);
+ } else {
+ clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
+ clocksource_tsc.rating = 0;
}
}
@@ -1170,7 +1216,7 @@ int unsynchronized_tsc(void)
/*
* Convert ART to TSC given numerator/denominator found in detect_art()
*/
-struct system_counterval_t convert_art_to_tsc(cycle_t art)
+struct system_counterval_t convert_art_to_tsc(u64 art)
{
u64 tmp, res, rem;
@@ -1283,10 +1329,12 @@ static int __init init_tsc_clocksource(void)
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
/*
- * Trust the results of the earlier calibration on systems
- * exporting a reliable TSC.
+ * When TSC frequency is known (retrieved via MSR or CPUID), we skip
+ * the refined calibration and directly register it as a clocksource.
*/
- if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
+ if (boot_cpu_has(X86_FEATURE_ART))
+ art_related_clocksource = &clocksource_tsc;
clocksource_register_khz(&clocksource_tsc, tsc_khz);
return 0;
}
@@ -1333,6 +1381,9 @@ void __init tsc_init(void)
(unsigned long)cpu_khz / 1000,
(unsigned long)cpu_khz % 1000);
+ /* Sanitize TSC ADJUST before cyc2ns gets initialized */
+ tsc_store_and_check_tsc_adjust(true);
+
/*
* Secondary CPUs do not run through tsc_init(), so set up
* all the scale factors for all CPUs, assuming the same
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 0fe720d64fef..19afdbd7d0a7 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -100,5 +100,24 @@ unsigned long cpu_khz_from_msr(void)
#ifdef CONFIG_X86_LOCAL_APIC
lapic_timer_frequency = (freq * 1000) / HZ;
#endif
+
+ /*
+ * TSC frequency determined by MSR is always considered "known"
+ * because it is reported by HW.
+ * Another fact is that on MSR capable platforms, PIT/HPET is
+ * generally not available so calibration won't work at all.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ /*
+ * Unfortunately there is no way for hardware to tell whether the
+ * TSC is reliable. We were told by silicon design team that TSC
+ * on Atom SoCs are always "reliable". TSC is also the only
+ * reliable clocksource on these SoCs (HPET is either not present
+ * or not functional) so mark TSC reliable which removes the
+ * requirement for a watchdog clocksource.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
return res;
}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 78083bf23ed1..728f75378475 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -14,18 +14,166 @@
* ( The serial nature of the boot logic and the CPU hotplug lock
* protects against more than 2 CPUs entering this code. )
*/
+#include <linux/topology.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <asm/tsc.h>
+struct tsc_adjust {
+ s64 bootval;
+ s64 adjusted;
+ unsigned long nextcheck;
+ bool warned;
+};
+
+static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+
+void tsc_verify_tsc_adjust(bool resume)
+{
+ struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
+ s64 curval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return;
+
+ /* Rate limit the MSR check */
+ if (!resume && time_before(jiffies, adj->nextcheck))
+ return;
+
+ adj->nextcheck = jiffies + HZ;
+
+ rdmsrl(MSR_IA32_TSC_ADJUST, curval);
+ if (adj->adjusted == curval)
+ return;
+
+ /* Restore the original value */
+ wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted);
+
+ if (!adj->warned || resume) {
+ pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
+ smp_processor_id(), adj->adjusted, curval);
+ adj->warned = true;
+ }
+}
+
+static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
+ unsigned int cpu, bool bootcpu)
+{
+ /*
+ * First online CPU in a package stores the boot value in the
+ * adjustment value. This value might change later via the sync
+ * mechanism. If that fails we still can yell about boot values not
+ * being consistent.
+ *
+ * On the boot cpu we just force set the ADJUST value to 0 if it's
+ * non zero. We don't do that on non boot cpus because physical
+ * hotplug should have set the ADJUST register to a value > 0 so
+ * the TSC is in sync with the already running cpus.
+ *
+ * But we always force positive ADJUST values. Otherwise the TSC
+ * deadline timer creates an interrupt storm. We also have to
+ * prevent values > 0x7FFFFFFF as those wreckage the timer as well.
+ */
+ if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) ||
+ (bootval > 0x7FFFFFFF)) {
+ pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu,
+ bootval);
+ wrmsrl(MSR_IA32_TSC_ADJUST, 0);
+ bootval = 0;
+ }
+ cur->adjusted = bootval;
+}
+
+#ifndef CONFIG_SMP
+bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+ struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+ s64 bootval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return false;
+
+ rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
+ cur->bootval = bootval;
+ cur->nextcheck = jiffies + HZ;
+ tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
+ return false;
+}
+
+#else /* !CONFIG_SMP */
+
+/*
+ * Store and check the TSC ADJUST MSR if available
+ */
+bool tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+ struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust);
+ unsigned int refcpu, cpu = smp_processor_id();
+ struct cpumask *mask;
+ s64 bootval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return false;
+
+ rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
+ cur->bootval = bootval;
+ cur->nextcheck = jiffies + HZ;
+ cur->warned = false;
+
+ /*
+ * Check whether this CPU is the first in a package to come up. In
+ * this case do not check the boot value against another package
+ * because the new package might have been physically hotplugged,
+ * where TSC_ADJUST is expected to be different. When called on the
+ * boot CPU topology_core_cpumask() might not be available yet.
+ */
+ mask = topology_core_cpumask(cpu);
+ refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;
+
+ if (refcpu >= nr_cpu_ids) {
+ tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(),
+ bootcpu);
+ return false;
+ }
+
+ ref = per_cpu_ptr(&tsc_adjust, refcpu);
+ /*
+ * Compare the boot value and complain if it differs in the
+ * package.
+ */
+ if (bootval != ref->bootval) {
+ pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n",
+ refcpu, ref->bootval, cpu, bootval);
+ }
+ /*
+ * The TSC_ADJUST values in a package must be the same. If the boot
+ * value on this newly upcoming CPU differs from the adjustment
+ * value of the already online CPU in this package, set it to that
+ * adjusted value.
+ */
+ if (bootval != ref->adjusted) {
+ pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n",
+ refcpu, ref->adjusted, cpu, bootval);
+ cur->adjusted = ref->adjusted;
+ wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
+ }
+ /*
+ * We have the TSCs forced to be in sync on this package. Skip sync
+ * test:
+ */
+ return true;
+}
+
/*
* Entry/exit counters that make sure that both CPUs
* run the measurement code at once:
*/
static atomic_t start_count;
static atomic_t stop_count;
+static atomic_t skip_test;
+static atomic_t test_runs;
/*
* We use a raw spinlock in this exceptional case, because
@@ -37,15 +185,16 @@ static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static cycles_t last_tsc;
static cycles_t max_warp;
static int nr_warps;
+static int random_warps;
/*
* TSC-warp measurement loop running on both CPUs. This is not called
* if there is no TSC.
*/
-static void check_tsc_warp(unsigned int timeout)
+static cycles_t check_tsc_warp(unsigned int timeout)
{
- cycles_t start, now, prev, end;
- int i;
+ cycles_t start, now, prev, end, cur_max_warp = 0;
+ int i, cur_warps = 0;
start = rdtsc_ordered();
/*
@@ -85,13 +234,22 @@ static void check_tsc_warp(unsigned int timeout)
if (unlikely(prev > now)) {
arch_spin_lock(&sync_lock);
max_warp = max(max_warp, prev - now);
+ cur_max_warp = max_warp;
+ /*
+ * Check whether this bounces back and forth. Only
+ * one CPU should observe time going backwards.
+ */
+ if (cur_warps != nr_warps)
+ random_warps++;
nr_warps++;
+ cur_warps = nr_warps;
arch_spin_unlock(&sync_lock);
}
}
WARN(!(now-start),
"Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
now-start, end-start);
+ return cur_max_warp;
}
/*
@@ -128,23 +286,27 @@ void check_tsc_sync_source(int cpu)
if (unsynchronized_tsc())
return;
- if (tsc_clocksource_reliable) {
- if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
- pr_info(
- "Skipped synchronization checks as TSC is reliable.\n");
- return;
- }
-
/*
- * Reset it - in case this is a second bootup:
+ * Set the maximum number of test runs to
+ * 1 if the CPU does not provide the TSC_ADJUST MSR
+ * 3 if the MSR is available, so the target can try to adjust
*/
- atomic_set(&stop_count, 0);
-
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ atomic_set(&test_runs, 1);
+ else
+ atomic_set(&test_runs, 3);
+retry:
/*
- * Wait for the target to arrive:
+ * Wait for the target to start or to skip the test:
*/
- while (atomic_read(&start_count) != cpus-1)
+ while (atomic_read(&start_count) != cpus - 1) {
+ if (atomic_read(&skip_test) > 0) {
+ atomic_set(&skip_test, 0);
+ return;
+ }
cpu_relax();
+ }
+
/*
* Trigger the target to continue into the measurement too:
*/
@@ -155,21 +317,35 @@ void check_tsc_sync_source(int cpu)
while (atomic_read(&stop_count) != cpus-1)
cpu_relax();
- if (nr_warps) {
+ /*
+ * If the test was successful set the number of runs to zero and
+ * stop. If not, decrement the number of runs an check if we can
+ * retry. In case of random warps no retry is attempted.
+ */
+ if (!nr_warps) {
+ atomic_set(&test_runs, 0);
+
+ pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
+ smp_processor_id(), cpu);
+
+ } else if (atomic_dec_and_test(&test_runs) || random_warps) {
+ /* Force it to 0 if random warps brought us here */
+ atomic_set(&test_runs, 0);
+
pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
smp_processor_id(), cpu);
pr_warning("Measured %Ld cycles TSC warp between CPUs, "
"turning off TSC clock.\n", max_warp);
+ if (random_warps)
+ pr_warning("TSC warped randomly between CPUs\n");
mark_tsc_unstable("check_tsc_sync_source failed");
- } else {
- pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
- smp_processor_id(), cpu);
}
/*
* Reset it - just in case we boot another CPU later:
*/
atomic_set(&start_count, 0);
+ random_warps = 0;
nr_warps = 0;
max_warp = 0;
last_tsc = 0;
@@ -178,6 +354,12 @@ void check_tsc_sync_source(int cpu)
* Let the target continue with the bootup:
*/
atomic_inc(&stop_count);
+
+ /*
+ * Retry, if there is a chance to do so.
+ */
+ if (atomic_read(&test_runs) > 0)
+ goto retry;
}
/*
@@ -185,12 +367,30 @@ void check_tsc_sync_source(int cpu)
*/
void check_tsc_sync_target(void)
{
+ struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+ unsigned int cpu = smp_processor_id();
+ cycles_t cur_max_warp, gbl_max_warp;
int cpus = 2;
/* Also aborts if there is no TSC. */
- if (unsynchronized_tsc() || tsc_clocksource_reliable)
+ if (unsynchronized_tsc())
+ return;
+
+ /*
+ * Store, verify and sanitize the TSC adjust register. If
+ * successful skip the test.
+ *
+ * The test is also skipped when the TSC is marked reliable. This
+ * is true for SoCs which have no fallback clocksource. On these
+ * SoCs the TSC is frequency synchronized, but still the TSC ADJUST
+ * register might have been wreckaged by the BIOS..
+ */
+ if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) {
+ atomic_inc(&skip_test);
return;
+ }
+retry:
/*
* Register this CPU's participation and wait for the
* source CPU to start the measurement:
@@ -199,7 +399,12 @@ void check_tsc_sync_target(void)
while (atomic_read(&start_count) != cpus)
cpu_relax();
- check_tsc_warp(loop_timeout(smp_processor_id()));
+ cur_max_warp = check_tsc_warp(loop_timeout(cpu));
+
+ /*
+ * Store the maximum observed warp value for a potential retry:
+ */
+ gbl_max_warp = max_warp;
/*
* Ok, we are done:
@@ -211,4 +416,61 @@ void check_tsc_sync_target(void)
*/
while (atomic_read(&stop_count) != cpus)
cpu_relax();
+
+ /*
+ * Reset it for the next sync test:
+ */
+ atomic_set(&stop_count, 0);
+
+ /*
+ * Check the number of remaining test runs. If not zero, the test
+ * failed and a retry with adjusted TSC is possible. If zero the
+ * test was either successful or failed terminally.
+ */
+ if (!atomic_read(&test_runs))
+ return;
+
+ /*
+ * If the warp value of this CPU is 0, then the other CPU
+ * observed time going backwards so this TSC was ahead and
+ * needs to move backwards.
+ */
+ if (!cur_max_warp)
+ cur_max_warp = -gbl_max_warp;
+
+ /*
+ * Add the result to the previous adjustment value.
+ *
+ * The adjustement value is slightly off by the overhead of the
+ * sync mechanism (observed values are ~200 TSC cycles), but this
+ * really depends on CPU, node distance and frequency. So
+ * compensating for this is hard to get right. Experiments show
+ * that the warp is not longer detectable when the observed warp
+ * value is used. In the worst case the adjustment needs to go
+ * through a 3rd run for fine tuning.
+ */
+ cur->adjusted += cur_max_warp;
+
+ /*
+ * TSC deadline timer stops working or creates an interrupt storm
+ * with adjust values < 0 and > x07ffffff.
+ *
+ * To allow adjust values > 0x7FFFFFFF we need to disable the
+ * deadline timer and use the local APIC timer, but that requires
+ * more intrusive changes and we do not have any useful information
+ * from Intel about the underlying HW wreckage yet.
+ */
+ if (cur->adjusted < 0)
+ cur->adjusted = 0;
+ if (cur->adjusted > 0x7FFFFFFF)
+ cur->adjusted = 0x7FFFFFFF;
+
+ pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
+ cpu, cur_max_warp, cur->adjusted);
+
+ wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
+ goto retry;
+
}
+
+#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index a2456d4d286a..08339262b666 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -1,4 +1,6 @@
#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/task_stack.h>
#include <asm/ptrace.h>
#include <asm/bitops.h>
#include <asm/stacktrace.h>
@@ -6,6 +8,52 @@
#define FRAME_HEADER_SIZE (sizeof(long) * 2)
+/*
+ * This disables KASAN checking when reading a value from another task's stack,
+ * since the other task could be running on another CPU and could have poisoned
+ * the stack in the meantime.
+ */
+#define READ_ONCE_TASK_STACK(task, x) \
+({ \
+ unsigned long val; \
+ if (task == current) \
+ val = READ_ONCE(x); \
+ else \
+ val = READ_ONCE_NOCHECK(x); \
+ val; \
+})
+
+static void unwind_dump(struct unwind_state *state, unsigned long *sp)
+{
+ static bool dumped_before = false;
+ bool prev_zero, zero = false;
+ unsigned long word;
+
+ if (dumped_before)
+ return;
+
+ dumped_before = true;
+
+ printk_deferred("unwind stack type:%d next_sp:%p mask:%lx graph_idx:%d\n",
+ state->stack_info.type, state->stack_info.next_sp,
+ state->stack_mask, state->graph_idx);
+
+ for (sp = state->orig_sp; sp < state->stack_info.end; sp++) {
+ word = READ_ONCE_NOCHECK(*sp);
+
+ prev_zero = zero;
+ zero = word == 0;
+
+ if (zero) {
+ if (!prev_zero)
+ printk_deferred("%p: %016x ...\n", sp, 0);
+ continue;
+ }
+
+ printk_deferred("%p: %016lx (%pB)\n", sp, word, (void *)word);
+ }
+}
+
unsigned long unwind_get_return_address(struct unwind_state *state)
{
unsigned long addr;
@@ -14,17 +62,84 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
if (unwind_done(state))
return 0;
- addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p,
+ if (state->regs && user_mode(state->regs))
+ return 0;
+
+ addr = READ_ONCE_TASK_STACK(state->task, *addr_p);
+ addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr,
addr_p);
return __kernel_text_address(addr) ? addr : 0;
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);
+static size_t regs_size(struct pt_regs *regs)
+{
+ /* x86_32 regs from kernel mode are two words shorter: */
+ if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs))
+ return sizeof(*regs) - 2*sizeof(long);
+
+ return sizeof(*regs);
+}
+
+#ifdef CONFIG_X86_32
+#define GCC_REALIGN_WORDS 3
+#else
+#define GCC_REALIGN_WORDS 1
+#endif
+
+static bool is_last_task_frame(struct unwind_state *state)
+{
+ unsigned long *last_bp = (unsigned long *)task_pt_regs(state->task) - 2;
+ unsigned long *aligned_bp = last_bp - GCC_REALIGN_WORDS;
+
+ /*
+ * We have to check for the last task frame at two different locations
+ * because gcc can occasionally decide to realign the stack pointer and
+ * change the offset of the stack frame in the prologue of a function
+ * called by head/entry code. Examples:
+ *
+ * <start_secondary>:
+ * push %edi
+ * lea 0x8(%esp),%edi
+ * and $0xfffffff8,%esp
+ * pushl -0x4(%edi)
+ * push %ebp
+ * mov %esp,%ebp
+ *
+ * <x86_64_start_kernel>:
+ * lea 0x8(%rsp),%r10
+ * and $0xfffffffffffffff0,%rsp
+ * pushq -0x8(%r10)
+ * push %rbp
+ * mov %rsp,%rbp
+ *
+ * Note that after aligning the stack, it pushes a duplicate copy of
+ * the return address before pushing the frame pointer.
+ */
+ return (state->bp == last_bp ||
+ (state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1)));
+}
+
+/*
+ * This determines if the frame pointer actually contains an encoded pointer to
+ * pt_regs on the stack. See ENCODE_FRAME_POINTER.
+ */
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+ unsigned long regs = (unsigned long)bp;
+
+ if (!(regs & 0x1))
+ return NULL;
+
+ return (struct pt_regs *)(regs & ~0x1);
+}
+
static bool update_stack_state(struct unwind_state *state, void *addr,
size_t len)
{
struct stack_info *info = &state->stack_info;
+ enum stack_type orig_type = info->type;
/*
* If addr isn't on the current stack, switch to the next one.
@@ -38,31 +153,137 @@ static bool update_stack_state(struct unwind_state *state, void *addr,
&state->stack_mask))
return false;
+ if (!state->orig_sp || info->type != orig_type)
+ state->orig_sp = addr;
+
return true;
}
bool unwind_next_frame(struct unwind_state *state)
{
- unsigned long *next_bp;
+ struct pt_regs *regs;
+ unsigned long *next_bp, *next_frame;
+ size_t next_len;
+ enum stack_type prev_type = state->stack_info.type;
if (unwind_done(state))
return false;
- next_bp = (unsigned long *)*state->bp;
+ /* have we reached the end? */
+ if (state->regs && user_mode(state->regs))
+ goto the_end;
+
+ if (is_last_task_frame(state)) {
+ regs = task_pt_regs(state->task);
+
+ /*
+ * kthreads (other than the boot CPU's idle thread) have some
+ * partial regs at the end of their stack which were placed
+ * there by copy_thread_tls(). But the regs don't have any
+ * useful information, so we can skip them.
+ *
+ * This user_mode() check is slightly broader than a PF_KTHREAD
+ * check because it also catches the awkward situation where a
+ * newly forked kthread transitions into a user task by calling
+ * do_execve(), which eventually clears PF_KTHREAD.
+ */
+ if (!user_mode(regs))
+ goto the_end;
+
+ /*
+ * We're almost at the end, but not quite: there's still the
+ * syscall regs frame. Entry code doesn't encode the regs
+ * pointer for syscalls, so we have to set it manually.
+ */
+ state->regs = regs;
+ state->bp = NULL;
+ return true;
+ }
+
+ /* get the next frame pointer */
+ if (state->regs)
+ next_bp = (unsigned long *)state->regs->bp;
+ else
+ next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task,*state->bp);
+
+ /* is the next frame pointer an encoded pointer to pt_regs? */
+ regs = decode_frame_pointer(next_bp);
+ if (regs) {
+ next_frame = (unsigned long *)regs;
+ next_len = sizeof(*regs);
+ } else {
+ next_frame = next_bp;
+ next_len = FRAME_HEADER_SIZE;
+ }
/* make sure the next frame's data is accessible */
- if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE))
- return false;
+ if (!update_stack_state(state, next_frame, next_len)) {
+ /*
+ * Don't warn on bad regs->bp. An interrupt in entry code
+ * might cause a false positive warning.
+ */
+ if (state->regs)
+ goto the_end;
+
+ goto bad_address;
+ }
+
+ /* Make sure it only unwinds up and doesn't overlap the last frame: */
+ if (state->stack_info.type == prev_type) {
+ if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs))
+ goto bad_address;
+
+ if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE)
+ goto bad_address;
+ }
/* move to the next frame */
- state->bp = next_bp;
+ if (regs) {
+ state->regs = regs;
+ state->bp = NULL;
+ } else {
+ state->bp = next_bp;
+ state->regs = NULL;
+ }
+
return true;
+
+bad_address:
+ /*
+ * When unwinding a non-current task, the task might actually be
+ * running on another CPU, in which case it could be modifying its
+ * stack while we're reading it. This is generally not a problem and
+ * can be ignored as long as the caller understands that unwinding
+ * another task will not always succeed.
+ */
+ if (state->task != current)
+ goto the_end;
+
+ if (state->regs) {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
+ state->regs, state->task->comm,
+ state->task->pid, next_frame);
+ unwind_dump(state, (unsigned long *)state->regs);
+ } else {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n",
+ state->bp, state->task->comm,
+ state->task->pid, next_frame);
+ unwind_dump(state, state->bp);
+ }
+the_end:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
}
EXPORT_SYMBOL_GPL(unwind_next_frame);
void __unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long *first_frame)
{
+ unsigned long *bp, *frame;
+ size_t len;
+
memset(state, 0, sizeof(*state));
state->task = task;
@@ -73,12 +294,22 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
}
/* set up the starting stack frame */
- state->bp = get_frame_pointer(task, regs);
+ bp = get_frame_pointer(task, regs);
+ regs = decode_frame_pointer(bp);
+ if (regs) {
+ state->regs = regs;
+ frame = (unsigned long *)regs;
+ len = sizeof(*regs);
+ } else {
+ state->bp = bp;
+ frame = bp;
+ len = FRAME_HEADER_SIZE;
+ }
/* initialize stack info and make sure the frame data is accessible */
- get_stack_info(state->bp, state->task, &state->stack_info,
+ get_stack_info(frame, state->task, &state->stack_info,
&state->stack_mask);
- update_stack_state(state, state->bp, FRAME_HEADER_SIZE);
+ update_stack_state(state, frame, len);
/*
* The caller can provide the address of the first frame directly
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 01f30e56f99e..23ee89ce59a9 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -35,6 +35,7 @@
#include <linux/interrupt.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/string.h>
@@ -47,7 +48,7 @@
#include <linux/slab.h>
#include <linux/security.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/tlbflush.h>
#include <asm/irq.h>
@@ -160,11 +161,12 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
static void mark_screen_rdonly(struct mm_struct *mm)
{
+ struct vm_area_struct *vma;
+ spinlock_t *ptl;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- spinlock_t *ptl;
int i;
down_write(&mm->mmap_sem);
@@ -177,7 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pmd = pmd_offset(pud, 0xA0000);
if (pmd_trans_huge(*pmd)) {
- struct vm_area_struct *vma = find_vma(mm, 0xA0000);
+ vma = find_vma(mm, 0xA0000);
split_huge_pmd(vma, pmd, 0xA0000);
}
if (pmd_none_or_clear_bad(pmd))
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index dbf67f64d5ec..c74ae9ce8dc4 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -91,10 +91,10 @@ SECTIONS
/* Text and read-only data */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
_text = .;
+ _stext = .;
/* bootstrapping code */
HEAD_TEXT
. = ALIGN(8);
- _stext = .;
TEXT_TEXT
SCHED_TEXT
CPUIDLE_TEXT
@@ -345,7 +345,6 @@ SECTIONS
DISCARDS
/DISCARD/ : {
*(.eh_frame)
- *(__func_stack_frame_non_standard)
}
}
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 0bd9f1287f39..11a93f005268 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -89,7 +89,6 @@ struct x86_cpuinit_ops x86_cpuinit = {
};
static void default_nmi_init(void) { };
-static int default_i8042_detect(void) { return 1; };
struct x86_platform_ops x86_platform __ro_after_init = {
.calibrate_cpu = native_calibrate_cpu,
@@ -100,7 +99,6 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.is_untracked_pat_range = is_ISA_range,
.nmi_init = default_nmi_init,
.get_nmi_reason = default_get_nmi_reason,
- .i8042_detect = default_i8042_detect,
.save_sched_clock_state = tsc_save_sched_clock_state,
.restore_sched_clock_state = tsc_restore_sched_clock_state,
};