Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 25
-rw-r--r--  arch/x86/kernel/acpi/Makefile | 1
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 217
-rw-r--r--  arch/x86/kernel/acpi/cppc.c | 204
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 9
-rw-r--r--  arch/x86/kernel/acpi/madt_playdead.S | 28
-rw-r--r--  arch/x86/kernel/acpi/madt_wakeup.c | 292
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S | 25
-rw-r--r--  arch/x86/kernel/alternative.c | 578
-rw-r--r--  arch/x86/kernel/amd_gart_64.c | 4
-rw-r--r--  arch/x86/kernel/amd_nb.c | 262
-rw-r--r--  arch/x86/kernel/amd_node.c | 215
-rw-r--r--  arch/x86/kernel/aperture_64.c | 3
-rw-r--r--  arch/x86/kernel/apic/Makefile | 2
-rw-r--r--  arch/x86/kernel/apic/apic.c | 390
-rw-r--r--  arch/x86/kernel/apic/apic_common.c | 15
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 137
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 6
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 21
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 21
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 856
-rw-r--r--  arch/x86/kernel/apic/local.h | 5
-rw-r--r--  arch/x86/kernel/apic/msi.c | 8
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 10
-rw-r--r--  arch/x86/kernel/apic/vector.c | 26
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 12
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 13
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 70
-rw-r--r--  arch/x86/kernel/apm_32.c | 31
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 2
-rw-r--r--  arch/x86/kernel/callthunks.c | 63
-rw-r--r--  arch/x86/kernel/cet.c | 30
-rw-r--r--  arch/x86/kernel/cfi.c | 4
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 19
-rw-r--r--  arch/x86/kernel/cpu/acrn.c | 4
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 537
-rw-r--r--  arch/x86/kernel/cpu/aperfmperf.c | 113
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 466
-rw-r--r--  arch/x86/kernel/cpu/bus_lock.c | 406
-rw-r--r--  arch/x86/kernel/cpu/cacheinfo.c | 60
-rw-r--r--  arch/x86/kernel/cpu/centaur.c | 4
-rw-r--r--  arch/x86/kernel/cpu/common.c | 633
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 15
-rw-r--r--  arch/x86/kernel/cpu/cpuid-deps.c | 11
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c | 4
-rw-r--r--  arch/x86/kernel/cpu/debugfs.c | 49
-rw-r--r--  arch/x86/kernel/cpu/feat_ctl.c | 4
-rw-r--r--  arch/x86/kernel/cpu/hygon.c | 132
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 784
-rw-r--r--  arch/x86/kernel/cpu/intel_epb.c | 14
-rw-r--r--  arch/x86/kernel/cpu/intel_pconfig.c | 82
-rw-r--r--  arch/x86/kernel/cpu/match.c | 39
-rw-r--r--  arch/x86/kernel/cpu/mce/amd.c | 247
-rw-r--r--  arch/x86/kernel/cpu/mce/apei.c | 119
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c | 585
-rw-r--r--  arch/x86/kernel/cpu/mce/dev-mcelog.c | 14
-rw-r--r--  arch/x86/kernel/cpu/mce/genpool.c | 75
-rw-r--r--  arch/x86/kernel/cpu/mce/inject.c | 24
-rw-r--r--  arch/x86/kernel/cpu/mce/intel.c | 336
-rw-r--r--  arch/x86/kernel/cpu/mce/internal.h | 74
-rw-r--r--  arch/x86/kernel/cpu/mce/severity.c | 27
-rw-r--r--  arch/x86/kernel/cpu/mce/threshold.c | 115
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 516
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd_shas.c | 444
-rw-r--r--  arch/x86/kernel/cpu/microcode/core.c | 5
-rw-r--r--  arch/x86/kernel/cpu/microcode/intel.c | 37
-rw-r--r--  arch/x86/kernel/cpu/microcode/internal.h | 2
-rw-r--r--  arch/x86/kernel/cpu/mkcapflags.sh | 3
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 210
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 23
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.c | 10
-rw-r--r--  arch/x86/kernel/cpu/proc.c | 10
-rw-r--r--  arch/x86/kernel/cpu/rdrand.c | 1
-rw-r--r--  arch/x86/kernel/cpu/resctrl/core.c | 493
-rw-r--r--  arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 262
-rw-r--r--  arch/x86/kernel/cpu/resctrl/internal.h | 194
-rw-r--r--  arch/x86/kernel/cpu/resctrl/monitor.c | 782
-rw-r--r--  arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 112
-rw-r--r--  arch/x86/kernel/cpu/resctrl/rdtgroup.c | 713
-rw-r--r--  arch/x86/kernel/cpu/resctrl/trace.h (renamed from arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h) | 24
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 55
-rw-r--r--  arch/x86/kernel/cpu/sgx/driver.c | 12
-rw-r--r--  arch/x86/kernel/cpu/sgx/ioctl.c | 9
-rw-r--r--  arch/x86/kernel/cpu/sgx/main.c | 49
-rw-r--r--  arch/x86/kernel/cpu/topology.c | 632
-rw-r--r--  arch/x86/kernel/cpu/topology.h | 67
-rw-r--r--  arch/x86/kernel/cpu/topology_amd.c | 220
-rw-r--r--  arch/x86/kernel/cpu/topology_common.c | 255
-rw-r--r--  arch/x86/kernel/cpu/topology_ext.c | 145
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 229
-rw-r--r--  arch/x86/kernel/cpu/zhaoxin.c | 4
-rw-r--r--  arch/x86/kernel/crash.c | 64
-rw-r--r--  arch/x86/kernel/devicetree.c | 53
-rw-r--r--  arch/x86/kernel/dumpstack.c | 6
-rw-r--r--  arch/x86/kernel/e820.c | 36
-rw-r--r--  arch/x86/kernel/early-quirks.c | 89
-rw-r--r--  arch/x86/kernel/eisa.c | 11
-rw-r--r--  arch/x86/kernel/espfix_64.c | 8
-rw-r--r--  arch/x86/kernel/fpu/bugs.c | 3
-rw-r--r--  arch/x86/kernel/fpu/core.c | 6
-rw-r--r--  arch/x86/kernel/fpu/regset.c | 3
-rw-r--r--  arch/x86/kernel/fpu/signal.c | 22
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 141
-rw-r--r--  arch/x86/kernel/fpu/xstate.h | 61
-rw-r--r--  arch/x86/kernel/fred.c | 92
-rw-r--r--  arch/x86/kernel/ftrace.c | 98
-rw-r--r--  arch/x86/kernel/ftrace_32.S | 13
-rw-r--r--  arch/x86/kernel/ftrace_64.S | 17
-rw-r--r--  arch/x86/kernel/head64.c | 214
-rw-r--r--  arch/x86/kernel/head_32.S | 17
-rw-r--r--  arch/x86/kernel/head_64.S | 199
-rw-r--r--  arch/x86/kernel/hpet.c | 28
-rw-r--r--  arch/x86/kernel/i8253.c | 11
-rw-r--r--  arch/x86/kernel/idt.c | 9
-rw-r--r--  arch/x86/kernel/irq.c | 178
-rw-r--r--  arch/x86/kernel/irq_64.c | 1
-rw-r--r--  arch/x86/kernel/irqinit.c | 7
-rw-r--r--  arch/x86/kernel/itmt.c | 81
-rw-r--r--  arch/x86/kernel/jailhouse.c | 31
-rw-r--r--  arch/x86/kernel/kexec-bzimage64.c | 32
-rw-r--r--  arch/x86/kernel/kprobes/common.h | 2
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 113
-rw-r--r--  arch/x86/kernel/kprobes/ftrace.c | 22
-rw-r--r--  arch/x86/kernel/kprobes/opt.c | 2
-rw-r--r--  arch/x86/kernel/ksysfs.c | 18
-rw-r--r--  arch/x86/kernel/kvm.c | 32
-rw-r--r--  arch/x86/kernel/kvmclock.c | 18
-rw-r--r--  arch/x86/kernel/ldt.c | 14
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c | 7
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 141
-rw-r--r--  arch/x86/kernel/mmconf-fam10h_64.c | 1
-rw-r--r--  arch/x86/kernel/module.c | 116
-rw-r--r--  arch/x86/kernel/mpparse.c | 54
-rw-r--r--  arch/x86/kernel/nmi.c | 77
-rw-r--r--  arch/x86/kernel/paravirt.c | 75
-rw-r--r--  arch/x86/kernel/pci-dma.c | 4
-rw-r--r--  arch/x86/kernel/probe_roms.c | 10
-rw-r--r--  arch/x86/kernel/process.c | 115
-rw-r--r--  arch/x86/kernel/process_32.c | 7
-rw-r--r--  arch/x86/kernel/process_64.c | 118
-rw-r--r--  arch/x86/kernel/reboot.c | 26
-rw-r--r--  arch/x86/kernel/relocate_kernel_64.S | 234
-rw-r--r--  arch/x86/kernel/rtc.c | 3
-rw-r--r--  arch/x86/kernel/setup.c | 99
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 4
-rw-r--r--  arch/x86/kernel/sev-shared.c | 1172
-rw-r--r--  arch/x86/kernel/sev.c | 2264
-rw-r--r--  arch/x86/kernel/sev_verify_cbit.S | 2
-rw-r--r--  arch/x86/kernel/shstk.c | 20
-rw-r--r--  arch/x86/kernel/signal.c | 30
-rw-r--r--  arch/x86/kernel/signal_32.c | 2
-rw-r--r--  arch/x86/kernel/signal_64.c | 12
-rw-r--r--  arch/x86/kernel/smp.c | 12
-rw-r--r--  arch/x86/kernel/smpboot.c | 319
-rw-r--r--  arch/x86/kernel/static_call.c | 10
-rw-r--r--  arch/x86/kernel/step.c | 2
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 56
-rw-r--r--  arch/x86/kernel/time.c | 20
-rw-r--r--  arch/x86/kernel/topology.c | 72
-rw-r--r--  arch/x86/kernel/traps.c | 130
-rw-r--r--  arch/x86/kernel/tsc.c | 150
-rw-r--r--  arch/x86/kernel/tsc_msr.c | 14
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 6
-rw-r--r--  arch/x86/kernel/unwind_orc.c | 2
-rw-r--r--  arch/x86/kernel/uprobes.c | 124
-rw-r--r--  arch/x86/kernel/vm86_32.c | 5
-rw-r--r--  arch/x86/kernel/vmcore_info_32.c (renamed from arch/x86/kernel/crash_core_32.c) | 2
-rw-r--r--  arch/x86/kernel/vmcore_info_64.c (renamed from arch/x86/kernel/crash_core_64.c) | 2
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 125
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 13
-rw-r--r--  arch/x86/kernel/x86_init.c | 16
171 files changed, 10796 insertions, 10724 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0000325ab98f..b43eb7e384eb 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -17,7 +17,6 @@ CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
CFLAGS_REMOVE_head32.o = -pg
-CFLAGS_REMOVE_sev.o = -pg
CFLAGS_REMOVE_rethook.o = -pg
endif
@@ -26,7 +25,6 @@ KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n
-KASAN_SANITIZE_sev.o := n
# With some compiler versions the generated code results in boot hangs, caused
# by several compilation units. To be safe, disable all instrumentation.
@@ -37,9 +35,16 @@ KMSAN_SANITIZE_nmi.o := n
# If instrumentation of the following files is enabled, boot hangs during
# first second.
KCOV_INSTRUMENT_head$(BITS).o := n
-KCOV_INSTRUMENT_sev.o := n
+# These are called from save_stack_trace() on debug paths,
+# and produce large amounts of uninteresting coverage.
+KCOV_INSTRUMENT_stacktrace.o := n
+KCOV_INSTRUMENT_dumpstack.o := n
+KCOV_INSTRUMENT_dumpstack_$(BITS).o := n
+KCOV_INSTRUMENT_unwind_orc.o := n
+KCOV_INSTRUMENT_unwind_frame.o := n
+KCOV_INSTRUMENT_unwind_guess.o := n
-CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace
+CFLAGS_irq.o := -I $(src)/../include/asm/trace
obj-y += head_$(BITS).o
obj-y += head$(BITS).o
@@ -48,6 +53,7 @@ obj-y += platform-quirks.o
obj-y += process_$(BITS).o signal.o signal_$(BITS).o
obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_X86_FRED) += fred.o
obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
obj-$(CONFIG_X86_KERNEL_IBT) += ibt_selftest.o
obj-y += setup.o x86_init.o i8259.o irqinit.o
@@ -60,7 +66,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
-obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
+obj-y += pci-dma.o quirks.o kdebugfs.o
obj-y += alternative.o i8253.o hw_breakpoint.o
obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
obj-y += resource.o
@@ -98,11 +104,11 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_RETHOOK) += rethook.o
-obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o
+obj-$(CONFIG_VMCORE_INFO) += vmcore_info_$(BITS).o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o
obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
-obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o crash.o
obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_X86_32) += doublefault_32.o
@@ -113,6 +119,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_AMD_NB) += amd_nb.o
+obj-$(CONFIG_AMD_NODE) += amd_node.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
@@ -140,8 +147,6 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
-obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o
-
obj-$(CONFIG_CFI_CLANG) += cfi.o
obj-$(CONFIG_CALL_THUNKS) += callthunks.o
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index fc17b3f136fe..842a5f449404 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o
+obj-$(CONFIG_ACPI_MADT_WAKEUP) += madt_wakeup.o madt_playdead.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 85a3ce2a3666..dae6a73be40e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -67,13 +67,6 @@ static bool has_lapic_cpus __initdata;
static bool acpi_support_online_capable;
#endif
-#ifdef CONFIG_X86_64
-/* Physical address of the Multiprocessor Wakeup Structure mailbox */
-static u64 acpi_mp_wake_mailbox_paddr;
-/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
-static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
-#endif
-
#ifdef CONFIG_X86_IO_APIC
/*
* Locks related to IOAPIC hotplug
@@ -164,35 +157,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
return 0;
}
-/**
- * acpi_register_lapic - register a local apic and generates a logic cpu number
- * @id: local apic id to register
- * @acpiid: ACPI id to register
- * @enabled: this cpu is enabled or not
- *
- * Returns the logic cpu number which maps to the local apic
- */
-static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
-{
- int cpu;
-
- if (id >= MAX_LOCAL_APIC) {
- pr_info("skipped apicid that is too big\n");
- return -EINVAL;
- }
-
- if (!enabled) {
- ++disabled_cpus;
- return -EINVAL;
- }
-
- cpu = generic_processor_info(id);
- if (cpu >= 0)
- early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid;
-
- return cpu;
-}
-
static bool __init acpi_is_processor_usable(u32 lapic_flags)
{
if (lapic_flags & ACPI_MADT_ENABLED)
@@ -254,7 +218,7 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
return 0;
}
- acpi_register_lapic(apic_id, processor->uid, enabled);
+ topology_register_apic(apic_id, processor->uid, enabled);
#else
pr_warn("x2apic entry ignored\n");
#endif
@@ -263,6 +227,28 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end)
}
static int __init
+acpi_check_lapic(union acpi_subtable_headers *header, const unsigned long end)
+{
+ struct acpi_madt_local_apic *processor = NULL;
+
+ processor = (struct acpi_madt_local_apic *)header;
+
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+ /* Ignore invalid ID */
+ if (processor->id == 0xff)
+ return 0;
+
+ /* Ignore processors that can not be onlined */
+ if (!acpi_is_processor_usable(processor->lapic_flags))
+ return 0;
+
+ has_lapic_cpus = true;
+ return 0;
+}
+
+static int __init
acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
{
struct acpi_madt_local_apic *processor = NULL;
@@ -289,11 +275,10 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end)
* to not preallocating memory for all NR_CPUS
* when we use CPU hotplug.
*/
- acpi_register_lapic(processor->id, /* APIC ID */
- processor->processor_id, /* ACPI ID */
- processor->lapic_flags & ACPI_MADT_ENABLED);
+ topology_register_apic(processor->id, /* APIC ID */
+ processor->processor_id, /* ACPI ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
- has_lapic_cpus = true;
return 0;
}
@@ -309,9 +294,9 @@ acpi_parse_sapic(union acpi_subtable_headers *header, const unsigned long end)
acpi_table_print_madt_entry(&header->common);
- acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */
- processor->processor_id, /* ACPI ID */
- processor->lapic_flags & ACPI_MADT_ENABLED);
+ topology_register_apic((processor->id << 8) | processor->eid,/* APIC ID */
+ processor->processor_id, /* ACPI ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
return 0;
}
@@ -370,60 +355,6 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
return 0;
}
-
-#ifdef CONFIG_X86_64
-static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
-{
- /*
- * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
- *
- * Wakeup of secondary CPUs is fully serialized in the core code.
- * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
- */
- if (!acpi_mp_wake_mailbox) {
- acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
- sizeof(*acpi_mp_wake_mailbox),
- MEMREMAP_WB);
- }
-
- /*
- * Mailbox memory is shared between the firmware and OS. Firmware will
- * listen on mailbox command address, and once it receives the wakeup
- * command, the CPU associated with the given apicid will be booted.
- *
- * The value of 'apic_id' and 'wakeup_vector' must be visible to the
- * firmware before the wakeup command is visible. smp_store_release()
- * ensures ordering and visibility.
- */
- acpi_mp_wake_mailbox->apic_id = apicid;
- acpi_mp_wake_mailbox->wakeup_vector = start_ip;
- smp_store_release(&acpi_mp_wake_mailbox->command,
- ACPI_MP_WAKE_COMMAND_WAKEUP);
-
- /*
- * Wait for the CPU to wake up.
- *
- * The CPU being woken up is essentially in a spin loop waiting to be
- * woken up. It should not take long for it wake up and acknowledge by
- * zeroing out ->command.
- *
- * ACPI specification doesn't provide any guidance on how long kernel
- * has to wait for a wake up acknowledgement. It also doesn't provide
- * a way to cancel a wake up request if it takes too long.
- *
- * In TDX environment, the VMM has control over how long it takes to
- * wake up secondary. It can postpone scheduling secondary vCPU
- * indefinitely. Giving up on wake up request and reporting error opens
- * possible attack vector for VMM: it can wake up a secondary CPU when
- * kernel doesn't expect it. Wait until positive result of the wake up
- * request.
- */
- while (READ_ONCE(acpi_mp_wake_mailbox->command))
- cpu_relax();
-
- return 0;
-}
-#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
@@ -844,12 +775,10 @@ static int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
return 0;
}
-int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id,
- int *pcpu)
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu)
{
- int cpu;
+ int cpu = topology_hotplug_apic(physid, acpi_id);
- cpu = acpi_register_lapic(physid, acpi_id, ACPI_MADT_ENABLED);
if (cpu < 0) {
pr_info("Unable to map lapic to logical cpu number\n");
return cpu;
@@ -868,15 +797,11 @@ int acpi_unmap_cpu(int cpu)
#ifdef CONFIG_ACPI_NUMA
set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
#endif
-
- per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
- set_cpu_present(cpu, false);
- num_processors--;
-
- return (0);
+ topology_hotunplug_apic(cpu);
+ return 0;
}
EXPORT_SYMBOL(acpi_unmap_cpu);
-#endif /* CONFIG_ACPI_HOTPLUG_CPU */
+#endif /* CONFIG_ACPI_HOTPLUG_CPU */
int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
{
@@ -1007,11 +932,8 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
* the resource tree during the lateinit timeframe.
*/
#define HPET_RESOURCE_NAME_SIZE 9
- hpet_res = memblock_alloc(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE,
+ hpet_res = memblock_alloc_or_panic(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE,
SMP_CACHE_BYTES);
- if (!hpet_res)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
hpet_res->name = (void *)&hpet_res[1];
hpet_res->flags = IORESOURCE_MEM;
@@ -1125,6 +1047,8 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
static int __init acpi_parse_madt_lapic_entries(void)
{
int count, x2count = 0;
+ struct acpi_subtable_proc madt_proc[2];
+ int ret;
if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
@@ -1133,10 +1057,27 @@ static int __init acpi_parse_madt_lapic_entries(void)
acpi_parse_sapic, MAX_LOCAL_APIC);
if (!count) {
- count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
- acpi_parse_lapic, MAX_LOCAL_APIC);
- x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
- acpi_parse_x2apic, MAX_LOCAL_APIC);
+ /* Check if there are valid LAPIC entries */
+ acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_check_lapic, MAX_LOCAL_APIC);
+
+ /*
+ * Enumerate the APIC IDs in the order that they appear in the
+ * MADT, no matter LAPIC entry or x2APIC entry is used.
+ */
+ memset(madt_proc, 0, sizeof(madt_proc));
+ madt_proc[0].id = ACPI_MADT_TYPE_LOCAL_APIC;
+ madt_proc[0].handler = acpi_parse_lapic;
+ madt_proc[1].id = ACPI_MADT_TYPE_LOCAL_X2APIC;
+ madt_proc[1].handler = acpi_parse_x2apic;
+ ret = acpi_table_parse_entries_array(ACPI_SIG_MADT,
+ sizeof(struct acpi_table_madt),
+ madt_proc, ARRAY_SIZE(madt_proc), MAX_LOCAL_APIC);
+ if (ret < 0) {
+ pr_err("Error parsing LAPIC/X2APIC entries\n");
+ return ret;
+ }
+ count = madt_proc[0].count;
+ x2count = madt_proc[1].count;
}
if (!count && !x2count) {
pr_err("No LAPIC entries present\n");
@@ -1159,29 +1100,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
}
return 0;
}
-
-#ifdef CONFIG_X86_64
-static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
- const unsigned long end)
-{
- struct acpi_madt_multiproc_wakeup *mp_wake;
-
- if (!IS_ENABLED(CONFIG_SMP))
- return -ENODEV;
-
- mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
- if (BAD_MADT_ENTRY(mp_wake, end))
- return -EINVAL;
-
- acpi_table_print_madt_entry(&header->common);
-
- acpi_mp_wake_mailbox_paddr = mp_wake->base_address;
-
- apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
-
- return 0;
-}
-#endif /* CONFIG_X86_64 */
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
@@ -1290,7 +1208,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
}
count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
- acpi_parse_int_src_ovr, nr_irqs);
+ acpi_parse_int_src_ovr,
+ irq_get_nr_irqs());
if (count < 0) {
pr_err("Error parsing interrupt source overrides entry\n");
/* TBD: Cleanup to allow fallback to MPS */
@@ -1310,7 +1229,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
mp_config_acpi_legacy_irqs();
count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE,
- acpi_parse_nmi_src, nr_irqs);
+ acpi_parse_nmi_src,
+ irq_get_nr_irqs());
if (count < 0) {
pr_err("Error parsing NMI SRC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
@@ -1378,7 +1298,7 @@ static void __init acpi_process_madt(void)
smp_found_config = 1;
}
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_ACPI_MADT_WAKEUP
/*
* Parse MADT MP Wake entry.
*/
@@ -1897,3 +1817,14 @@ u64 x86_default_get_root_pointer(void)
{
return boot_params.acpi_rsdp_addr;
}
+
+#ifdef CONFIG_XEN_PV
+void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
+{
+ return ioremap_cache(phys, size);
+}
+
+void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, acpi_size size) =
+ x86_acpi_os_ioremap;
+EXPORT_SYMBOL_GPL(acpi_os_ioremap);
+#endif
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index 8d8752b44f11..d745dd586303 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -9,6 +9,17 @@
#include <asm/processor.h>
#include <asm/topology.h>
+#define CPPC_HIGHEST_PERF_PERFORMANCE 196
+#define CPPC_HIGHEST_PERF_PREFCORE 166
+
+enum amd_pref_core {
+ AMD_PREF_CORE_UNKNOWN = 0,
+ AMD_PREF_CORE_SUPPORTED,
+ AMD_PREF_CORE_UNSUPPORTED,
+};
+static enum amd_pref_core amd_pref_core_detected;
+static u64 boost_numerator;
+
/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
bool cpc_supported_by_cpu(void)
@@ -20,7 +31,7 @@ bool cpc_supported_by_cpu(void)
(boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f)))
return true;
else if (boot_cpu_data.x86 == 0x17 &&
- boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f)
+ boot_cpu_data.x86_model >= 0x30 && boot_cpu_data.x86_model <= 0x7f)
return true;
return boot_cpu_has(X86_FEATURE_CPPC);
}
@@ -69,38 +80,37 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
static void amd_set_max_freq_ratio(void)
{
struct cppc_perf_caps perf_caps;
- u64 highest_perf, nominal_perf;
+ u64 numerator, nominal_perf;
u64 perf_ratio;
int rc;
rc = cppc_get_perf_caps(0, &perf_caps);
if (rc) {
- pr_debug("Could not retrieve perf counters (%d)\n", rc);
+ pr_warn("Could not retrieve perf counters (%d)\n", rc);
return;
}
- highest_perf = amd_get_highest_perf();
+ rc = amd_get_boost_ratio_numerator(0, &numerator);
+ if (rc) {
+ pr_warn("Could not retrieve highest performance (%d)\n", rc);
+ return;
+ }
nominal_perf = perf_caps.nominal_perf;
- if (!highest_perf || !nominal_perf) {
- pr_debug("Could not retrieve highest or nominal performance\n");
+ if (!nominal_perf) {
+ pr_warn("Could not retrieve nominal performance\n");
return;
}
- perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
/* midpoint between max_boost and max_P */
- perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
- if (!perf_ratio) {
- pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
- return;
- }
+ perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1;
freq_invariance_set_perf_ratio(perf_ratio, false);
}
static DEFINE_MUTEX(freq_invariance_lock);
-void init_freq_invariance_cppc(void)
+static inline void init_freq_invariance_cppc(void)
{
static bool init_done;
@@ -116,3 +126,171 @@ void init_freq_invariance_cppc(void)
init_done = true;
mutex_unlock(&freq_invariance_lock);
}
+
+void acpi_processor_init_invariance_cppc(void)
+{
+ init_freq_invariance_cppc();
+}
+
+/*
+ * Get the highest performance register value.
+ * @cpu: CPU from which to get highest performance.
+ * @highest_perf: Return address for highest performance value.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
+{
+ u64 val;
+ int ret;
+
+ if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val);
+ if (ret)
+ goto out;
+
+ val = AMD_CPPC_HIGHEST_PERF(val);
+ } else {
+ ret = cppc_get_highest_perf(cpu, &val);
+ if (ret)
+ goto out;
+ }
+
+ WRITE_ONCE(*highest_perf, (u32)val);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+
+/**
+ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores
+ * @detected: Output variable for the result of the detection.
+ *
+ * Determine whether CPUs in the system support preferred cores. On systems
+ * that support preferred cores, different highest perf values will be found
+ * on different cores. On other systems, the highest perf value will be the
+ * same on all cores.
+ *
+ * The result of the detection will be stored in the 'detected' parameter.
+ *
+ * Return: 0 for success, negative error code otherwise
+ */
+int amd_detect_prefcore(bool *detected)
+{
+ int cpu, count = 0;
+ u64 highest_perf[2] = {0};
+
+ if (WARN_ON(!detected))
+ return -EINVAL;
+
+ switch (amd_pref_core_detected) {
+ case AMD_PREF_CORE_SUPPORTED:
+ *detected = true;
+ return 0;
+ case AMD_PREF_CORE_UNSUPPORTED:
+ *detected = false;
+ return 0;
+ default:
+ break;
+ }
+
+ for_each_present_cpu(cpu) {
+ u32 tmp;
+ int ret;
+
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+
+ if (!count || (count == 1 && tmp != highest_perf[0]))
+ highest_perf[count++] = tmp;
+
+ if (count == 2)
+ break;
+ }
+
+ *detected = (count == 2);
+ boost_numerator = highest_perf[0];
+
+ amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED :
+ AMD_PREF_CORE_UNSUPPORTED;
+
+ pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n",
+ *detected ? "" : "un", highest_perf[0]);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_detect_prefcore);
+
+/**
+ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
+ * @cpu: CPU to get numerator for.
+ * @numerator: Output variable for numerator.
+ *
+ * Determine the numerator to use for calculating the boost ratio on
+ * a CPU. On systems that support preferred cores, this will be a hardcoded
+ * value. On other systems this will the highest performance register value.
+ *
+ * If booting the system with amd-pstate enabled but preferred cores disabled then
+ * the correct boost numerator will be returned to match hardware capabilities
+ * even if the preferred cores scheduling hints are not enabled.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
+{
+ enum x86_topology_cpu_type core_type = get_topology_cpu_type(&cpu_data(cpu));
+ bool prefcore;
+ int ret;
+ u32 tmp;
+
+ ret = amd_detect_prefcore(&prefcore);
+ if (ret)
+ return ret;
+
+ /* without preferred cores, return the highest perf register value */
+ if (!prefcore) {
+ *numerator = boost_numerator;
+ return 0;
+ }
+
+ /*
+ * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
+ * the highest performance level is set to 196.
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218759
+ */
+ if (cpu_feature_enabled(X86_FEATURE_ZEN4)) {
+ switch (boot_cpu_data.x86_model) {
+ case 0x70 ... 0x7f:
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ default:
+ break;
+ }
+ }
+
+ /* detect if running on heterogeneous design */
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) {
+ switch (core_type) {
+ case TOPO_CPU_TYPE_UNKNOWN:
+ pr_warn("Undefined core type found for cpu %d\n", cpu);
+ break;
+ case TOPO_CPU_TYPE_PERFORMANCE:
+ /* use the max scale for performance cores */
+ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+ return 0;
+ case TOPO_CPU_TYPE_EFFICIENCY:
+ /* use the highest perf value for efficiency cores */
+ ret = amd_get_highest_perf(cpu, &tmp);
+ if (ret)
+ return ret;
+ *numerator = tmp;
+ return 0;
+ }
+ }
+
+ *numerator = CPPC_HIGHEST_PERF_PREFCORE;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator);
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 401808b47af3..5854f0b8f0f1 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,6 +13,7 @@
#include <linux/sched.h>
#include <acpi/processor.h>
+#include <asm/cpuid.h>
#include <asm/mwait.h>
#include <asm/special_insns.h>
@@ -128,11 +129,11 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
unsigned int cstate_type; /* C-state type and not ACPI C-state type */
unsigned int num_cstate_subtype;
- cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+ cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);
/* Check whether this particular cx_type (in CST) is supported or not */
- cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) &
- MWAIT_CSTATE_MASK) + 1;
+ cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) &
+ MWAIT_CSTATE_MASK) + 1) & MWAIT_CSTATE_MASK;
edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
@@ -172,7 +173,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
struct cpuinfo_x86 *c = &cpu_data(cpu);
long retval;
- if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
+ if (!cpu_cstate_entry || c->cpuid_level < CPUID_LEAF_MWAIT)
return -1;
if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)
diff --git a/arch/x86/kernel/acpi/madt_playdead.S b/arch/x86/kernel/acpi/madt_playdead.S
new file mode 100644
index 000000000000..4e498d28cdc8
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_playdead.S
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/linkage.h>
+#include <asm/nospec-branch.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+
+ .text
+ .align PAGE_SIZE
+
+/*
+ * asm_acpi_mp_play_dead() - Hand over control of the CPU to the BIOS
+ *
+ * rdi: Address of the ACPI MADT MPWK ResetVector
+ * rsi: PGD of the identity mapping
+ */
+SYM_FUNC_START(asm_acpi_mp_play_dead)
+ /* Turn off global entries. Following CR3 write will flush them. */
+ movq %cr4, %rdx
+ andq $~(X86_CR4_PGE), %rdx
+ movq %rdx, %cr4
+
+ /* Switch to identity mapping */
+ movq %rsi, %cr3
+
+ /* Jump to reset vector */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rdi
+SYM_FUNC_END(asm_acpi_mp_play_dead)
diff --git a/arch/x86/kernel/acpi/madt_wakeup.c b/arch/x86/kernel/acpi/madt_wakeup.c
new file mode 100644
index 000000000000..d5ef6215583b
--- /dev/null
+++ b/arch/x86/kernel/acpi/madt_wakeup.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/acpi.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+#include <linux/pgtable.h>
+#include <linux/sched/hotplug.h>
+#include <asm/apic.h>
+#include <asm/barrier.h>
+#include <asm/init.h>
+#include <asm/intel_pt.h>
+#include <asm/nmi.h>
+#include <asm/processor.h>
+#include <asm/reboot.h>
+
+/* Physical address of the Multiprocessor Wakeup Structure mailbox */
+static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
+
+/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
+
+static u64 acpi_mp_pgd __ro_after_init;
+static u64 acpi_mp_reset_vector_paddr __ro_after_init;
+
+static void acpi_mp_stop_this_cpu(void)
+{
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_play_dead(void)
+{
+ play_dead_common();
+ asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
+}
+
+static void acpi_mp_cpu_die(unsigned int cpu)
+{
+ u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
+ unsigned long timeout;
+
+ /*
+ * Use TEST mailbox command to prove that BIOS got control over
+ * the CPU before declaring it dead.
+ *
+ * BIOS has to clear 'command' field of the mailbox.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_TEST);
+
+ /* Don't wait longer than a second. */
+ timeout = USEC_PER_SEC;
+ while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
+ udelay(1);
+
+ if (!timeout)
+ pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
+}
+
+/* The argument is required to match type of x86_mapping_info::alloc_pgt_page */
+static void __init *alloc_pgt_page(void *dummy)
+{
+ return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static void __init free_pgt_page(void *pgt, void *dummy)
+{
+ return memblock_free(pgt, PAGE_SIZE);
+}
+
+/*
+ * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
+ * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
+ * to the identity mapping and the function has be present at the same spot in
+ * the virtual address space before and after switching page tables.
+ */
+static int __init init_transition_pgtable(pgd_t *pgd)
+{
+ pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
+ unsigned long vaddr, paddr;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ vaddr = (unsigned long)asm_acpi_mp_play_dead;
+ pgd += pgd_index(vaddr);
+ if (!pgd_present(*pgd)) {
+ p4d = (p4d_t *)alloc_pgt_page(NULL);
+ if (!p4d)
+ return -ENOMEM;
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+ p4d = p4d_offset(pgd, vaddr);
+ if (!p4d_present(*p4d)) {
+ pud = (pud_t *)alloc_pgt_page(NULL);
+ if (!pud)
+ return -ENOMEM;
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ }
+ pud = pud_offset(p4d, vaddr);
+ if (!pud_present(*pud)) {
+ pmd = (pmd_t *)alloc_pgt_page(NULL);
+ if (!pmd)
+ return -ENOMEM;
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (!pmd_present(*pmd)) {
+ pte = (pte_t *)alloc_pgt_page(NULL);
+ if (!pte)
+ return -ENOMEM;
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+
+ paddr = __pa(vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+
+ return 0;
+}
+
+static int __init acpi_mp_setup_reset(u64 reset_vector)
+{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+ .free_pgt_page = free_pgt_page,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
+ };
+ pgd_t *pgd;
+
+ pgd = alloc_pgt_page(NULL);
+ if (!pgd)
+ return -ENOMEM;
+
+ for (int i = 0; i < nr_pfn_mapped; i++) {
+ unsigned long mstart, mend;
+
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+ }
+
+ if (kernel_ident_mapping_init(&info, pgd,
+ PAGE_ALIGN_DOWN(reset_vector),
+ PAGE_ALIGN(reset_vector + 1))) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ if (init_transition_pgtable(pgd)) {
+ kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+
+ smp_ops.play_dead = acpi_mp_play_dead;
+ smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
+ smp_ops.cpu_die = acpi_mp_cpu_die;
+
+ acpi_mp_reset_vector_paddr = reset_vector;
+ acpi_mp_pgd = __pa(pgd);
+
+ return 0;
+}
+
+static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
+{
+ if (!acpi_mp_wake_mailbox_paddr) {
+ pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
+ *
+ * Wakeup of secondary CPUs is fully serialized in the core code.
+ * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
+ */
+ if (!acpi_mp_wake_mailbox) {
+ acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
+ sizeof(*acpi_mp_wake_mailbox),
+ MEMREMAP_WB);
+ }
+
+ /*
+ * Mailbox memory is shared between the firmware and OS. Firmware will
+ * listen on mailbox command address, and once it receives the wakeup
+ * command, the CPU associated with the given apicid will be booted.
+ *
+ * The value of 'apic_id' and 'wakeup_vector' must be visible to the
+ * firmware before the wakeup command is visible. smp_store_release()
+ * ensures ordering and visibility.
+ */
+ acpi_mp_wake_mailbox->apic_id = apicid;
+ acpi_mp_wake_mailbox->wakeup_vector = start_ip;
+ smp_store_release(&acpi_mp_wake_mailbox->command,
+ ACPI_MP_WAKE_COMMAND_WAKEUP);
+
+ /*
+ * Wait for the CPU to wake up.
+ *
+ * The CPU being woken up is essentially in a spin loop waiting to be
+ * woken up. It should not take long for it wake up and acknowledge by
+ * zeroing out ->command.
+ *
+ * ACPI specification doesn't provide any guidance on how long kernel
+ * has to wait for a wake up acknowledgment. It also doesn't provide
+ * a way to cancel a wake up request if it takes too long.
+ *
+ * In TDX environment, the VMM has control over how long it takes to
+ * wake up secondary. It can postpone scheduling secondary vCPU
+ * indefinitely. Giving up on wake up request and reporting error opens
+ * possible attack vector for VMM: it can wake up a secondary CPU when
+ * kernel doesn't expect it. Wait until positive result of the wake up
+ * request.
+ */
+ while (READ_ONCE(acpi_mp_wake_mailbox->command))
+ cpu_relax();
+
+ return 0;
+}
+
+static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
+{
+ cpu_hotplug_disable_offlining();
+
+ /*
+ * ACPI MADT doesn't allow to offline a CPU after it was onlined. This
+ * limits kexec: the second kernel won't be able to use more than one CPU.
+ *
+ * To prevent a kexec kernel from onlining secondary CPUs invalidate the
+ * mailbox address in the ACPI MADT wakeup structure which prevents a
+ * kexec kernel to use it.
+ *
+ * This is safe as the booting kernel has the mailbox address cached
+ * already and acpi_wakeup_cpu() uses the cached value to bring up the
+ * secondary CPUs.
+ *
+ * Note: This is a Linux specific convention and not covered by the
+ * ACPI specification.
+ */
+ mp_wake->mailbox_address = 0;
+}
+
+int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
+ const unsigned long end)
+{
+ struct acpi_madt_multiproc_wakeup *mp_wake;
+
+ mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
+
+ /*
+ * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
+ * entry. 'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
+ * than the actual size of the MP wakeup entry in ACPI table because the
+ * 'reset_vector' is only available in the V1 MP wakeup structure.
+ */
+ if (!mp_wake)
+ return -EINVAL;
+ if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
+ return -EINVAL;
+ if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(&header->common);
+
+ acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
+
+ if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
+ mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
+ if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
+ pr_warn("Failed to setup MADT reset vector\n");
+ acpi_mp_disable_offlining(mp_wake);
+ }
+ } else {
+ /*
+ * CPU offlining requires version 1 of the ACPI MADT wakeup
+ * structure.
+ */
+ acpi_mp_disable_offlining(mp_wake);
+ }
+
+ apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
+
+ return 0;
+}
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index d5d8a352eafa..b200a193beeb 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -17,7 +17,7 @@
* Hooray, we are in Long 64-bit mode (but still running in low memory)
*/
SYM_FUNC_START(wakeup_long64)
- movq saved_magic, %rax
+ movq saved_magic(%rip), %rax
movq $0x123456789abcdef0, %rdx
cmpq %rdx, %rax
je 2f
@@ -33,14 +33,14 @@ SYM_FUNC_START(wakeup_long64)
movw %ax, %es
movw %ax, %fs
movw %ax, %gs
- movq saved_rsp, %rsp
+ movq saved_rsp(%rip), %rsp
- movq saved_rbx, %rbx
- movq saved_rdi, %rdi
- movq saved_rsi, %rsi
- movq saved_rbp, %rbp
+ movq saved_rbx(%rip), %rbx
+ movq saved_rdi(%rip), %rdi
+ movq saved_rsi(%rip), %rsi
+ movq saved_rbp(%rip), %rbp
- movq saved_rip, %rax
+ movq saved_rip(%rip), %rax
ANNOTATE_RETPOLINE_SAFE
jmp *%rax
SYM_FUNC_END(wakeup_long64)
@@ -72,11 +72,11 @@ SYM_FUNC_START(do_suspend_lowlevel)
movq $.Lresume_point, saved_rip(%rip)
- movq %rsp, saved_rsp
- movq %rbp, saved_rbp
- movq %rbx, saved_rbx
- movq %rdi, saved_rdi
- movq %rsi, saved_rsi
+ movq %rsp, saved_rsp(%rip)
+ movq %rbp, saved_rbp(%rip)
+ movq %rbx, saved_rbx(%rip)
+ movq %rdi, saved_rdi(%rip)
+ movq %rsi, saved_rsi(%rip)
addq $8, %rsp
movl $3, %edi
@@ -87,6 +87,7 @@ SYM_FUNC_START(do_suspend_lowlevel)
.align 4
.Lresume_point:
+ ANNOTATE_NOENDBR
/* We don't restore %rax, it must be 0 anyway */
movq $saved_context, %rax
movq saved_context_cr4(%rax), %rbx
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index aae7456ece07..c71b575bf229 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -30,6 +30,7 @@
#include <asm/fixmap.h>
#include <asm/paravirt.h>
#include <asm/asm-prototypes.h>
+#include <asm/cfi.h>
int __read_mostly alternatives_patched;
@@ -44,7 +45,7 @@ EXPORT_SYMBOL_GPL(alternatives_patched);
#define DA_ENDBR 0x08
#define DA_SMP 0x10
-static unsigned int __initdata_or_module debug_alternative;
+static unsigned int debug_alternative;
static int __init debug_alt(char *str)
{
@@ -124,6 +125,20 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
};
/*
+ * Nomenclature for variable names to simplify and clarify this code and ease
+ * any potential staring at it:
+ *
+ * @instr: source address of the original instructions in the kernel text as
+ * generated by the compiler.
+ *
+ * @buf: temporary buffer on which the patching operates. This buffer is
+ * eventually text-poked into the kernel image.
+ *
+ * @replacement/@repl: pointer to the opcodes which are replacing @instr, located
+ * in the .altinstr_replacement section.
+ */
+
+/*
* Fill the buffer with a single effective instruction of size @len.
*
* In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
@@ -132,35 +147,34 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
* each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
* *jump* over instead of executing long and daft NOPs.
*/
-static void __init_or_module add_nop(u8 *instr, unsigned int len)
+static void add_nop(u8 *buf, unsigned int len)
{
- u8 *target = instr + len;
+ u8 *target = buf + len;
if (!len)
return;
if (len <= ASM_NOP_MAX) {
- memcpy(instr, x86_nops[len], len);
+ memcpy(buf, x86_nops[len], len);
return;
}
if (len < 128) {
- __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
- instr += JMP8_INSN_SIZE;
+ __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE);
+ buf += JMP8_INSN_SIZE;
} else {
- __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
- instr += JMP32_INSN_SIZE;
+ __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE);
+ buf += JMP32_INSN_SIZE;
}
- for (;instr < target; instr++)
- *instr = INT3_INSN_OPCODE;
+ for (;buf < target; buf++)
+ *buf = INT3_INSN_OPCODE;
}
extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __cfi_sites[], __cfi_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);
@@ -187,12 +201,12 @@ static bool insn_is_nop(struct insn *insn)
* Find the offset of the first non-NOP instruction starting at @offset
* but no further than @len.
*/
-static int skip_nops(u8 *instr, int offset, int len)
+static int skip_nops(u8 *buf, int offset, int len)
{
struct insn insn;
for (; offset < len; offset += insn.length) {
- if (insn_decode_kernel(&insn, &instr[offset]))
+ if (insn_decode_kernel(&insn, &buf[offset]))
break;
if (!insn_is_nop(&insn))
@@ -203,66 +217,32 @@ static int skip_nops(u8 *instr, int offset, int len)
}
/*
- * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
- * to the end of the NOP sequence into a single NOP.
- */
-static bool __init_or_module
-__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
-{
- int i = *next - insn->length;
-
- switch (insn->opcode.bytes[0]) {
- case JMP8_INSN_OPCODE:
- case JMP32_INSN_OPCODE:
- *prev = i;
- *target = *next + insn->immediate.value;
- return false;
- }
-
- if (insn_is_nop(insn)) {
- int nop = i;
-
- *next = skip_nops(instr, *next, len);
- if (*target && *next == *target)
- nop = *prev;
-
- add_nop(instr + nop, *next - nop);
- DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
- return true;
- }
-
- *target = 0;
- return false;
-}
-
-/*
* "noinline" to cause control flow change and thus invalidate I$ and
* cause refetch after modification.
*/
-static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
+static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len)
{
- int prev, target = 0;
-
for (int next, i = 0; i < len; i = next) {
struct insn insn;
- if (insn_decode_kernel(&insn, &instr[i]))
+ if (insn_decode_kernel(&insn, &buf[i]))
return;
next = i + insn.length;
- __optimize_nops(instr, len, &insn, &next, &prev, &target);
- }
-}
+ if (insn_is_nop(&insn)) {
+ int nop = i;
-static void __init_or_module noinline optimize_nops_inplace(u8 *instr, size_t len)
-{
- unsigned long flags;
+ /* Has the NOP already been optimized? */
+ if (i + insn.length == len)
+ return;
- local_irq_save(flags);
- optimize_nops(instr, len);
- sync_core();
- local_irq_restore(flags);
+ next = skip_nops(buf, next, len);
+
+ add_nop(buf + nop, next - nop);
+ DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next);
+ }
+ }
}
/*
@@ -335,12 +315,9 @@ bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
return (target < src || target > src + src_len);
}
-static void __init_or_module noinline
-apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
{
- int prev, target = 0;
-
- for (int next, i = 0; i < len; i = next) {
+ for (int next, i = 0; i < instrlen; i = next) {
struct insn insn;
if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
@@ -348,9 +325,6 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
next = i + insn.length;
- if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
- continue;
-
switch (insn.opcode.bytes[0]) {
case 0x0f:
if (insn.opcode.bytes[1] < 0x80 ||
@@ -362,10 +336,10 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
case JMP8_INSN_OPCODE:
case JMP32_INSN_OPCODE:
case CALL_INSN_OPCODE:
- if (need_reloc(next + insn.immediate.value, src, src_len)) {
+ if (need_reloc(next + insn.immediate.value, repl, repl_len)) {
apply_reloc(insn.immediate.nbytes,
buf + i + insn_offset_immediate(&insn),
- src - dest);
+ repl - instr);
}
/*
@@ -373,7 +347,7 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
*/
if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
s32 imm = insn.immediate.value;
- imm += src - dest;
+ imm += repl - instr;
imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
if ((imm >> 31) == (imm >> 7)) {
buf[i+0] = JMP8_INSN_OPCODE;
@@ -386,15 +360,85 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
}
if (insn_rip_relative(&insn)) {
- if (need_reloc(next + insn.displacement.value, src, src_len)) {
+ if (need_reloc(next + insn.displacement.value, repl, repl_len)) {
apply_reloc(insn.displacement.nbytes,
buf + i + insn_offset_displacement(&insn),
- src - dest);
+ repl - instr);
}
}
}
}
+void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len)
+{
+ __apply_relocation(buf, instr, instrlen, repl, repl_len);
+ optimize_nops(instr, buf, instrlen);
+}
+
+/* Low-level backend functions usable from alternative code replacements. */
+DEFINE_ASM_FUNC(nop_func, "", .entry.text);
+EXPORT_SYMBOL_GPL(nop_func);
+
+noinstr void BUG_func(void)
+{
+ BUG();
+}
+EXPORT_SYMBOL(BUG_func);
+
+#define CALL_RIP_REL_OPCODE 0xff
+#define CALL_RIP_REL_MODRM 0x15
+
+/*
+ * Rewrite the "call BUG_func" replacement to point to the target of the
+ * indirect pv_ops call "call *disp(%ip)".
+ */
+static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a,
+ struct module *mod)
+{
+ u8 *wr_instr = module_writable_address(mod, instr);
+ void *target, *bug = &BUG_func;
+ s32 disp;
+
+ if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n");
+ BUG();
+ }
+
+ if (a->instrlen != 6 ||
+ wr_instr[0] != CALL_RIP_REL_OPCODE ||
+ wr_instr[1] != CALL_RIP_REL_MODRM) {
+ pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n");
+ BUG();
+ }
+
+ /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */
+ disp = *(s32 *)(wr_instr + 2);
+#ifdef CONFIG_X86_64
+ /* ff 15 00 00 00 00 call *0x0(%rip) */
+ /* target address is stored at "next instruction + disp". */
+ target = *(void **)(instr + a->instrlen + disp);
+#else
+ /* ff 15 00 00 00 00 call *0x0 */
+ /* target address is stored at disp. */
+ target = *(void **)disp;
+#endif
+ if (!target)
+ target = bug;
+
+ /* (BUG_func - .) + (target - BUG_func) := target - . */
+ *(s32 *)(insn_buff + 1) += target - bug;
+
+ if (target == &nop_func)
+ return 0;
+
+ return 5;
+}
+
+static inline u8 * instr_va(struct alt_instr *i)
+{
+ return (u8 *)&i->instr_offset + i->instr_offset;
+}
+
/*
* Replace instructions with better alternatives for this CPU type. This runs
* before SMP is initialized to avoid SMP problems with self modifying code.
@@ -406,11 +450,12 @@ apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
* to refetch changed I$ lines.
*/
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
- struct alt_instr *end)
+ struct alt_instr *end,
+ struct module *mod)
{
- struct alt_instr *a;
- u8 *instr, *replacement;
u8 insn_buff[MAX_PATCH_LEN];
+ u8 *instr, *replacement;
+ struct alt_instr *a, *b;
DPRINTK(ALT, "alt table %px, -> %px", start, end);
@@ -435,9 +480,25 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
*/
for (a = start; a < end; a++) {
int insn_buff_sz = 0;
+ u8 *wr_instr, *wr_replacement;
+
+ /*
+ * In case of nested ALTERNATIVE()s the outer alternative might
+ * add more padding. To ensure consistent patching find the max
+ * padding for all alt_instr entries for this site (nested
+ * alternatives result in consecutive entries).
+ */
+ for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) {
+ u8 len = max(a->instrlen, b->instrlen);
+ a->instrlen = b->instrlen = len;
+ }
+
+ instr = instr_va(a);
+ wr_instr = module_writable_address(mod, instr);
- instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
+ wr_replacement = module_writable_address(mod, replacement);
+
BUG_ON(a->instrlen > sizeof(insn_buff));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
@@ -448,30 +509,38 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
* patch if feature is *NOT* present.
*/
if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
- optimize_nops_inplace(instr, a->instrlen);
+ memcpy(insn_buff, wr_instr, a->instrlen);
+ optimize_nops(instr, insn_buff, a->instrlen);
+ text_poke_early(wr_instr, insn_buff, a->instrlen);
continue;
}
- DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
- (a->flags & ALT_FLAG_NOT) ? "!" : "",
+ DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
a->cpuid >> 5,
a->cpuid & 0x1f,
instr, instr, a->instrlen,
- replacement, a->replacementlen);
+ replacement, a->replacementlen, a->flags);
- memcpy(insn_buff, replacement, a->replacementlen);
+ memcpy(insn_buff, wr_replacement, a->replacementlen);
insn_buff_sz = a->replacementlen;
+ if (a->flags & ALT_FLAG_DIRECT_CALL) {
+ insn_buff_sz = alt_replace_call(instr, insn_buff, a,
+ mod);
+ if (insn_buff_sz < 0)
+ continue;
+ }
+
for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
insn_buff[insn_buff_sz] = 0x90;
- apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
+ apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen);
- DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr);
+ DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
- text_poke_early(instr, insn_buff, insn_buff_sz);
+ text_poke_early(wr_instr, insn_buff, insn_buff_sz);
}
kasan_enable_current();
@@ -483,7 +552,7 @@ static inline bool is_jcc32(struct insn *insn)
return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
}
-#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
+#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL)
/*
* CALL/JMP *%\reg
@@ -647,8 +716,8 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
/*
* The compiler is supposed to EMIT an INT3 after every unconditional
* JMP instruction due to AMD BTC. However, if the compiler is too old
- * or SLS isn't enabled, we still need an INT3 after indirect JMPs
- * even on Intel.
+ * or MITIGATION_SLS isn't enabled, we still need an INT3 after
+ * indirect JMPs even on Intel.
*/
if (op == JMP32_INSN_OPCODE && i < insn->length)
bytes[i++] = INT3_INSN_OPCODE;
@@ -662,18 +731,20 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
/*
* Generated by 'objtool --retpoline'.
*/
-void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
+ struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr = module_writable_address(mod, addr);
struct insn insn;
int len, ret;
u8 bytes[16];
u8 op1, op2;
- ret = insn_decode_kernel(&insn, addr);
+ ret = insn_decode_kernel(&insn, wr_addr);
if (WARN_ON_ONCE(ret < 0))
continue;
@@ -700,15 +771,15 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
len = patch_retpoline(addr, &insn, bytes);
if (len == insn.length) {
- optimize_nops(bytes, len);
- DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr);
+ optimize_nops(addr, bytes, len);
+ DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr);
DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
- text_poke_early(addr, bytes, len);
+ text_poke_early(wr_addr, bytes, len);
}
}
}
-#ifdef CONFIG_RETHUNK
+#ifdef CONFIG_MITIGATION_RETHUNK
/*
* Rewrite the compiler generated return thunk tail-calls.
@@ -739,7 +810,8 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes)
return i;
}
-void __init_or_module noinline apply_returns(s32 *start, s32 *end)
+void __init_or_module noinline apply_returns(s32 *start, s32 *end,
+ struct module *mod)
{
s32 *s;
@@ -748,12 +820,13 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
for (s = start; s < end; s++) {
void *dest = NULL, *addr = (void *)s + *s;
+ void *wr_addr = module_writable_address(mod, addr);
struct insn insn;
int len, ret;
u8 bytes[16];
u8 op;
- ret = insn_decode_kernel(&insn, addr);
+ ret = insn_decode_kernel(&insn, wr_addr);
if (WARN_ON_ONCE(ret < 0))
continue;
@@ -773,32 +846,35 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
len = patch_return(addr, &insn, bytes);
if (len == insn.length) {
- DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr);
+ DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr);
DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
- text_poke_early(addr, bytes, len);
+ text_poke_early(wr_addr, bytes, len);
}
}
}
#else
-void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
-#endif /* CONFIG_RETHUNK */
+void __init_or_module noinline apply_returns(s32 *start, s32 *end,
+ struct module *mod) { }
+#endif /* CONFIG_MITIGATION_RETHUNK */
-#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
+#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */
-void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
-void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
+void __init_or_module noinline apply_retpolines(s32 *start, s32 *end,
+ struct module *mod) { }
+void __init_or_module noinline apply_returns(s32 *start, s32 *end,
+ struct module *mod) { }
-#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
+#endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */
#ifdef CONFIG_X86_KERNEL_IBT
-static void poison_cfi(void *addr);
+static void poison_cfi(void *addr, void *wr_addr);
-static void __init_or_module poison_endbr(void *addr, bool warn)
+static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn)
{
u32 endbr, poison = gen_endbr_poison();
- if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
+ if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr)))
return;
if (!is_endbr(endbr)) {
@@ -813,7 +889,7 @@ static void __init_or_module poison_endbr(void *addr, bool warn)
*/
DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
- text_poke_early(addr, &poison, 4);
+ text_poke_early(wr_addr, &poison, 4);
}
/*
@@ -822,35 +898,103 @@ static void __init_or_module poison_endbr(void *addr, bool warn)
* Seal the functions for indirect calls by clobbering the ENDBR instructions
* and the kCFI hash value.
*/
-void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end)
+void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr = module_writable_address(mod, addr);
- poison_endbr(addr, true);
+ poison_endbr(addr, wr_addr, true);
if (IS_ENABLED(CONFIG_FINEIBT))
- poison_cfi(addr - 16);
+ poison_cfi(addr - 16, wr_addr - 16);
}
}
#else
-void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { }
+void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { }
#endif /* CONFIG_X86_KERNEL_IBT */
-#ifdef CONFIG_FINEIBT
+#ifdef CONFIG_CFI_AUTO_DEFAULT
+#define __CFI_DEFAULT CFI_AUTO
+#elif defined(CONFIG_CFI_CLANG)
+#define __CFI_DEFAULT CFI_KCFI
+#else
+#define __CFI_DEFAULT CFI_OFF
+#endif
-enum cfi_mode {
- CFI_DEFAULT,
- CFI_OFF,
- CFI_KCFI,
- CFI_FINEIBT,
-};
+enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT;
+
+#ifdef CONFIG_CFI_CLANG
+struct bpf_insn;
+
+/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */
+extern unsigned int __bpf_prog_runX(const void *ctx,
+ const struct bpf_insn *insn);
+
+/*
+ * Force a reference to the external symbol so the compiler generates
+ * __kcfi_typeid.
+ */
+__ADDRESSABLE(__bpf_prog_runX);
+
+/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */
+asm (
+" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
+" .type cfi_bpf_hash,@object \n"
+" .globl cfi_bpf_hash \n"
+" .p2align 2, 0x0 \n"
+"cfi_bpf_hash: \n"
+" .long __kcfi_typeid___bpf_prog_runX \n"
+" .size cfi_bpf_hash, 4 \n"
+" .popsection \n"
+);
+
+/* Must match bpf_callback_t */
+extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64);
+
+__ADDRESSABLE(__bpf_callback_fn);
+
+/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */
+asm (
+" .pushsection .data..ro_after_init,\"aw\",@progbits \n"
+" .type cfi_bpf_subprog_hash,@object \n"
+" .globl cfi_bpf_subprog_hash \n"
+" .p2align 2, 0x0 \n"
+"cfi_bpf_subprog_hash: \n"
+" .long __kcfi_typeid___bpf_callback_fn \n"
+" .size cfi_bpf_subprog_hash, 4 \n"
+" .popsection \n"
+);
+
+u32 cfi_get_func_hash(void *func)
+{
+ u32 hash;
+
+ func -= cfi_get_offset();
+ switch (cfi_mode) {
+ case CFI_FINEIBT:
+ func += 7;
+ break;
+ case CFI_KCFI:
+ func += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (get_kernel_nofault(hash, func))
+ return 0;
+
+ return hash;
+}
+#endif
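cfi_get_func_hash() above recovers the kCFI/FineIBT type hash stored in front of a function's entry point. As a rough illustration only (a hypothetical sketch, not part of this patch), a caller could compare the hashes of two functions expected to share a prototype:

/* Hypothetical sketch, not part of this patch. */
static bool example_cfi_hashes_match(void *fa, void *fb)
{
	u32 ha = cfi_get_func_hash(fa);
	u32 hb = cfi_get_func_hash(fb);

	/* cfi_get_func_hash() returns 0 when no hash could be read. */
	return ha && ha == hb;
}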
+
+#ifdef CONFIG_FINEIBT
-static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
static bool cfi_rand __ro_after_init = true;
static u32 cfi_seed __ro_after_init;
@@ -883,7 +1027,7 @@ static __init int cfi_parse_cmdline(char *str)
}
if (!strcmp(str, "auto")) {
- cfi_mode = CFI_DEFAULT;
+ cfi_mode = CFI_AUTO;
} else if (!strcmp(str, "off")) {
cfi_mode = CFI_OFF;
cfi_rand = false;
@@ -992,7 +1136,7 @@ static u32 decode_caller_hash(void *addr)
}
/* .retpoline_sites */
-static int cfi_disable_callers(s32 *start, s32 *end)
+static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod)
{
/*
* Disable kCFI by patching in a JMP.d8, this leaves the hash immediate
@@ -1004,20 +1148,23 @@ static int cfi_disable_callers(s32 *start, s32 *end)
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- hash = decode_caller_hash(addr);
+ wr_addr = module_writable_address(mod, addr);
+ hash = decode_caller_hash(wr_addr);
+
if (!hash) /* nocfi callers */
continue;
- text_poke_early(addr, jmp, 2);
+ text_poke_early(wr_addr, jmp, 2);
}
return 0;
}
-static int cfi_enable_callers(s32 *start, s32 *end)
+static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod)
{
/*
* Re-enable kCFI, undo what cfi_disable_callers() did.
@@ -1027,106 +1174,115 @@ static int cfi_enable_callers(s32 *start, s32 *end)
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- hash = decode_caller_hash(addr);
+ wr_addr = module_writable_address(mod, addr);
+ hash = decode_caller_hash(wr_addr);
if (!hash) /* nocfi callers */
continue;
- text_poke_early(addr, mov, 2);
+ text_poke_early(wr_addr, mov, 2);
}
return 0;
}
/* .cfi_sites */
-static int cfi_rand_preamble(s32 *start, s32 *end)
+static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr = module_writable_address(mod, addr);
u32 hash;
- hash = decode_preamble_hash(addr);
+ hash = decode_preamble_hash(wr_addr);
if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
addr, addr, 5, addr))
return -EINVAL;
hash = cfi_rehash(hash);
- text_poke_early(addr + 1, &hash, 4);
+ text_poke_early(wr_addr + 1, &hash, 4);
}
return 0;
}
-static int cfi_rewrite_preamble(s32 *start, s32 *end)
+static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr = module_writable_address(mod, addr);
u32 hash;
- hash = decode_preamble_hash(addr);
+ hash = decode_preamble_hash(wr_addr);
if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
addr, addr, 5, addr))
return -EINVAL;
- text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
- WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
- text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
+ text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size);
+ WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678);
+ text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4);
}
return 0;
}
-static void cfi_rewrite_endbr(s32 *start, s32 *end)
+static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr = module_writable_address(mod, addr);
- poison_endbr(addr+16, false);
+ poison_endbr(addr + 16, wr_addr + 16, false);
}
}
/* .retpoline_sites */
-static int cfi_rand_callers(s32 *start, s32 *end)
+static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- hash = decode_caller_hash(addr);
+ wr_addr = module_writable_address(mod, addr);
+ hash = decode_caller_hash(wr_addr);
if (hash) {
hash = -cfi_rehash(hash);
- text_poke_early(addr + 2, &hash, 4);
+ text_poke_early(wr_addr + 2, &hash, 4);
}
}
return 0;
}
-static int cfi_rewrite_callers(s32 *start, s32 *end)
+static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod)
{
s32 *s;
for (s = start; s < end; s++) {
void *addr = (void *)s + *s;
+ void *wr_addr;
u32 hash;
addr -= fineibt_caller_size;
- hash = decode_caller_hash(addr);
+ wr_addr = module_writable_address(mod, addr);
+ hash = decode_caller_hash(wr_addr);
if (hash) {
- text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
- WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
- text_poke_early(addr + fineibt_caller_hash, &hash, 4);
+ text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size);
+ WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678);
+ text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4);
}
/* rely on apply_retpolines() */
}
@@ -1135,15 +1291,16 @@ static int cfi_rewrite_callers(s32 *start, s32 *end)
}
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi, bool builtin)
+ s32 *start_cfi, s32 *end_cfi, struct module *mod)
{
+ bool builtin = mod ? false : true;
int ret;
if (WARN_ONCE(fineibt_preamble_size != 16,
"FineIBT preamble wrong size: %ld", fineibt_preamble_size))
return;
- if (cfi_mode == CFI_DEFAULT) {
+ if (cfi_mode == CFI_AUTO) {
cfi_mode = CFI_KCFI;
if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
cfi_mode = CFI_FINEIBT;
@@ -1154,19 +1311,22 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
* rewrite them. This disables all CFI. If this succeeds but any of the
* later stages fails, we're without CFI.
*/
- ret = cfi_disable_callers(start_retpoline, end_retpoline);
+ ret = cfi_disable_callers(start_retpoline, end_retpoline, mod);
if (ret)
goto err;
if (cfi_rand) {
- if (builtin)
+ if (builtin) {
cfi_seed = get_random_u32();
+ cfi_bpf_hash = cfi_rehash(cfi_bpf_hash);
+ cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash);
+ }
- ret = cfi_rand_preamble(start_cfi, end_cfi);
+ ret = cfi_rand_preamble(start_cfi, end_cfi, mod);
if (ret)
goto err;
- ret = cfi_rand_callers(start_retpoline, end_retpoline);
+ ret = cfi_rand_callers(start_retpoline, end_retpoline, mod);
if (ret)
goto err;
}
@@ -1178,7 +1338,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
return;
case CFI_KCFI:
- ret = cfi_enable_callers(start_retpoline, end_retpoline);
+ ret = cfi_enable_callers(start_retpoline, end_retpoline, mod);
if (ret)
goto err;
@@ -1188,17 +1348,17 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
case CFI_FINEIBT:
/* place the FineIBT preamble at func()-16 */
- ret = cfi_rewrite_preamble(start_cfi, end_cfi);
+ ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod);
if (ret)
goto err;
/* rewrite the callers to target func()-16 */
- ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
+ ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod);
if (ret)
goto err;
/* now that nobody targets func()+0, remove ENDBR there */
- cfi_rewrite_endbr(start_cfi, end_cfi);
+ cfi_rewrite_endbr(start_cfi, end_cfi, mod);
if (builtin)
pr_info("Using FineIBT CFI\n");
@@ -1217,7 +1377,7 @@ static inline void poison_hash(void *addr)
*(u32 *)addr = 0;
}
-static void poison_cfi(void *addr)
+static void poison_cfi(void *addr, void *wr_addr)
{
switch (cfi_mode) {
case CFI_FINEIBT:
@@ -1229,8 +1389,8 @@ static void poison_cfi(void *addr)
* ud2
* 1: nop
*/
- poison_endbr(addr, false);
- poison_hash(addr + fineibt_preamble_hash);
+ poison_endbr(addr, wr_addr, false);
+ poison_hash(wr_addr + fineibt_preamble_hash);
break;
case CFI_KCFI:
@@ -1239,7 +1399,7 @@ static void poison_cfi(void *addr)
* movl $0, %eax
* .skip 11, 0x90
*/
- poison_hash(addr + 1);
+ poison_hash(wr_addr + 1);
break;
default:
@@ -1250,22 +1410,21 @@ static void poison_cfi(void *addr)
#else
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi, bool builtin)
+ s32 *start_cfi, s32 *end_cfi, struct module *mod)
{
}
#ifdef CONFIG_X86_KERNEL_IBT
-static void poison_cfi(void *addr) { }
+static void poison_cfi(void *addr, void *wr_addr) { }
#endif
#endif
void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
- s32 *start_cfi, s32 *end_cfi)
+ s32 *start_cfi, s32 *end_cfi, struct module *mod)
{
return __apply_fineibt(start_retpoline, end_retpoline,
- start_cfi, end_cfi,
- /* .builtin = */ false);
+ start_cfi, end_cfi, mod);
}
#ifdef CONFIG_SMP
@@ -1421,46 +1580,6 @@ int alternatives_text_reserved(void *start, void *end)
}
#endif /* CONFIG_SMP */
-#ifdef CONFIG_PARAVIRT
-
-/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void __init_or_module add_nops(void *insns, unsigned int len)
-{
- while (len > 0) {
- unsigned int noplen = len;
- if (noplen > ASM_NOP_MAX)
- noplen = ASM_NOP_MAX;
- memcpy(insns, x86_nops[noplen], noplen);
- insns += noplen;
- len -= noplen;
- }
-}
-
-void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end)
-{
- struct paravirt_patch_site *p;
- char insn_buff[MAX_PATCH_LEN];
-
- for (p = start; p < end; p++) {
- unsigned int used;
-
- BUG_ON(p->len > MAX_PATCH_LEN);
- /* prep the buffer with the original instructions */
- memcpy(insn_buff, p->instr, p->len);
- used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
-
- BUG_ON(used > p->len);
-
- /* Pad the rest with nops */
- add_nops(insn_buff + used, p->len - used);
- text_poke_early(p->instr, insn_buff, p->len);
- }
-}
-extern struct paravirt_patch_site __start_parainstructions[],
- __stop_parainstructions[];
-#endif /* CONFIG_PARAVIRT */
-
/*
* Self-test for the INT3 based CALL emulation code.
*
@@ -1567,7 +1686,7 @@ static noinline void __init alt_reloc_selftest(void)
*/
asm_inline volatile (
ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
- : /* output */
+ : ASM_CALL_CONSTRAINT
: [mem] "m" (__alt_reloc_selftest_addr)
: _ASM_ARG1
);
@@ -1596,43 +1715,22 @@ void __init alternative_instructions(void)
*/
/*
- * Paravirt patching and alternative patching can be combined to
- * replace a function call with a short direct code sequence (e.g.
- * by setting a constant return value instead of doing that in an
- * external function).
- * In order to make this work the following sequence is required:
- * 1. set (artificial) features depending on used paravirt
- * functions which can later influence alternative patching
- * 2. apply paravirt patching (generally replacing an indirect
- * function call with a direct one)
- * 3. apply alternative patching (e.g. replacing a direct function
- * call with a custom code sequence)
- * Doing paravirt patching after alternative patching would clobber
- * the optimization of the custom code with a function call again.
+ * Make sure to set (artificial) features depending on used paravirt
+ * functions which can later influence alternative patching.
*/
paravirt_set_cap();
- /*
- * First patch paravirt functions, such that we overwrite the indirect
- * call with the direct call.
- */
- apply_paravirt(__parainstructions, __parainstructions_end);
-
__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
- __cfi_sites, __cfi_sites_end, true);
+ __cfi_sites, __cfi_sites_end, NULL);
/*
* Rewrite the retpolines, must be done before alternatives since
* those can rewrite the retpoline thunks.
*/
- apply_retpolines(__retpoline_sites, __retpoline_sites_end);
- apply_returns(__return_sites, __return_sites_end);
+ apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL);
+ apply_returns(__return_sites, __return_sites_end, NULL);
- /*
- * Then patch alternatives, such that those paravirt calls that are in
- * alternatives can be overwritten by their immediate fragments.
- */
- apply_alternatives(__alt_instructions, __alt_instructions_end);
+ apply_alternatives(__alt_instructions, __alt_instructions_end, NULL);
/*
* Now all calls are established. Apply the call thunks if
@@ -1643,7 +1741,7 @@ void __init alternative_instructions(void)
/*
* Seal all functions that do not have their address taken.
*/
- apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);
+ apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL);
#ifdef CONFIG_SMP
/* Patch to UP if other cpus not imminent. */
@@ -1734,7 +1832,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
* restoring the previous mm.
*/
if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
- leave_mm(smp_processor_id());
+ leave_mm();
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, mm, current);
@@ -1756,11 +1854,18 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
return temp_state;
}
+__ro_after_init struct mm_struct *poking_mm;
+__ro_after_init unsigned long poking_addr;
+
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
lockdep_assert_irqs_disabled();
+
switch_mm_irqs_off(NULL, prev_state.mm, current);
+ /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
+ cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));
+
/*
* Restore the breakpoints if they were disabled before the temporary mm
* was loaded.
@@ -1769,9 +1874,6 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
hw_breakpoint_restore();
}
-__ro_after_init struct mm_struct *poking_mm;
-__ro_after_init unsigned long poking_addr;
-
static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
memcpy(dst, src, len);
@@ -1906,7 +2008,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l
* Note that the caller must ensure that if the modified code is part of a
* module, the module would not be removed during poking. This can be achieved
* by registering a module notifier, and ordering module removal and patching
- * trough a mutex.
+ * through a mutex.
*/
void *text_poke(void *addr, const void *opcode, size_t len)
{
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 56a917df410d..c884deca839b 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -676,7 +676,7 @@ static const struct dma_map_ops gart_dma_ops = {
.get_sgtable = dma_common_get_sgtable,
.dma_supported = dma_direct_supported,
.get_required_mask = dma_direct_get_required_mask,
- .alloc_pages = dma_direct_alloc_pages,
+ .alloc_pages_op = dma_direct_alloc_pages,
.free_pages = dma_direct_free_pages,
};
@@ -776,7 +776,7 @@ int __init gart_iommu_init(void)
iommu_size >> PAGE_SHIFT);
/*
* Tricky. The GART table remaps the physical memory range,
- * so the CPU wont notice potential aliases and if the memory
+ * so the CPU won't notice potential aliases and if the memory
* is remapped to UC later on, we might surprise the PCI devices
* with a stray writeout of a cacheline. So play it sure and
* do an explicit, full-scale wbinvd() _after_ having marked all
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 053f6dcc6b2c..67e773744edb 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -15,61 +15,8 @@
#include <linux/pci_ids.h>
#include <asm/amd_nb.h>
-#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
-#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
-#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
-#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630
-#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5
-#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4
-#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5
-#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8
-#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8
-#define PCI_DEVICE_ID_AMD_1AH_M00H_ROOT 0x153a
-#define PCI_DEVICE_ID_AMD_1AH_M20H_ROOT 0x1507
-#define PCI_DEVICE_ID_AMD_MI200_ROOT 0x14bb
-#define PCI_DEVICE_ID_AMD_MI300_ROOT 0x14f8
-
-#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
-#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
-#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
-#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
-#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
-#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728
-#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
-#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1
-#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
-#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
-#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4
-#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4
-#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc
-#define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4 0x12c4
-#define PCI_DEVICE_ID_AMD_MI200_DF_F4 0x14d4
-#define PCI_DEVICE_ID_AMD_MI300_DF_F4 0x152c
-
-/* Protect the PCI config register pairs used for SMN. */
-static DEFINE_MUTEX(smn_mutex);
-
static u32 *flush_words;
-static const struct pci_device_id amd_root_ids[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_ROOT) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_ROOT) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_ROOT) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_ROOT) },
- {}
-};
-
-#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704
-
static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
@@ -79,65 +26,6 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) },
- {}
-};
-
-static const struct pci_device_id amd_nb_link_ids[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) },
- { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) },
- {}
-};
-
-static const struct pci_device_id hygon_root_ids[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_ROOT) },
- {}
-};
-
-static const struct pci_device_id hygon_nb_misc_ids[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) },
- {}
-};
-
-static const struct pci_device_id hygon_nb_link_ids[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F4) },
{}
};
@@ -168,135 +56,37 @@ struct amd_northbridge *node_to_amd_nb(int node)
}
EXPORT_SYMBOL_GPL(node_to_amd_nb);
-static struct pci_dev *next_northbridge(struct pci_dev *dev,
- const struct pci_device_id *ids)
-{
- do {
- dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
- if (!dev)
- break;
- } while (!pci_match_id(ids, dev));
- return dev;
-}
-
-static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write)
-{
- struct pci_dev *root;
- int err = -ENODEV;
-
- if (node >= amd_northbridges.num)
- goto out;
-
- root = node_to_amd_nb(node)->root;
- if (!root)
- goto out;
-
- mutex_lock(&smn_mutex);
-
- err = pci_write_config_dword(root, 0x60, address);
- if (err) {
- pr_warn("Error programming SMN address 0x%x.\n", address);
- goto out_unlock;
- }
-
- err = (write ? pci_write_config_dword(root, 0x64, *value)
- : pci_read_config_dword(root, 0x64, value));
- if (err)
- pr_warn("Error %s SMN address 0x%x.\n",
- (write ? "writing to" : "reading from"), address);
-
-out_unlock:
- mutex_unlock(&smn_mutex);
-
-out:
- return err;
-}
-
-int amd_smn_read(u16 node, u32 address, u32 *value)
-{
- return __amd_smn_rw(node, address, value, false);
-}
-EXPORT_SYMBOL_GPL(amd_smn_read);
-
-int amd_smn_write(u16 node, u32 address, u32 value)
-{
- return __amd_smn_rw(node, address, &value, true);
-}
-EXPORT_SYMBOL_GPL(amd_smn_write);
-
-
static int amd_cache_northbridges(void)
{
- const struct pci_device_id *misc_ids = amd_nb_misc_ids;
- const struct pci_device_id *link_ids = amd_nb_link_ids;
- const struct pci_device_id *root_ids = amd_root_ids;
- struct pci_dev *root, *misc, *link;
struct amd_northbridge *nb;
- u16 roots_per_misc = 0;
- u16 misc_count = 0;
- u16 root_count = 0;
- u16 i, j;
+ u16 i;
if (amd_northbridges.num)
return 0;
- if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
- root_ids = hygon_root_ids;
- misc_ids = hygon_nb_misc_ids;
- link_ids = hygon_nb_link_ids;
- }
-
- misc = NULL;
- while ((misc = next_northbridge(misc, misc_ids)))
- misc_count++;
-
- if (!misc_count)
- return -ENODEV;
+ amd_northbridges.num = amd_num_nodes();
- root = NULL;
- while ((root = next_northbridge(root, root_ids)))
- root_count++;
-
- if (root_count) {
- roots_per_misc = root_count / misc_count;
-
- /*
- * There should be _exactly_ N roots for each DF/SMN
- * interface.
- */
- if (!roots_per_misc || (root_count % roots_per_misc)) {
- pr_info("Unsupported AMD DF/PCI configuration found\n");
- return -ENODEV;
- }
- }
-
- nb = kcalloc(misc_count, sizeof(struct amd_northbridge), GFP_KERNEL);
+ nb = kcalloc(amd_northbridges.num, sizeof(struct amd_northbridge), GFP_KERNEL);
if (!nb)
return -ENOMEM;
amd_northbridges.nb = nb;
- amd_northbridges.num = misc_count;
- link = misc = root = NULL;
for (i = 0; i < amd_northbridges.num; i++) {
- node_to_amd_nb(i)->root = root =
- next_northbridge(root, root_ids);
- node_to_amd_nb(i)->misc = misc =
- next_northbridge(misc, misc_ids);
- node_to_amd_nb(i)->link = link =
- next_northbridge(link, link_ids);
+ node_to_amd_nb(i)->root = amd_node_get_root(i);
+ node_to_amd_nb(i)->misc = amd_node_get_func(i, 3);
/*
- * If there are more PCI root devices than data fabric/
- * system management network interfaces, then the (N)
- * PCI roots per DF/SMN interface are functionally the
- * same (for DF/SMN access) and N-1 are redundant. N-1
- * PCI roots should be skipped per DF/SMN interface so
- * the following DF/SMN interfaces get mapped to
- * correct PCI roots.
+ * Each Northbridge must have a 'misc' device.
+ * If not, then uninitialize everything.
*/
- for (j = 1; j < roots_per_misc; j++)
- root = next_northbridge(root, root_ids);
+ if (!node_to_amd_nb(i)->misc) {
+ amd_northbridges.num = 0;
+ kfree(nb);
+ return -ENODEV;
+ }
+
+ node_to_amd_nb(i)->link = amd_node_get_func(i, 4);
}
if (amd_gart_present())
@@ -334,7 +124,6 @@ static int amd_cache_northbridges(void)
*/
bool __init early_is_amd_nb(u32 device)
{
- const struct pci_device_id *misc_ids = amd_nb_misc_ids;
const struct pci_device_id *id;
u32 vendor = device & 0xffff;
@@ -342,11 +131,11 @@ bool __init early_is_amd_nb(u32 device)
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return false;
- if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
- misc_ids = hygon_nb_misc_ids;
+ if (cpu_feature_enabled(X86_FEATURE_ZEN))
+ return false;
device >>= 16;
- for (id = misc_ids; id->vendor; id++)
+ for (id = amd_nb_misc_ids; id->vendor; id++)
if (vendor == id->vendor && device == id->device)
return true;
return false;
@@ -354,7 +143,6 @@ bool __init early_is_amd_nb(u32 device)
struct resource *amd_get_mmconfig_range(struct resource *res)
{
- u32 address;
u64 base, msr;
unsigned int segn_busn_bits;
@@ -362,13 +150,11 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
return NULL;
- /* assume all cpus from fam10h have mmconfig */
- if (boot_cpu_data.x86 < 0x10)
+ /* Assume CPUs from Fam10h have mmconfig, although not all VMs do */
+ if (boot_cpu_data.x86 < 0x10 ||
+ rdmsrl_safe(MSR_FAM10H_MMIO_CONF_BASE, &msr))
return NULL;
- address = MSR_FAM10H_MMIO_CONF_BASE;
- rdmsrl(address, msr);
-
/* mmconfig is not enabled */
if (!(msr & FAM10H_MMIO_CONF_ENABLE))
return NULL;
@@ -386,7 +172,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
int amd_get_subcaches(int cpu)
{
- struct pci_dev *link = node_to_amd_nb(topology_die_id(cpu))->link;
+ struct pci_dev *link = node_to_amd_nb(topology_amd_node_id(cpu))->link;
unsigned int mask;
if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
@@ -400,7 +186,7 @@ int amd_get_subcaches(int cpu)
int amd_set_subcaches(int cpu, unsigned long mask)
{
static unsigned int reset, ban;
- struct amd_northbridge *nb = node_to_amd_nb(topology_die_id(cpu));
+ struct amd_northbridge *nb = node_to_amd_nb(topology_amd_node_id(cpu));
unsigned int reg;
int cuid;
@@ -531,6 +317,10 @@ static __init void fix_erratum_688(void)
static __init int init_amd_nbs(void)
{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return 0;
+
amd_cache_northbridges();
amd_cache_gart();
diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c
new file mode 100644
index 000000000000..d2ec7fd555c5
--- /dev/null
+++ b/arch/x86/kernel/amd_node.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * AMD Node helper functions and common defines
+ *
+ * Copyright (c) 2024, Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Author: Yazen Ghannam <Yazen.Ghannam@amd.com>
+ */
+
+#include <asm/amd_node.h>
+
+/*
+ * AMD Nodes are a physical collection of I/O devices within an SoC. There can be one
+ * or more nodes per package.
+ *
+ * The nodes are software-visible through PCI config space. All nodes are enumerated
+ * on segment 0 bus 0. The device (slot) numbers range from 0x18 to 0x1F (maximum 8
+ * nodes) with 0x18 corresponding to node 0, 0x19 to node 1, etc. Each node can be a
+ * multi-function device.
+ *
+ * On legacy systems, these node devices represent integrated Northbridge functionality.
+ * On Zen-based systems, these node devices represent Data Fabric functionality.
+ *
+ * See "Configuration Space Accesses" section in BKDGs or
+ * "Processor x86 Core" -> "Configuration Space" section in PPRs.
+ */
+struct pci_dev *amd_node_get_func(u16 node, u8 func)
+{
+ if (node >= MAX_AMD_NUM_NODES)
+ return NULL;
+
+ return pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(AMD_NODE0_PCI_SLOT + node, func));
+}
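Given the enumeration described in the comment above, node N function F resolves to PCI 0000:00:(0x18+N).F. A hypothetical lookup of the node-0 "misc" function, shown only as a sketch under that assumption (not part of this patch):

/* Hypothetical example, not part of this patch. */
static struct pci_dev *example_node0_misc(void)
{
	/*
	 * Node 0, function 3 -> domain 0, bus 0, devfn PCI_DEVFN(0x18, 3),
	 * i.e. 00:18.3. Caller drops the reference with pci_dev_put().
	 */
	return amd_node_get_func(0, 3);
}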
+
+#define DF_BLK_INST_CNT 0x040
+#define DF_CFG_ADDR_CNTL_LEGACY 0x084
+#define DF_CFG_ADDR_CNTL_DF4 0xC04
+
+#define DF_MAJOR_REVISION GENMASK(27, 24)
+
+static u16 get_cfg_addr_cntl_offset(struct pci_dev *df_f0)
+{
+ u32 reg;
+
+ /*
+ * Revision fields added for DF4 and later.
+ *
+ * Major revision of '0' is found pre-DF4. Field is Read-as-Zero.
+ */
+ if (pci_read_config_dword(df_f0, DF_BLK_INST_CNT, &reg))
+ return 0;
+
+ if (reg & DF_MAJOR_REVISION)
+ return DF_CFG_ADDR_CNTL_DF4;
+
+ return DF_CFG_ADDR_CNTL_LEGACY;
+}
+
+struct pci_dev *amd_node_get_root(u16 node)
+{
+ struct pci_dev *root;
+ u16 cntl_off;
+ u8 bus;
+
+ if (!cpu_feature_enabled(X86_FEATURE_ZEN))
+ return NULL;
+
+ /*
+ * D18F0xXXX [Config Address Control] (DF::CfgAddressCntl)
+ * Bits [7:0] (SecBusNum) holds the bus number of the root device for
+ * this Data Fabric instance. The segment, device, and function will be 0.
+ */
+ struct pci_dev *df_f0 __free(pci_dev_put) = amd_node_get_func(node, 0);
+ if (!df_f0)
+ return NULL;
+
+ cntl_off = get_cfg_addr_cntl_offset(df_f0);
+ if (!cntl_off)
+ return NULL;
+
+ if (pci_read_config_byte(df_f0, cntl_off, &bus))
+ return NULL;
+
+ /* Grab the pointer for the actual root device instance. */
+ root = pci_get_domain_bus_and_slot(0, bus, 0);
+
+ pci_dbg(root, "is root for AMD node %u\n", node);
+ return root;
+}
+
+static struct pci_dev **amd_roots;
+
+/* Protect the PCI config register pairs used for SMN. */
+static DEFINE_MUTEX(smn_mutex);
+
+#define SMN_INDEX_OFFSET 0x60
+#define SMN_DATA_OFFSET 0x64
+
+/*
+ * SMN accesses may fail in ways that are difficult to detect here in the called
+ * functions amd_smn_read() and amd_smn_write(). Therefore, callers must do
+ * their own checking based on what behavior they expect.
+ *
+ * For SMN reads, the returned value may be zero if the register is Read-as-Zero.
+ * Or it may be a "PCI Error Response", e.g. all 0xFFs. The "PCI Error Response"
+ * can be checked here, and a proper error code can be returned.
+ *
+ * But the Read-as-Zero response cannot be verified here. A value of 0 may be
+ * correct in some cases, so callers must check that this is correct for the
+ * register/fields they need.
+ *
+ * For SMN writes, success can be determined through a "write and read back".
+ * However, this is not robust when done here.
+ *
+ * Possible issues:
+ *
+ * 1) Bits that are "Write-1-to-Clear". In this case, the read value should
+ * *not* match the write value.
+ *
+ * 2) Bits that are "Read-as-Zero"/"Writes-Ignored". This information cannot be
+ * known here.
+ *
+ * 3) Bits that are "Reserved / Set to 1". Ditto above.
+ *
+ * Callers of amd_smn_write() should do the "write and read back" check
+ * themselves, if needed.
+ *
+ * For #1, they can see if their target bits got cleared.
+ *
+ * For #2 and #3, they can check if their target bits got set as intended.
+ *
+ * This matches what is done for RDMSR/WRMSR. As long as there's no #GP, the
+ * operation is considered a success, and the caller does their own
+ * checking.
+ */
+static int __amd_smn_rw(u8 i_off, u8 d_off, u16 node, u32 address, u32 *value, bool write)
+{
+ struct pci_dev *root;
+ int err = -ENODEV;
+
+ if (node >= amd_num_nodes())
+ return err;
+
+ root = amd_roots[node];
+ if (!root)
+ return err;
+
+ guard(mutex)(&smn_mutex);
+
+ err = pci_write_config_dword(root, i_off, address);
+ if (err) {
+ pr_warn("Error programming SMN address 0x%x.\n", address);
+ return pcibios_err_to_errno(err);
+ }
+
+ err = (write ? pci_write_config_dword(root, d_off, *value)
+ : pci_read_config_dword(root, d_off, value));
+
+ return pcibios_err_to_errno(err);
+}
+
+int __must_check amd_smn_read(u16 node, u32 address, u32 *value)
+{
+ int err = __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, value, false);
+
+ if (PCI_POSSIBLE_ERROR(*value)) {
+ err = -ENODEV;
+ *value = 0;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(amd_smn_read);
+
+int __must_check amd_smn_write(u16 node, u32 address, u32 value)
+{
+ return __amd_smn_rw(SMN_INDEX_OFFSET, SMN_DATA_OFFSET, node, address, &value, true);
+}
+EXPORT_SYMBOL_GPL(amd_smn_write);
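As the comment block above spells out, amd_smn_write() callers that need confirmation are expected to do the "write and read back" check themselves. A hedged sketch of such a caller follows; EXAMPLE_SMN_REG is a made-up placeholder address, not a real register, and the check is only meaningful for plain read/write bits:

/* Hypothetical caller-side check; EXAMPLE_SMN_REG is a made-up address. */
#define EXAMPLE_SMN_REG	0x50000

static int example_smn_write_checked(u16 node, u32 value)
{
	u32 readback;
	int err;

	err = amd_smn_write(node, EXAMPLE_SMN_REG, value);
	if (err)
		return err;

	err = amd_smn_read(node, EXAMPLE_SMN_REG, &readback);
	if (err)
		return err;

	/* Only valid for plain read/write bits; see cases #1-#3 above. */
	return readback == value ? 0 : -EIO;
}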
+
+static int amd_cache_roots(void)
+{
+ u16 node, num_nodes = amd_num_nodes();
+
+ amd_roots = kcalloc(num_nodes, sizeof(*amd_roots), GFP_KERNEL);
+ if (!amd_roots)
+ return -ENOMEM;
+
+ for (node = 0; node < num_nodes; node++)
+ amd_roots[node] = amd_node_get_root(node);
+
+ return 0;
+}
+
+static int __init amd_smn_init(void)
+{
+ int err;
+
+ if (!cpu_feature_enabled(X86_FEATURE_ZEN))
+ return 0;
+
+ guard(mutex)(&smn_mutex);
+
+ if (amd_roots)
+ return 0;
+
+ err = amd_cache_roots();
+ if (err)
+ return err;
+
+ return 0;
+}
+
+fs_initcall(amd_smn_init);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 4feaa670d578..89c0c8a3fc7e 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -259,10 +259,9 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
order);
}
- /* No multi-function device? */
type = read_pci_config_byte(bus, slot, func,
PCI_HEADER_TYPE);
- if (!(type & 0x80))
+ if (!(type & PCI_HEADER_TYPE_MFD))
break;
}
}
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 2ee867d796d9..3bf0487cf3b7 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -4,7 +4,7 @@
#
# Leads to non-deterministic coverage that is not a function of syscall inputs.
-# In particualr, smp_apic_timer_interrupt() is called in random places.
+# In particular, smp_apic_timer_interrupt() is called in random places.
KCOV_INSTRUMENT := n
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_common.o apic_noop.o ipi.o vector.o init.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 41093cf20acd..e893dc6f11c1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -19,6 +19,7 @@
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi_pmtmr.h>
+#include <linux/bitmap.h>
#include <linux/clockchips.h>
#include <linux/interrupt.h>
#include <linux/memblock.h>
@@ -67,10 +68,6 @@
#include "local.h"
-unsigned int num_processors;
-
-unsigned disabled_cpus;
-
/* Processor that is doing the boot up */
u32 boot_cpu_physical_apicid __ro_after_init = BAD_APICID;
EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
@@ -78,18 +75,6 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
u8 boot_cpu_apic_version __ro_after_init;
/*
- * Bitmask of physically existing CPUs:
- */
-physid_mask_t phys_cpu_present_map;
-
-/*
- * Processor to be disabled specified by kernel parameter
- * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
- * avoid undefined behaviour caused by sending INIT from AP to BSP.
- */
-static u32 disabled_cpu_apicid __ro_after_init = BAD_APICID;
-
-/*
* This variable controls which CPUs receive external NMIs. By default,
* external NMIs are delivered only to the BSP.
*/
@@ -108,14 +93,6 @@ static inline bool apic_accessible(void)
return x2apic_mode || apic_mmio_base;
}
-/*
- * Map cpu index to physical APIC ID
- */
-DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
-DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
-
#ifdef CONFIG_X86_32
/* Local APIC was disabled by the BIOS and enabled by the kernel */
static int enabled_via_apicbase __ro_after_init;
@@ -261,16 +238,6 @@ u64 native_apic_icr_read(void)
return icr1 | ((u64)icr2 << 32);
}
-#ifdef CONFIG_X86_32
-/**
- * get_physical_broadcast - Get number of physical broadcast IDs
- */
-int get_physical_broadcast(void)
-{
- return modern_apic() ? 0xff : 0xf;
-}
-#endif
-
/**
* lapic_get_maxlvt - get the maximum number of local vector table entries
*/
@@ -473,7 +440,19 @@ static int lapic_timer_shutdown(struct clock_event_device *evt)
v = apic_read(APIC_LVTT);
v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
apic_write(APIC_LVTT, v);
- apic_write(APIC_TMICT, 0);
+
+ /*
+ * Setting APIC_LVT_MASKED (above) should be enough to tell
+ * the hardware that this timer will never fire. But AMD
+ * erratum 411 and some Intel CPU behavior circa 2024 say
+ * otherwise. Time for belt and suspenders programming: mask
+ * the timer _and_ zero the counter registers:
+ */
+ if (v & APIC_LVT_TIMER_TSCDEADLINE)
+ wrmsrl(MSR_IA32_TSC_DEADLINE, 0);
+ else
+ apic_write(APIC_TMICT, 0);
+
return 0;
}
@@ -530,32 +509,32 @@ static struct clock_event_device lapic_clockevent = {
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
static const struct x86_cpu_id deadline_match[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x2, 0x2, 0x3a), /* EP */
+ X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x4, 0x4, 0x0f), /* EX */
- X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x2, 0x2, 0x00000011),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x3, 0x3, 0x0700000e),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x4, 0x4, 0x0f00000c),
+ X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x5, 0x5, 0x0e000003),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014),
- X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x3, 0x3, 0x01000136),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x4, 0x4, 0x02000014),
+ X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x5, 0xf, 0),
- X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22),
- X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20),
- X86_MATCH_INTEL_FAM6_MODEL( HASWELL_G, 0x17),
+ X86_MATCH_VFM(INTEL_HASWELL, 0x22),
+ X86_MATCH_VFM(INTEL_HASWELL_L, 0x20),
+ X86_MATCH_VFM(INTEL_HASWELL_G, 0x17),
- X86_MATCH_INTEL_FAM6_MODEL( BROADWELL, 0x25),
- X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_G, 0x17),
+ X86_MATCH_VFM(INTEL_BROADWELL, 0x25),
+ X86_MATCH_VFM(INTEL_BROADWELL_G, 0x17),
- X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_L, 0xb2),
- X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE, 0xb2),
+ X86_MATCH_VFM(INTEL_SKYLAKE_L, 0xb2),
+ X86_MATCH_VFM(INTEL_SKYLAKE, 0xb2),
- X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE_L, 0x52),
- X86_MATCH_INTEL_FAM6_MODEL( KABYLAKE, 0x52),
+ X86_MATCH_VFM(INTEL_KABYLAKE_L, 0x52),
+ X86_MATCH_VFM(INTEL_KABYLAKE, 0x52),
{},
};
@@ -664,7 +643,7 @@ void lapic_update_tsc_freq(void)
static __initdata int lapic_cal_loops = -1;
static __initdata long lapic_cal_t1, lapic_cal_t2;
static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
-static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata u32 lapic_cal_pm1, lapic_cal_pm2;
static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
/*
@@ -674,7 +653,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
{
unsigned long long tsc = 0;
long tapic = apic_read(APIC_TMCCT);
- unsigned long pm = acpi_pm_read_early();
+ u32 pm = acpi_pm_read_early();
if (boot_cpu_has(X86_FEATURE_TSC))
tsc = rdtsc();
@@ -699,7 +678,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
}
static int __init
-calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
+calibrate_by_pmtimer(u32 deltapm, long *delta, long *deltatsc)
{
const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
const long pm_thresh = pm_100ms / 100;
@@ -710,7 +689,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
return -1;
#endif
- apic_printk(APIC_VERBOSE, "... PM-Timer delta = %ld\n", deltapm);
+ apic_pr_verbose("... PM-Timer delta = %u\n", deltapm);
/* Check, if the PM timer is available */
if (!deltapm)
@@ -720,14 +699,14 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
if (deltapm > (pm_100ms - pm_thresh) &&
deltapm < (pm_100ms + pm_thresh)) {
- apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n");
+ apic_pr_verbose("... PM-Timer result ok\n");
return 0;
}
res = (((u64)deltapm) * mult) >> 22;
do_div(res, 1000000);
- pr_warn("APIC calibration not consistent "
- "with PM-Timer: %ldms instead of 100ms\n", (long)res);
+ pr_warn("APIC calibration not consistent with PM-Timer: %ldms instead of 100ms\n",
+ (long)res);
/* Correct the lapic counter value */
res = (((u64)(*delta)) * pm_100ms);
@@ -740,9 +719,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
if (boot_cpu_has(X86_FEATURE_TSC)) {
res = (((u64)(*deltatsc)) * pm_100ms);
do_div(res, deltapm);
- apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
- "PM-Timer: %lu (%ld)\n",
- (unsigned long)res, *deltatsc);
+ apic_pr_verbose("TSC delta adjusted to PM-Timer: %lu (%ld)\n",
+ (unsigned long)res, *deltatsc);
*deltatsc = (long)res;
}
@@ -782,7 +760,7 @@ bool __init apic_needs_pit(void)
/*
* If interrupt delivery mode is legacy PIC or virtual wire without
- * configuration, the local APIC timer wont be set up. Make sure
+ * configuration, the local APIC timer won't be set up. Make sure
* that the PIT is initialized.
*/
if (apic_intr_mode == APIC_PIC ||
@@ -825,8 +803,7 @@ static int __init calibrate_APIC_clock(void)
* in the clockevent structure and return.
*/
if (!lapic_init_clockevent()) {
- apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n",
- lapic_timer_period);
+ apic_pr_verbose("lapic timer already calibrated %d\n", lapic_timer_period);
/*
* Direct calibration methods must have an always running
* local APIC timer, no need for broadcast timer.
@@ -835,8 +812,7 @@ static int __init calibrate_APIC_clock(void)
return 0;
}
- apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
- "calibrating APIC timer ...\n");
+ apic_pr_verbose("Using local APIC timer interrupts. Calibrating APIC timer ...\n");
/*
* There are platforms w/o global clockevent devices. Instead of
@@ -899,7 +875,7 @@ static int __init calibrate_APIC_clock(void)
/* Build delta t1-t2 as apic timer counts down */
delta = lapic_cal_t1 - lapic_cal_t2;
- apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
+ apic_pr_verbose("... lapic delta = %ld\n", delta);
deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1);
@@ -910,22 +886,19 @@ static int __init calibrate_APIC_clock(void)
lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
lapic_init_clockevent();
- apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
- apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
- apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
- lapic_timer_period);
+ apic_pr_verbose("..... delta %ld\n", delta);
+ apic_pr_verbose("..... mult: %u\n", lapic_clockevent.mult);
+ apic_pr_verbose("..... calibration result: %u\n", lapic_timer_period);
if (boot_cpu_has(X86_FEATURE_TSC)) {
- apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
- "%ld.%04ld MHz.\n",
- (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
- (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
+ apic_pr_verbose("..... CPU clock speed is %ld.%04ld MHz.\n",
+ (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ),
+ (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ));
}
- apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
- "%u.%04u MHz.\n",
- lapic_timer_period / (1000000 / HZ),
- lapic_timer_period % (1000000 / HZ));
+ apic_pr_verbose("..... host bus clock speed is %u.%04u MHz.\n",
+ lapic_timer_period / (1000000 / HZ),
+ lapic_timer_period % (1000000 / HZ));
/*
* Do a sanity check on the APIC calibration result
@@ -944,7 +917,7 @@ static int __init calibrate_APIC_clock(void)
* available.
*/
if (!pm_referenced && global_clock_event) {
- apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
+ apic_pr_verbose("... verify APIC timer\n");
/*
* Setup the apic timer manually
@@ -965,11 +938,11 @@ static int __init calibrate_APIC_clock(void)
/* Jiffies delta */
deltaj = lapic_cal_j2 - lapic_cal_j1;
- apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj);
+ apic_pr_verbose("... jiffies delta = %lu\n", deltaj);
/* Check, if the jiffies result is consistent */
if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
- apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
+ apic_pr_verbose("... jiffies result ok\n");
else
levt->features |= CLOCK_EVT_FEAT_DUMMY;
}
@@ -1254,9 +1227,8 @@ void __init sync_Arb_IDs(void)
*/
apic_wait_icr_idle();
- apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write(APIC_ICR, APIC_DEST_ALLINC |
- APIC_INT_LEVELTRIG | APIC_DM_INIT);
+ apic_pr_debug("Synchronizing Arb IDs.\n");
+ apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
}
enum apic_intr_mode_id apic_intr_mode __ro_after_init;
@@ -1442,10 +1414,10 @@ static void lapic_setup_esr(void)
if (maxlvt > 3)
apic_write(APIC_ESR, 0);
value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08x after: 0x%08x\n",
- oldvalue, value);
+ if (value != oldvalue) {
+ apic_pr_verbose("ESR value before enabling vector: 0x%08x after: 0x%08x\n",
+ oldvalue, value);
+ }
}
#define APIC_IR_REGS APIC_ISR_NR
@@ -1549,9 +1521,6 @@ static void setup_local_APIC(void)
apic_write(APIC_ESR, 0);
}
#endif
- /* Validate that the APIC is registered if required */
- BUG_ON(apic->apic_id_registered && !apic->apic_id_registered());
-
/*
* Intel recommends to set DFR, LDR and TPR before enabling
* an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
@@ -1635,10 +1604,10 @@ static void setup_local_APIC(void)
value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
if (!cpu && (pic_mode || !value || ioapic_is_disabled)) {
value = APIC_DM_EXTINT;
- apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu);
+ apic_pr_verbose("Enabled ExtINT on CPU#%d\n", cpu);
} else {
value = APIC_DM_EXTINT | APIC_LVT_MASKED;
- apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu);
+ apic_pr_verbose("Masked ExtINT on CPU#%d\n", cpu);
}
apic_write(APIC_LVT0, value);
@@ -1690,8 +1659,6 @@ void apic_ap_setup(void)
end_local_APIC_setup();
}
-static __init void cpu_set_boot_apic(void);
-
static __init void apic_read_boot_cpu_id(bool x2apic)
{
/*
@@ -1706,7 +1673,8 @@ static __init void apic_read_boot_cpu_id(bool x2apic)
boot_cpu_physical_apicid = read_apic_id();
boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
- cpu_set_boot_apic();
+ topology_register_boot_apic(boot_cpu_physical_apicid);
+ x86_32_probe_bigsmp_early();
}
#ifdef CONFIG_X86_X2APIC
@@ -1724,11 +1692,11 @@ static int x2apic_state;
static bool x2apic_hw_locked(void)
{
- u64 ia32_cap;
+ u64 x86_arch_cap_msr;
u64 msr;
- ia32_cap = x86_read_arch_cap_msr();
- if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) {
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+ if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) {
rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr);
return (msr & LEGACY_XAPIC_DISABLED);
}
@@ -1808,16 +1776,13 @@ void x2apic_setup(void)
__x2apic_enable();
}
-static __init void apic_set_fixmap(void);
+static __init void apic_set_fixmap(bool read_apic);
static __init void x2apic_disable(void)
{
- u32 x2apic_id, state = x2apic_state;
-
- x2apic_mode = 0;
- x2apic_state = X2APIC_DISABLED;
+ u32 x2apic_id;
- if (state != X2APIC_ON)
+ if (x2apic_state < X2APIC_ON)
return;
x2apic_id = read_apic_id();
@@ -1830,7 +1795,16 @@ static __init void x2apic_disable(void)
}
__x2apic_disable();
- apic_set_fixmap();
+
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
+ /*
+ * Don't reread the APIC ID as it was already done from
+ * check_x2apic() and the APIC driver is still an x2APIC variant,
+ * which fails to do the read after x2APIC was disabled.
+ */
+ apic_set_fixmap(false);
}
static __init void x2apic_enable(void)
@@ -2091,17 +2065,16 @@ void __init init_apic_mappings(void)
pr_info("APIC: disable apic facility\n");
apic_disable();
}
- num_processors = 1;
}
}
-static __init void apic_set_fixmap(void)
+static __init void apic_set_fixmap(bool read_apic)
{
set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
apic_mmio_base = APIC_BASE;
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- apic_mmio_base, mp_lapic_addr);
- apic_read_boot_cpu_id(false);
+ apic_pr_verbose("Mapped APIC to %16lx (%16lx)\n", apic_mmio_base, mp_lapic_addr);
+ if (read_apic)
+ apic_read_boot_cpu_id(false);
}
void __init register_lapic_address(unsigned long address)
@@ -2111,7 +2084,7 @@ void __init register_lapic_address(unsigned long address)
mp_lapic_addr = address;
if (!x2apic_mode)
- apic_set_fixmap();
+ apic_set_fixmap(true);
}
/*
@@ -2202,18 +2175,17 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt)
apic_eoi();
atomic_inc(&irq_err_count);
- apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
- smp_processor_id(), v);
+ apic_pr_debug("APIC error on CPU%d: %02x", smp_processor_id(), v);
v &= 0xff;
while (v) {
if (v & 0x1)
- apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
+ apic_pr_debug_cont(" : %s", error_interrupt_reason[i]);
i++;
v >>= 1;
}
- apic_printk(APIC_DEBUG, KERN_CONT "\n");
+ apic_pr_debug_cont("\n");
trace_error_apic_exit(ERROR_APIC_VECTOR);
}
@@ -2233,8 +2205,7 @@ static void __init connect_bsp_APIC(void)
* PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
* local APIC to INT and NMI lines.
*/
- apic_printk(APIC_VERBOSE, "leaving PIC mode, "
- "enabling APIC mode.\n");
+ apic_pr_verbose("Leaving PIC mode, enabling APIC mode.\n");
imcr_pic_to_apic();
}
#endif
@@ -2259,8 +2230,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
* IPIs, won't work beyond this point! The only exception are
* INIT IPIs.
*/
- apic_printk(APIC_VERBOSE, "disabling APIC mode, "
- "entering PIC mode.\n");
+ apic_pr_verbose("Disabling APIC mode, entering PIC mode.\n");
imcr_apic_to_pic();
return;
}
@@ -2305,155 +2275,6 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}
-/*
- * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated
- * contiguously, it equals to current allocated max logical CPU ID plus 1.
- * All allocated CPU IDs should be in the [0, nr_logical_cpuids) range,
- * so the maximum of nr_logical_cpuids is nr_cpu_ids.
- *
- * NOTE: Reserve 0 for BSP.
- */
-static int nr_logical_cpuids = 1;
-
-/*
- * Used to store mapping between logical CPU IDs and APIC IDs.
- */
-u32 cpuid_to_apicid[] = { [0 ... NR_CPUS - 1] = BAD_APICID, };
-
-bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
-{
- return phys_id == (u64)cpuid_to_apicid[cpu];
-}
-
-#ifdef CONFIG_SMP
-static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
-{
- /* Isolate the SMT bit(s) in the APICID and check for 0 */
- u32 mask = (1U << (fls(smp_num_siblings) - 1)) - 1;
-
- if (smp_num_siblings == 1 || !(apicid & mask))
- cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
-}
-
-/*
- * Due to the utter mess of CPUID evaluation smp_num_siblings is not valid
- * during early boot. Initialize the primary thread mask before SMP
- * bringup.
- */
-static int __init smp_init_primary_thread_mask(void)
-{
- unsigned int cpu;
-
- /*
- * XEN/PV provides either none or useless topology information.
- * Pretend that all vCPUs are primary threads.
- */
- if (xen_pv_domain()) {
- cpumask_copy(&__cpu_primary_thread_mask, cpu_possible_mask);
- return 0;
- }
-
- for (cpu = 0; cpu < nr_logical_cpuids; cpu++)
- cpu_mark_primary_thread(cpu, cpuid_to_apicid[cpu]);
- return 0;
-}
-early_initcall(smp_init_primary_thread_mask);
-#else
-static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
-#endif
-
-/*
- * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids
- * and cpuid_to_apicid[] synchronized.
- */
-static int allocate_logical_cpuid(int apicid)
-{
- int i;
-
- /*
- * cpuid <-> apicid mapping is persistent, so when a cpu is up,
- * check if the kernel has allocated a cpuid for it.
- */
- for (i = 0; i < nr_logical_cpuids; i++) {
- if (cpuid_to_apicid[i] == apicid)
- return i;
- }
-
- /* Allocate a new cpuid. */
- if (nr_logical_cpuids >= nr_cpu_ids) {
- WARN_ONCE(1, "APIC: NR_CPUS/possible_cpus limit of %u reached. "
- "Processor %d/0x%x and the rest are ignored.\n",
- nr_cpu_ids, nr_logical_cpuids, apicid);
- return -EINVAL;
- }
-
- cpuid_to_apicid[nr_logical_cpuids] = apicid;
- return nr_logical_cpuids++;
-}
-
-static void cpu_update_apic(int cpu, u32 apicid)
-{
-#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
- early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
-#endif
- set_cpu_possible(cpu, true);
- physid_set(apicid, phys_cpu_present_map);
- set_cpu_present(cpu, true);
- num_processors++;
-
- if (system_state != SYSTEM_BOOTING)
- cpu_mark_primary_thread(cpu, apicid);
-}
-
-static __init void cpu_set_boot_apic(void)
-{
- cpuid_to_apicid[0] = boot_cpu_physical_apicid;
- cpu_update_apic(0, boot_cpu_physical_apicid);
- x86_32_probe_bigsmp_early();
-}
-
-int generic_processor_info(int apicid)
-{
- int cpu, max = nr_cpu_ids;
-
- /* The boot CPU must be set before MADT/MPTABLE parsing happens */
- if (cpuid_to_apicid[0] == BAD_APICID)
- panic("Boot CPU APIC not registered yet\n");
-
- if (apicid == boot_cpu_physical_apicid)
- return 0;
-
- if (disabled_cpu_apicid == apicid) {
- int thiscpu = num_processors + disabled_cpus;
-
- pr_warn("APIC: Disabling requested cpu. Processor %d/0x%x ignored.\n",
- thiscpu, apicid);
-
- disabled_cpus++;
- return -ENODEV;
- }
-
- if (num_processors >= nr_cpu_ids) {
- int thiscpu = max + disabled_cpus;
-
- pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. "
- "Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
-
- disabled_cpus++;
- return -EINVAL;
- }
-
- cpu = allocate_logical_cpuid(apicid);
- if (cpu < 0) {
- disabled_cpus++;
- return -EINVAL;
- }
-
- cpu_update_apic(cpu, apicid);
- return cpu;
-}
-
-
void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg,
bool dmar)
{
@@ -2496,10 +2317,7 @@ EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid);
static void __init apic_bsp_up_setup(void)
{
-#ifdef CONFIG_X86_64
- apic_write(APIC_ID, apic->set_apic_id(boot_cpu_physical_apicid));
-#endif
- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
+ reset_phys_cpu_present_map(boot_cpu_physical_apicid);
}
/**
@@ -2764,19 +2582,12 @@ int apic_is_clustered_box(void)
/*
* APIC command line parameters
*/
-static int __init setup_disableapic(char *arg)
+static int __init setup_nolapic(char *arg)
{
apic_is_disabled = true;
setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
-early_param("disableapic", setup_disableapic);
-
-/* same as disableapic, for compatibility */
-static int __init setup_nolapic(char *arg)
-{
- return setup_disableapic(arg);
-}
early_param("nolapic", setup_nolapic);
static int __init parse_lapic_timer_c2_ok(char *arg)
@@ -2845,15 +2656,6 @@ static int __init lapic_insert_resource(void)
*/
late_initcall(lapic_insert_resource);
-static int __init apic_set_disabled_cpu_apicid(char *arg)
-{
- if (!arg || !get_option(&arg, &disabled_cpu_apicid))
- return -EINVAL;
-
- return 0;
-}
-early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
-
static int __init apic_set_extnmi(char *arg)
{
if (!arg)
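
The apic.c changes above convert apic_printk(APIC_VERBOSE, ...) and
apic_printk(APIC_DEBUG, ...) call sites to apic_pr_verbose(), apic_pr_debug()
and apic_pr_debug_cont(). The wrapper macros themselves are added on the
header side and are not part of this hunk; a minimal sketch of what they
presumably expand to, inferred from the call sites:

	/* Assumed definitions -- the real ones live on the <asm/apic.h> side, not shown here. */
	#define apic_pr_verbose(fmt, ...)	apic_printk(APIC_VERBOSE, KERN_INFO fmt, ##__VA_ARGS__)
	#define apic_pr_debug(fmt, ...)		apic_printk(APIC_DEBUG, KERN_DEBUG fmt, ##__VA_ARGS__)
	#define apic_pr_debug_cont(fmt, ...)	apic_printk(APIC_DEBUG, KERN_CONT fmt, ##__VA_ARGS__)

This keeps the existing apic_verbosity filtering while shortening the call sites.
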
diff --git a/arch/x86/kernel/apic/apic_common.c b/arch/x86/kernel/apic/apic_common.c
index 8a00141073ea..9ef3be866832 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -18,16 +18,6 @@ u32 apic_flat_calc_apicid(unsigned int cpu)
return 1U << cpu;
}
-bool default_check_apicid_used(physid_mask_t *map, u32 apicid)
-{
- return physid_isset(apicid, *map);
-}
-
-void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
-{
- *retmap = *phys_map;
-}
-
u32 default_cpu_present_to_apicid(int mps_cpu)
{
if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
@@ -37,11 +27,6 @@ u32 default_cpu_present_to_apicid(int mps_cpu)
}
EXPORT_SYMBOL_GPL(default_cpu_present_to_apicid);
-bool default_apic_id_registered(void)
-{
- return physid_isset(read_apic_id(), phys_cpu_present_map);
-}
-
/*
* Set up the logical destination ID when the APIC operates in logical
* destination mode.
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 7139867d69cd..e0308d8c4e6c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -8,143 +8,25 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/cpumask.h>
#include <linux/export.h>
-#include <linux/acpi.h>
-#include <asm/jailhouse_para.h>
#include <asm/apic.h>
#include "local.h"
-static struct apic apic_physflat;
-static struct apic apic_flat;
-
-struct apic *apic __ro_after_init = &apic_flat;
-EXPORT_SYMBOL_GPL(apic);
-
-static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return 1;
-}
-
-static void _flat_send_IPI_mask(unsigned long mask, int vector)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
- local_irq_restore(flags);
-}
-
-static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
-{
- unsigned long mask = cpumask_bits(cpumask)[0];
-
- _flat_send_IPI_mask(mask, vector);
-}
-
-static void
-flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
-{
- unsigned long mask = cpumask_bits(cpumask)[0];
- int cpu = smp_processor_id();
-
- if (cpu < BITS_PER_LONG)
- __clear_bit(cpu, &mask);
-
- _flat_send_IPI_mask(mask, vector);
-}
-
-static u32 flat_get_apic_id(u32 x)
+static u32 physflat_get_apic_id(u32 x)
{
return (x >> 24) & 0xFF;
}
-static u32 set_apic_id(u32 id)
-{
- return (id & 0xFF) << 24;
-}
-
-static u32 flat_phys_pkg_id(u32 initial_apic_id, int index_msb)
-{
- return initial_apic_id >> index_msb;
-}
-
-static int flat_probe(void)
+static int physflat_probe(void)
{
return 1;
}
-static struct apic apic_flat __ro_after_init = {
- .name = "flat",
- .probe = flat_probe,
- .acpi_madt_oem_check = flat_acpi_madt_oem_check,
- .apic_id_registered = default_apic_id_registered,
-
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
- .dest_mode_logical = true,
-
- .disable_esr = 0,
-
- .init_apic_ldr = default_init_apic_ldr,
- .cpu_present_to_apicid = default_cpu_present_to_apicid,
- .phys_pkg_id = flat_phys_pkg_id,
-
- .max_apic_id = 0xFE,
- .get_apic_id = flat_get_apic_id,
- .set_apic_id = set_apic_id,
-
- .calc_dest_apicid = apic_flat_calc_apicid,
-
- .send_IPI = default_send_IPI_single,
- .send_IPI_mask = flat_send_IPI_mask,
- .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
- .send_IPI_allbutself = default_send_IPI_allbutself,
- .send_IPI_all = default_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
- .nmi_to_offline_cpu = true,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .eoi = native_apic_mem_eoi,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = apic_mem_wait_icr_idle,
- .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
-};
-
-/*
- * Physflat mode is used when there are more than 8 CPUs on a system.
- * We cannot use logical delivery in this case because the mask
- * overflows, so use physical mode.
- */
static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
-#ifdef CONFIG_ACPI
- /*
- * Quirk: some x86_64 machines can only use physical APIC mode
- * regardless of how many processors are present (x86_64 ES7000
- * is an example).
- */
- if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
- (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
- printk(KERN_DEBUG "system APIC only can use physical flat");
- return 1;
- }
-
- if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
- printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
- return 1;
- }
-#endif
-
- return 0;
-}
-
-static int physflat_probe(void)
-{
- return apic == &apic_physflat || num_possible_cpus() > 8 || jailhouse_paravirt();
+ return 1;
}
static struct apic apic_physflat __ro_after_init = {
@@ -152,19 +34,15 @@ static struct apic apic_physflat __ro_after_init = {
.name = "physical flat",
.probe = physflat_probe,
.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
- .apic_id_registered = default_apic_id_registered,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .phys_pkg_id = flat_phys_pkg_id,
.max_apic_id = 0xFE,
- .get_apic_id = flat_get_apic_id,
- .set_apic_id = set_apic_id,
+ .get_apic_id = physflat_get_apic_id,
.calc_dest_apicid = apic_default_calc_apicid,
@@ -184,8 +62,7 @@ static struct apic apic_physflat __ro_after_init = {
.wait_icr_idle = apic_mem_wait_icr_idle,
.safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout,
};
+apic_driver(apic_physflat);
-/*
- * We need to check for physflat first, so this order is important.
- */
-apic_drivers(apic_physflat, apic_flat);
+struct apic *apic __ro_after_init = &apic_physflat;
+EXPORT_SYMBOL_GPL(apic);
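
With apic_flat gone, physflat is the only remaining 64-bit flat driver and
physflat_get_apic_id() is the sole decoder of the xAPIC ID field here. As a
hedged illustration of what that decoding does (apic_read() and APIC_ID as in
the surrounding code; the helper name is hypothetical):

	/* Sketch: read the local APIC ID register and decode it like physflat_get_apic_id(). */
	static inline u32 read_xapic_id_sketch(void)
	{
		u32 raw = apic_read(APIC_ID);	/* the 8-bit xAPIC ID sits in bits 31:24 */

		return (raw >> 24) & 0xFF;
	}
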
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index b00d52ae84fa..b5bb7a2e8340 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -29,7 +29,6 @@ static void noop_send_IPI_self(int vector) { }
static void noop_apic_icr_write(u32 low, u32 id) { }
static int noop_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip) { return -1; }
static u64 noop_apic_icr_read(void) { return 0; }
-static u32 noop_phys_pkg_id(u32 cpuid_apic, int index_msb) { return 0; }
static u32 noop_get_apic_id(u32 apicid) { return 0; }
static void noop_apic_eoi(void) { }
@@ -47,17 +46,12 @@ static void noop_apic_write(u32 reg, u32 val)
struct apic apic_noop __ro_after_init = {
.name = "noop",
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
- .check_apicid_used = default_check_apicid_used,
- .ioapic_phys_id_map = default_ioapic_phys_id_map,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .phys_pkg_id = noop_phys_pkg_id,
-
.max_apic_id = 0xFE,
.get_apic_id = noop_get_apic_id,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 456a14c44f67..16410f087b7a 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -38,11 +38,6 @@ static u32 numachip1_get_apic_id(u32 x)
return id;
}
-static u32 numachip1_set_apic_id(u32 id)
-{
- return (id & 0xff) << 24;
-}
-
static u32 numachip2_get_apic_id(u32 x)
{
u64 mcfg;
@@ -51,16 +46,6 @@ static u32 numachip2_get_apic_id(u32 x)
return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24);
}
-static u32 numachip2_set_apic_id(u32 id)
-{
- return id << 24;
-}
-
-static u32 numachip_phys_pkg_id(u32 initial_apic_id, int index_msb)
-{
- return initial_apic_id >> index_msb;
-}
-
static void numachip1_apic_icr_write(int apicid, unsigned int val)
{
write_lcsr(CSR_G3_EXT_IRQ_GEN, (apicid << 16) | val);
@@ -222,17 +207,14 @@ static const struct apic apic_numachip1 __refconst = {
.probe = numachip1_probe,
.acpi_madt_oem_check = numachip1_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .phys_pkg_id = numachip_phys_pkg_id,
.max_apic_id = UINT_MAX,
.get_apic_id = numachip1_get_apic_id,
- .set_apic_id = numachip1_set_apic_id,
.calc_dest_apicid = apic_default_calc_apicid,
@@ -259,17 +241,14 @@ static const struct apic apic_numachip2 __refconst = {
.probe = numachip2_probe,
.acpi_madt_oem_check = numachip2_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .phys_pkg_id = numachip_phys_pkg_id,
.max_apic_id = UINT_MAX,
.get_apic_id = numachip2_get_apic_id,
- .set_apic_id = numachip2_set_apic_id,
.calc_dest_apicid = apic_default_calc_apicid,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 7ee3c486cb33..9285d500d5b4 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -18,22 +18,6 @@ static u32 bigsmp_get_apic_id(u32 x)
return (x >> 24) & 0xFF;
}
-static bool bigsmp_check_apicid_used(physid_mask_t *map, u32 apicid)
-{
- return false;
-}
-
-static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- physids_promote(0xFFL, retmap);
-}
-
-static u32 bigsmp_phys_pkg_id(u32 cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
static void bigsmp_send_IPI_allbutself(int vector)
{
default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
@@ -80,19 +64,14 @@ static struct apic apic_bigsmp __ro_after_init = {
.name = "bigsmp",
.probe = probe_bigsmp,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 1,
- .check_apicid_used = bigsmp_check_apicid_used,
- .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .phys_pkg_id = bigsmp_phys_pkg_id,
.max_apic_id = 0xFE,
.get_apic_id = bigsmp_get_apic_id,
- .set_apic_id = NULL,
.calc_dest_apicid = apic_default_calc_apicid,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 00da6cf6b07d..eebc360ed1bb 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -86,8 +86,8 @@ static unsigned int ioapic_dynirq_base;
static int ioapic_initialized;
struct irq_pin_list {
- struct list_head list;
- int apic, pin;
+ struct list_head list;
+ int apic, pin;
};
struct mp_chip_data {
@@ -96,7 +96,7 @@ struct mp_chip_data {
bool is_level;
bool active_low;
bool isa_irq;
- u32 count;
+ u32 count;
};
struct mp_ioapic_gsi {
@@ -105,21 +105,17 @@ struct mp_ioapic_gsi {
};
static struct ioapic {
- /*
- * # of IRQ routing registers
- */
- int nr_registers;
- /*
- * Saved state during suspend/resume, or while enabling intr-remap.
- */
- struct IO_APIC_route_entry *saved_registers;
+ /* # of IRQ routing registers */
+ int nr_registers;
+ /* Saved state during suspend/resume, or while enabling intr-remap. */
+ struct IO_APIC_route_entry *saved_registers;
/* I/O APIC config */
- struct mpc_ioapic mp_config;
+ struct mpc_ioapic mp_config;
/* IO APIC gsi routing info */
- struct mp_ioapic_gsi gsi_config;
- struct ioapic_domain_cfg irqdomain_cfg;
- struct irq_domain *irqdomain;
- struct resource *iomem_res;
+ struct mp_ioapic_gsi gsi_config;
+ struct ioapic_domain_cfg irqdomain_cfg;
+ struct irq_domain *irqdomain;
+ struct resource *iomem_res;
} ioapics[MAX_IO_APICS];
#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver
@@ -205,10 +201,9 @@ void mp_save_irq(struct mpc_intsrc *m)
{
int i;
- apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
- m->srcbusirq, m->dstapic, m->dstirq);
+ apic_pr_verbose("Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+ m->srcbusirq, m->dstapic, m->dstirq);
for (i = 0; i < mp_irq_entries; i++) {
if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
@@ -269,12 +264,14 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
writel(vector, &io_apic->eoi);
}
unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
+
writel(reg, &io_apic->index);
return readl(&io_apic->data);
}
@@ -300,14 +297,8 @@ static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
- struct IO_APIC_route_entry entry;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- entry = __ioapic_read_entry(apic, pin);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return entry;
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
+ return __ioapic_read_entry(apic, pin);
}
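
This and the following hunks replace open-coded raw_spin_lock_irqsave() /
raw_spin_unlock_irqrestore() pairs with the scope-based guard() and
scoped_guard() helpers from <linux/cleanup.h>, which release the lock when the
scope is left. A minimal sketch of the pattern (illustrative, not a specific
call site):

	/* Before: explicit flags variable and an unlock on every exit path. */
	unsigned long flags;

	raw_spin_lock_irqsave(&ioapic_lock, flags);
	entry = __ioapic_read_entry(apic, pin);
	raw_spin_unlock_irqrestore(&ioapic_lock, flags);

	/*
	 * After: the guard drops the lock automatically at end of scope,
	 * so early returns need no explicit unlock.
	 */
	guard(raw_spinlock_irqsave)(&ioapic_lock);
	return __ioapic_read_entry(apic, pin);
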
/*
@@ -324,11 +315,8 @@ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e
static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
__ioapic_write_entry(apic, pin, e);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -339,12 +327,10 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
static void ioapic_mask_entry(int apic, int pin)
{
struct IO_APIC_route_entry e = { .masked = true };
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
io_apic_write(apic, 0x10 + 2*pin, e.w1);
io_apic_write(apic, 0x11 + 2*pin, e.w2);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -352,68 +338,39 @@ static void ioapic_mask_entry(int apic, int pin)
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
-static int __add_pin_to_irq_node(struct mp_chip_data *data,
- int node, int apic, int pin)
+static bool add_pin_to_irq_node(struct mp_chip_data *data, int node, int apic, int pin)
{
struct irq_pin_list *entry;
- /* don't allow duplicates */
- for_each_irq_pin(entry, data->irq_2_pin)
+ /* Don't allow duplicates */
+ for_each_irq_pin(entry, data->irq_2_pin) {
if (entry->apic == apic && entry->pin == pin)
- return 0;
+ return true;
+ }
entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
if (!entry) {
- pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
- node, apic, pin);
- return -ENOMEM;
+ pr_err("Cannot allocate irq_pin_list (%d,%d,%d)\n", node, apic, pin);
+ return false;
}
+
entry->apic = apic;
entry->pin = pin;
list_add_tail(&entry->list, &data->irq_2_pin);
-
- return 0;
+ return true;
}
static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
{
struct irq_pin_list *tmp, *entry;
- list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list)
+ list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list) {
if (entry->apic == apic && entry->pin == pin) {
list_del(&entry->list);
kfree(entry);
return;
}
-}
-
-static void add_pin_to_irq_node(struct mp_chip_data *data,
- int node, int apic, int pin)
-{
- if (__add_pin_to_irq_node(data, node, apic, pin))
- panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
-}
-
-/*
- * Reroute an IRQ to a different pin.
- */
-static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
- int oldapic, int oldpin,
- int newapic, int newpin)
-{
- struct irq_pin_list *entry;
-
- for_each_irq_pin(entry, data->irq_2_pin) {
- if (entry->apic == oldapic && entry->pin == oldpin) {
- entry->apic = newapic;
- entry->pin = newpin;
- /* every one is different, right? */
- return;
- }
}
-
- /* old apic/pin didn't exist, so just add new ones */
- add_pin_to_irq_node(data, node, newapic, newpin);
}
static void io_apic_modify_irq(struct mp_chip_data *data, bool masked,
@@ -430,12 +387,12 @@ static void io_apic_modify_irq(struct mp_chip_data *data, bool masked,
}
}
+/*
+ * Synchronize the IO-APIC and the CPU by doing a dummy read from the
+ * IO-APIC
+ */
static void io_apic_sync(struct irq_pin_list *entry)
{
- /*
- * Synchronize the IO-APIC and the CPU by doing
- * a dummy read from the IO-APIC
- */
struct io_apic __iomem *io_apic;
io_apic = io_apic_base(entry->apic);
@@ -445,11 +402,9 @@ static void io_apic_sync(struct irq_pin_list *entry)
static void mask_ioapic_irq(struct irq_data *irq_data)
{
struct mp_chip_data *data = irq_data->chip_data;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
io_apic_modify_irq(data, true, &io_apic_sync);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void __unmask_ioapic(struct mp_chip_data *data)
@@ -460,11 +415,9 @@ static void __unmask_ioapic(struct mp_chip_data *data)
static void unmask_ioapic_irq(struct irq_data *irq_data)
{
struct mp_chip_data *data = irq_data->chip_data;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
__unmask_ioapic(data);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -492,30 +445,24 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector)
entry = entry1 = __ioapic_read_entry(apic, pin);
- /*
- * Mask the entry and change the trigger mode to edge.
- */
+ /* Mask the entry and change the trigger mode to edge. */
entry1.masked = true;
entry1.is_level = false;
__ioapic_write_entry(apic, pin, entry1);
- /*
- * Restore the previous level triggered entry.
- */
+ /* Restore the previous level triggered entry. */
__ioapic_write_entry(apic, pin, entry);
}
}
static void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
{
- unsigned long flags;
struct irq_pin_list *entry;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
for_each_irq_pin(entry, data->irq_2_pin)
__eoi_ioapic_pin(entry->apic, entry->pin, vector);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
@@ -538,8 +485,6 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
}
if (entry.irr) {
- unsigned long flags;
-
/*
* Make sure the trigger mode is set to level. Explicit EOI
* doesn't clear the remote-IRR if the trigger mode is not
@@ -549,9 +494,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
entry.is_level = true;
ioapic_write_entry(apic, pin, entry);
}
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
__eoi_ioapic_pin(apic, pin, entry.vector);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
@@ -586,28 +530,23 @@ static int pirq_entries[MAX_PIRQS] = {
static int __init ioapic_pirq_setup(char *str)
{
- int i, max;
- int ints[MAX_PIRQS+1];
+ int i, max, ints[MAX_PIRQS+1];
get_options(str, ARRAY_SIZE(ints), ints);
- apic_printk(APIC_VERBOSE, KERN_INFO
- "PIRQ redirection, working around broken MP-BIOS.\n");
+ apic_pr_verbose("PIRQ redirection, working around broken MP-BIOS.\n");
+
max = MAX_PIRQS;
if (ints[0] < MAX_PIRQS)
max = ints[0];
for (i = 0; i < max; i++) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
- /*
- * PIRQs are mapped upside down, usually.
- */
+ apic_pr_verbose("... PIRQ%d -> IRQ %d\n", i, ints[i + 1]);
+ /* PIRQs are mapped upside down, usually */
pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
}
return 1;
}
-
__setup("pirq=", ioapic_pirq_setup);
#endif /* CONFIG_X86_32 */
@@ -626,8 +565,7 @@ int save_ioapic_entries(void)
}
for_each_pin(apic, pin)
- ioapics[apic].saved_registers[pin] =
- ioapic_read_entry(apic, pin);
+ ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin);
}
return err;
@@ -668,8 +606,7 @@ int restore_ioapic_entries(void)
continue;
for_each_pin(apic, pin)
- ioapic_write_entry(apic, pin,
- ioapics[apic].saved_registers[pin]);
+ ioapic_write_entry(apic, pin, ioapics[apic].saved_registers[pin]);
}
return 0;
}
@@ -681,12 +618,13 @@ static int find_irq_entry(int ioapic_idx, int pin, int type)
{
int i;
- for (i = 0; i < mp_irq_entries; i++)
+ for (i = 0; i < mp_irq_entries; i++) {
if (mp_irqs[i].irqtype == type &&
(mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) ||
mp_irqs[i].dstapic == MP_APIC_ALL) &&
mp_irqs[i].dstirq == pin)
return i;
+ }
return -1;
}
@@ -701,10 +639,8 @@ static int __init find_isa_irq_pin(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].srcbus;
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].irqtype == type) &&
+ if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) &&
(mp_irqs[i].srcbusirq == irq))
-
return mp_irqs[i].dstirq;
}
return -1;
@@ -717,8 +653,7 @@ static int __init find_isa_irq_apic(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].srcbus;
- if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].irqtype == type) &&
+ if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].irqtype == type) &&
(mp_irqs[i].srcbusirq == irq))
break;
}
@@ -726,9 +661,10 @@ static int __init find_isa_irq_apic(int irq, int type)
if (i < mp_irq_entries) {
int ioapic_idx;
- for_each_ioapic(ioapic_idx)
+ for_each_ioapic(ioapic_idx) {
if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
return ioapic_idx;
+ }
}
return -1;
@@ -769,8 +705,7 @@ static bool EISA_ELCR(unsigned int irq)
unsigned int port = PIC_ELCR1 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
- apic_printk(APIC_VERBOSE, KERN_INFO
- "Broken MPtable reports ISA irq %d\n", irq);
+ apic_pr_verbose("Broken MPtable reports ISA irq %d\n", irq);
return false;
}
@@ -947,9 +882,9 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
struct irq_alloc_info *info)
{
+ int type = ioapics[ioapic].irqdomain_cfg.type;
bool legacy = false;
int irq = -1;
- int type = ioapics[ioapic].irqdomain_cfg.type;
switch (type) {
case IOAPIC_DOMAIN_LEGACY:
@@ -971,8 +906,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
return -1;
}
- return __irq_domain_alloc_irqs(domain, irq, 1,
- ioapic_alloc_attr_node(info),
+ return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info),
info, legacy, NULL);
}
@@ -986,29 +920,26 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
* PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
* multiple pins sharing the same legacy IRQ number when ACPI is disabled.
*/
-static int alloc_isa_irq_from_domain(struct irq_domain *domain,
- int irq, int ioapic, int pin,
+static int alloc_isa_irq_from_domain(struct irq_domain *domain, int irq, int ioapic, int pin,
struct irq_alloc_info *info)
{
- struct mp_chip_data *data;
struct irq_data *irq_data = irq_get_irq_data(irq);
int node = ioapic_alloc_attr_node(info);
+ struct mp_chip_data *data;
/*
* Legacy ISA IRQ has already been allocated, just add pin to
* the pin list associated with this IRQ and program the IOAPIC
- * entry. The IOAPIC entry
+ * entry.
*/
if (irq_data && irq_data->parent_data) {
if (!mp_check_pin_attr(irq, info))
return -EBUSY;
- if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic,
- info->ioapic.pin))
+ if (!add_pin_to_irq_node(irq_data->chip_data, node, ioapic, info->ioapic.pin))
return -ENOMEM;
} else {
info->flags |= X86_IRQ_ALLOC_LEGACY;
- irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true,
- NULL);
+ irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, NULL);
if (irq >= 0) {
irq_data = irq_domain_get_irq_data(domain, irq);
data = irq_data->chip_data;
@@ -1022,11 +953,11 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain,
static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
unsigned int flags, struct irq_alloc_info *info)
{
- int irq;
- bool legacy = false;
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
struct irq_alloc_info tmp;
struct mp_chip_data *data;
- struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ bool legacy = false;
+ int irq;
if (!domain)
return -ENOSYS;
@@ -1046,7 +977,7 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
return -EINVAL;
}
- mutex_lock(&ioapic_mutex);
+ guard(mutex)(&ioapic_mutex);
if (!(flags & IOAPIC_MAP_ALLOC)) {
if (!legacy) {
irq = irq_find_mapping(domain, pin);
@@ -1067,8 +998,6 @@ static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
data->count++;
}
}
- mutex_unlock(&ioapic_mutex);
-
return irq;
}
@@ -1076,26 +1005,20 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
{
u32 gsi = mp_pin_to_gsi(ioapic, pin);
- /*
- * Debugging check, we are in big trouble if this message pops up!
- */
+ /* Debugging check, we are in big trouble if this message pops up! */
if (mp_irqs[idx].dstirq != pin)
pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
#ifdef CONFIG_X86_32
- /*
- * PCI IRQ command line redirection. Yes, limits are hardcoded.
- */
+ /* PCI IRQ command line redirection. Yes, limits are hardcoded. */
if ((pin >= 16) && (pin <= 23)) {
- if (pirq_entries[pin-16] != -1) {
- if (!pirq_entries[pin-16]) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "disabling PIRQ%d\n", pin-16);
+ if (pirq_entries[pin - 16] != -1) {
+ if (!pirq_entries[pin - 16]) {
+ apic_pr_verbose("Disabling PIRQ%d\n", pin - 16);
} else {
int irq = pirq_entries[pin-16];
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "using PIRQ%d -> IRQ %d\n",
- pin-16, irq);
+
+ apic_pr_verbose("Using PIRQ%d -> IRQ %d\n", pin - 16, irq);
return irq;
}
}
@@ -1133,10 +1056,9 @@ void mp_unmap_irq(int irq)
if (!data || data->isa_irq)
return;
- mutex_lock(&ioapic_mutex);
+ guard(mutex)(&ioapic_mutex);
if (--data->count == 0)
irq_domain_free_irqs(irq, 1);
- mutex_unlock(&ioapic_mutex);
}
/*
@@ -1147,12 +1069,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
{
int irq, i, best_ioapic = -1, best_idx = -1;
- apic_printk(APIC_DEBUG,
- "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
- bus, slot, pin);
+ apic_pr_debug("Querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
if (test_bit(bus, mp_bus_not_pci)) {
- apic_printk(APIC_VERBOSE,
- "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ apic_pr_verbose("PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
@@ -1197,8 +1117,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
return -1;
out:
- return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq,
- IOAPIC_MAP_ALLOC);
+ return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq, IOAPIC_MAP_ALLOC);
}
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
@@ -1209,17 +1128,16 @@ static void __init setup_IO_APIC_irqs(void)
unsigned int ioapic, pin;
int idx;
- apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+ apic_pr_verbose("Init IO_APIC IRQs\n");
for_each_ioapic_pin(ioapic, pin) {
idx = find_irq_entry(ioapic, pin, mp_INT);
- if (idx < 0)
- apic_printk(APIC_VERBOSE,
- KERN_DEBUG " apic %d pin %d not connected\n",
- mpc_ioapic_id(ioapic), pin);
- else
- pin_2_irq(idx, ioapic, pin,
- ioapic ? 0 : IOAPIC_MAP_ALLOC);
+ if (idx < 0) {
+ apic_pr_verbose("apic %d pin %d not connected\n",
+ mpc_ioapic_id(ioapic), pin);
+ } else {
+ pin_2_irq(idx, ioapic, pin, ioapic ? 0 : IOAPIC_MAP_ALLOC);
+ }
}
}
@@ -1234,26 +1152,21 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
char buf[256];
int i;
- printk(KERN_DEBUG "IOAPIC %d:\n", apic);
+ apic_dbg("IOAPIC %d:\n", apic);
for (i = 0; i <= nr_entries; i++) {
entry = ioapic_read_entry(apic, i);
- snprintf(buf, sizeof(buf),
- " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
- i,
- entry.masked ? "disabled" : "enabled ",
+ snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+ i, entry.masked ? "disabled" : "enabled ",
entry.is_level ? "level" : "edge ",
entry.active_low ? "low " : "high",
entry.vector, entry.irr, entry.delivery_status);
if (entry.ir_format) {
- printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n",
- buf,
- (entry.ir_index_15 << 15) | entry.ir_index_0_14,
- entry.ir_zero);
+ apic_dbg("%s, remapped, I(%04X), Z(%X)\n", buf,
+ (entry.ir_index_15 << 15) | entry.ir_index_0_14, entry.ir_zero);
} else {
- printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n", buf,
- entry.dest_mode_logical ? "logical " : "physical",
- entry.virt_destid_8_14, entry.destid_0_7,
- entry.delivery_mode);
+ apic_dbg("%s, %s, D(%02X%02X), M(%1d)\n", buf,
+ entry.dest_mode_logical ? "logical " : "physical",
+ entry.virt_destid_8_14, entry.destid_0_7, entry.delivery_mode);
}
}
}
@@ -1264,30 +1177,25 @@ static void __init print_IO_APIC(int ioapic_idx)
union IO_APIC_reg_01 reg_01;
union IO_APIC_reg_02 reg_02;
union IO_APIC_reg_03 reg_03;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic_idx, 0);
- reg_01.raw = io_apic_read(ioapic_idx, 1);
- if (reg_01.bits.version >= 0x10)
- reg_02.raw = io_apic_read(ioapic_idx, 2);
- if (reg_01.bits.version >= 0x20)
- reg_03.raw = io_apic_read(ioapic_idx, 3);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
- printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
- printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
- printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
- printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
-
- printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
- printk(KERN_DEBUG "....... : max redirection entries: %02X\n",
- reg_01.bits.entries);
-
- printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
- printk(KERN_DEBUG "....... : IO APIC version: %02X\n",
- reg_01.bits.version);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ reg_01.raw = io_apic_read(ioapic_idx, 1);
+ if (reg_01.bits.version >= 0x10)
+ reg_02.raw = io_apic_read(ioapic_idx, 2);
+ if (reg_01.bits.version >= 0x20)
+ reg_03.raw = io_apic_read(ioapic_idx, 3);
+ }
+
+ apic_dbg("IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx));
+ apic_dbg(".... register #00: %08X\n", reg_00.raw);
+ apic_dbg("....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ apic_dbg("....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
+ apic_dbg("....... : LTS : %X\n", reg_00.bits.LTS);
+ apic_dbg(".... register #01: %08X\n", *(int *)&reg_01);
+ apic_dbg("....... : max redirection entries: %02X\n", reg_01.bits.entries);
+ apic_dbg("....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+ apic_dbg("....... : IO APIC version: %02X\n", reg_01.bits.version);
/*
* Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1295,8 +1203,8 @@ static void __init print_IO_APIC(int ioapic_idx)
* value, so ignore it if reg_02 == reg_01.
*/
if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
- printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+ apic_dbg(".... register #02: %08X\n", reg_02.raw);
+ apic_dbg("....... : arbitration: %02X\n", reg_02.bits.arbitration);
}
/*
@@ -1306,11 +1214,11 @@ static void __init print_IO_APIC(int ioapic_idx)
*/
if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
reg_03.raw != reg_01.raw) {
- printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
- printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
+ apic_dbg(".... register #03: %08X\n", reg_03.raw);
+ apic_dbg("....... : Boot DT : %X\n", reg_03.bits.boot_DT);
}
- printk(KERN_DEBUG ".... IRQ redirection table:\n");
+ apic_dbg(".... IRQ redirection table:\n");
io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
}
@@ -1319,11 +1227,11 @@ void __init print_IO_APICs(void)
int ioapic_idx;
unsigned int irq;
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
- for_each_ioapic(ioapic_idx)
- printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mpc_ioapic_id(ioapic_idx),
- ioapics[ioapic_idx].nr_registers);
+ apic_dbg("number of MP IRQ sources: %d.\n", mp_irq_entries);
+ for_each_ioapic(ioapic_idx) {
+ apic_dbg("number of IO-APIC #%d registers: %d.\n",
+ mpc_ioapic_id(ioapic_idx), ioapics[ioapic_idx].nr_registers);
+ }
/*
* We are a bit conservative about what we expect. We have to
@@ -1334,7 +1242,7 @@ void __init print_IO_APICs(void)
for_each_ioapic(ioapic_idx)
print_IO_APIC(ioapic_idx);
- printk(KERN_DEBUG "IRQ to pin mappings:\n");
+ apic_dbg("IRQ to pin mappings:\n");
for_each_active_irq(irq) {
struct irq_pin_list *entry;
struct irq_chip *chip;
@@ -1349,7 +1257,7 @@ void __init print_IO_APICs(void)
if (list_empty(&data->irq_2_pin))
continue;
- printk(KERN_DEBUG "IRQ%d ", irq);
+ apic_dbg("IRQ%d ", irq);
for_each_irq_pin(entry, data->irq_2_pin)
pr_cont("-> %d:%d", entry->apic, entry->pin);
pr_cont("\n");
@@ -1363,8 +1271,7 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
void __init enable_IO_APIC(void)
{
- int i8259_apic, i8259_pin;
- int apic, pin;
+ int i8259_apic, i8259_pin, apic, pin;
if (ioapic_is_disabled)
nr_ioapics = 0;
@@ -1376,19 +1283,21 @@ void __init enable_IO_APIC(void)
/* See if any of the pins is in ExtINT mode */
struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin);
- /* If the interrupt line is enabled and in ExtInt mode
- * I have found the pin where the i8259 is connected.
+ /*
+		 * If the interrupt line is enabled and in ExtInt mode, we
+		 * have found the pin where the i8259 is connected.
*/
- if (!entry.masked &&
- entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) {
+ if (!entry.masked && entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) {
ioapic_i8259.apic = apic;
ioapic_i8259.pin = pin;
- goto found_i8259;
+ break;
}
}
- found_i8259:
- /* Look to see what if the MP table has reported the ExtINT */
- /* If we could not find the appropriate pin by looking at the ioapic
+
+ /*
+	 * Look to see if the MP table has reported the ExtINT
+	 *
+	 * If we could not find the appropriate pin by looking at the ioapic,
	 * the i8259 is probably not connected to the ioapic, but give the
* mptable a chance anyway.
*/
@@ -1396,29 +1305,24 @@ void __init enable_IO_APIC(void)
i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
/* Trust the MP table if nothing is setup in the hardware */
if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
- printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+ pr_warn("ExtINT not setup in hardware but reported by MP table\n");
ioapic_i8259.pin = i8259_pin;
ioapic_i8259.apic = i8259_apic;
}
/* Complain if the MP table and the hardware disagree */
if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
- (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
- {
- printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
- }
+ (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+ pr_warn("ExtINT in hardware and MP table differ\n");
- /*
- * Do not trust the IO-APIC being empty at bootup
- */
+ /* Do not trust the IO-APIC being empty at bootup */
clear_IO_APIC();
}
void native_restore_boot_irq_mode(void)
{
/*
- * If the i8259 is routed through an IOAPIC
- * Put that IOAPIC in virtual wire mode
- * so legacy interrupts can be delivered.
+	 * If the i8259 is routed through an IOAPIC, put that IOAPIC in
+	 * virtual wire mode so legacy interrupts can be delivered.
*/
if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
@@ -1433,9 +1337,7 @@ void native_restore_boot_irq_mode(void)
entry.destid_0_7 = apic_id & 0xFF;
entry.virt_destid_8_14 = apic_id >> 8;
- /*
- * Add it to the IO-APIC irq-routing table:
- */
+ /* Add it to the IO-APIC irq-routing table */
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
@@ -1458,37 +1360,34 @@ void restore_boot_irq_mode(void)
*
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
*/
-void __init setup_ioapic_ids_from_mpc_nocheck(void)
+static void __init setup_ioapic_ids_from_mpc_nocheck(void)
{
+ DECLARE_BITMAP(phys_id_present_map, MAX_LOCAL_APIC);
+ const u32 broadcast_id = 0xF;
union IO_APIC_reg_00 reg_00;
- physid_mask_t phys_id_present_map;
- int ioapic_idx;
- int i;
unsigned char old_id;
- unsigned long flags;
+ int ioapic_idx, i;
/*
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
*/
- apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
+ copy_phys_cpu_present_map(phys_id_present_map);
/*
* Set the IOAPIC ID to the value stored in the MPC table.
*/
for_each_ioapic(ioapic_idx) {
/* Read the register 0 value */
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic_idx, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
old_id = mpc_ioapic_id(ioapic_idx);
- if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- ioapic_idx, mpc_ioapic_id(ioapic_idx));
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- reg_00.bits.ID);
+ if (mpc_ioapic_id(ioapic_idx) >= broadcast_id) {
+ pr_err(FW_BUG "IO-APIC#%d ID is %d in the MPC table!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ pr_err("... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID);
ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID;
}
@@ -1497,61 +1396,54 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
* system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
- if (apic->check_apicid_used(&phys_id_present_map,
- mpc_ioapic_id(ioapic_idx))) {
- printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- ioapic_idx, mpc_ioapic_id(ioapic_idx));
- for (i = 0; i < get_physical_broadcast(); i++)
- if (!physid_isset(i, phys_id_present_map))
+ if (test_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map)) {
+ pr_err(FW_BUG "IO-APIC#%d ID %d is already used!...\n",
+ ioapic_idx, mpc_ioapic_id(ioapic_idx));
+ for (i = 0; i < broadcast_id; i++)
+ if (!test_bit(i, phys_id_present_map))
break;
- if (i >= get_physical_broadcast())
+ if (i >= broadcast_id)
panic("Max APIC ID exceeded!\n");
- printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
- i);
- physid_set(i, phys_id_present_map);
+ pr_err("... fixing up to %d. (tell your hw vendor)\n", i);
+ set_bit(i, phys_id_present_map);
ioapics[ioapic_idx].mp_config.apicid = i;
} else {
- apic_printk(APIC_VERBOSE, "Setting %d in the phys_id_present_map\n",
- mpc_ioapic_id(ioapic_idx));
- physid_set(mpc_ioapic_id(ioapic_idx), phys_id_present_map);
+ apic_pr_verbose("Setting %d in the phys_id_present_map\n",
+ mpc_ioapic_id(ioapic_idx));
+ set_bit(mpc_ioapic_id(ioapic_idx), phys_id_present_map);
}
/*
- * We need to adjust the IRQ routing table
- * if the ID changed.
+ * We need to adjust the IRQ routing table if the ID
+ * changed.
*/
- if (old_id != mpc_ioapic_id(ioapic_idx))
- for (i = 0; i < mp_irq_entries; i++)
+ if (old_id != mpc_ioapic_id(ioapic_idx)) {
+ for (i = 0; i < mp_irq_entries; i++) {
if (mp_irqs[i].dstapic == old_id)
- mp_irqs[i].dstapic
- = mpc_ioapic_id(ioapic_idx);
+ mp_irqs[i].dstapic = mpc_ioapic_id(ioapic_idx);
+ }
+ }
/*
- * Update the ID register according to the right value
- * from the MPC table if they are different.
+ * Update the ID register according to the right value from
+ * the MPC table if they are different.
*/
if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID)
continue;
- apic_printk(APIC_VERBOSE, KERN_INFO
- "...changing IO-APIC physical APIC ID to %d ...",
- mpc_ioapic_id(ioapic_idx));
+ apic_pr_verbose("...changing IO-APIC physical APIC ID to %d ...",
+ mpc_ioapic_id(ioapic_idx));
reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic_idx, 0, reg_00.raw);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
- /*
- * Sanity check
- */
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic_idx, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(ioapic_idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic_idx, 0);
+ }
+ /* Sanity check */
if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx))
pr_cont("could not set ID!\n");
else
- apic_printk(APIC_VERBOSE, " ok.\n");
+ apic_pr_verbose(" ok.\n");
}
}
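
Note that the hunk above also replaces the physid_mask_t bookkeeping
(apic->ioapic_phys_id_map() / check_apicid_used() / physid_set()) with a plain
bitmap seeded from copy_phys_cpu_present_map() and capped at the 0xF broadcast
ID. A hedged sketch of the resulting duplicate-ID handling, using a
hypothetical helper name:

	/* Sketch: claim a free IO-APIC ID below the broadcast ID, as done above. */
	static int claim_free_ioapic_id_sketch(unsigned long *id_map, u32 broadcast_id)
	{
		unsigned int i;

		for (i = 0; i < broadcast_id; i++) {
			if (!test_bit(i, id_map)) {
				set_bit(i, id_map);	/* reserve it for this IO-APIC */
				return i;
			}
		}
		return -1;	/* no free ID left; the caller panics in that case */
	}
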
@@ -1596,8 +1488,7 @@ static void __init delay_with_tsc(void)
do {
rep_nop();
now = rdtsc();
- } while ((now - start) < 40000000000ULL / HZ &&
- time_before_eq(jiffies, end));
+ } while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end));
}
static void __init delay_without_tsc(void)
@@ -1658,36 +1549,29 @@ static int __init timer_irq_works(void)
* so we 'resend' these IRQs via IPIs, to the same CPU. It's much
* better to do it this way as thus we do not have to be aware of
* 'pending' interrupts in the IRQ path, except at this point.
- */
-/*
- * Edge triggered needs to resend any interrupt
- * that was delayed but this is now handled in the device
- * independent code.
- */
-
-/*
- * Starting up a edge-triggered IO-APIC interrupt is
- * nasty - we need to make sure that we get the edge.
- * If it is already asserted for some reason, we need
- * return 1 to indicate that is was pending.
*
- * This is not complete - we should be able to fake
- * an edge even if it isn't on the 8259A...
+ *
+ * Edge triggered interrupts need any delayed interrupt to be resent, but
+ * this is now handled in the device independent code.
+ *
+ * Starting up an edge-triggered IO-APIC interrupt is nasty - we need to
+ * make sure that we get the edge. If it is already asserted for some
+ * reason, we need to return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake an edge even if it
+ * isn't on the 8259A...
*/
static unsigned int startup_ioapic_irq(struct irq_data *data)
{
int was_pending = 0, irq = data->irq;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
if (irq < nr_legacy_irqs()) {
legacy_pic->mask(irq);
if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
__unmask_ioapic(data->chip_data);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
return was_pending;
}
@@ -1697,9 +1581,8 @@ atomic_t irq_mis_count;
static bool io_apic_level_ack_pending(struct mp_chip_data *data)
{
struct irq_pin_list *entry;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
for_each_irq_pin(entry, data->irq_2_pin) {
struct IO_APIC_route_entry e;
int pin;
@@ -1707,13 +1590,9 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
pin = entry->pin;
e.w1 = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
- if (e.irr) {
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (e.irr)
return true;
- }
}
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-
return false;
}
@@ -1731,7 +1610,8 @@ static inline bool ioapic_prepare_move(struct irq_data *data)
static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
if (unlikely(moveit)) {
- /* Only migrate the irq if the ack has been received.
+ /*
+ * Only migrate the irq if the ack has been received.
*
* On rare occasions the broadcast level triggered ack gets
* delayed going to ioapics, and if we reprogram the
@@ -1914,18 +1794,16 @@ static void ioapic_configure_entry(struct irq_data *irqd)
__ioapic_write_entry(entry->apic, entry->pin, mpd->entry);
}
-static int ioapic_set_affinity(struct irq_data *irq_data,
- const struct cpumask *mask, bool force)
+static int ioapic_set_affinity(struct irq_data *irq_data, const struct cpumask *mask, bool force)
{
struct irq_data *parent = irq_data->parent_data;
- unsigned long flags;
int ret;
ret = parent->chip->irq_set_affinity(parent, mask, force);
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE)
ioapic_configure_entry(irq_data);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return ret;
}
@@ -1944,9 +1822,8 @@ static int ioapic_set_affinity(struct irq_data *irq_data,
*
* Verify that the corresponding Remote-IRR bits are clear.
*/
-static int ioapic_irq_get_chip_state(struct irq_data *irqd,
- enum irqchip_irq_state which,
- bool *state)
+static int ioapic_irq_get_chip_state(struct irq_data *irqd, enum irqchip_irq_state which,
+ bool *state)
{
struct mp_chip_data *mcd = irqd->chip_data;
struct IO_APIC_route_entry rentry;
@@ -1956,7 +1833,8 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd,
return -EINVAL;
*state = false;
- raw_spin_lock(&ioapic_lock);
+
+ guard(raw_spinlock)(&ioapic_lock);
for_each_irq_pin(p, mcd->irq_2_pin) {
rentry = __ioapic_read_entry(p->apic, p->pin);
/*
@@ -1970,7 +1848,6 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd,
break;
}
}
- raw_spin_unlock(&ioapic_lock);
return 0;
}
@@ -1984,7 +1861,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
.irq_set_affinity = ioapic_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_get_irqchip_state = ioapic_irq_get_chip_state,
- .flags = IRQCHIP_SKIP_SET_WAKE |
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED |
IRQCHIP_AFFINITY_PRE_STARTUP,
};
@@ -2011,14 +1888,13 @@ static inline void init_IO_APIC_traps(void)
cfg = irq_cfg(irq);
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
- * Hmm.. We don't have an entry for this,
- * so default to an old-fashioned 8259
- * interrupt if we can..
+ * Hmm.. We don't have an entry for this, so
+ * default to an old-fashioned 8259 interrupt if we
+ * can. Otherwise set the dummy interrupt chip.
*/
if (irq < nr_legacy_irqs())
legacy_pic->make_irq(irq);
else
- /* Strange. Oh, well.. */
irq_set_chip(irq, &no_irq_chip);
}
}
@@ -2027,20 +1903,17 @@ static inline void init_IO_APIC_traps(void)
/*
* The local APIC irq-chip implementation:
*/
-
static void mask_lapic_irq(struct irq_data *data)
{
- unsigned long v;
+ unsigned long v = apic_read(APIC_LVT0);
- v = apic_read(APIC_LVT0);
apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
}
static void unmask_lapic_irq(struct irq_data *data)
{
- unsigned long v;
+ unsigned long v = apic_read(APIC_LVT0);
- v = apic_read(APIC_LVT0);
apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
@@ -2059,8 +1932,7 @@ static struct irq_chip lapic_chip __read_mostly = {
static void lapic_register_intr(int irq)
{
irq_clear_status_flags(irq, IRQ_LEVEL);
- irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
- "edge");
+ irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge");
}
/*
@@ -2072,9 +1944,9 @@ static void lapic_register_intr(int irq)
*/
static inline void __init unlock_ExtINT_logic(void)
{
- int apic, pin, i;
- struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
+ struct IO_APIC_route_entry entry0, entry1;
+ int apic, pin, i;
u32 apic_id;
pin = find_isa_irq_pin(8, mp_INT);
@@ -2134,10 +2006,10 @@ static int __init disable_timer_pin_setup(char *arg)
}
early_param("disable_timer_pin_1", disable_timer_pin_setup);
-static int mp_alloc_timer_irq(int ioapic, int pin)
+static int __init mp_alloc_timer_irq(int ioapic, int pin)
{
- int irq = -1;
struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ int irq = -1;
if (domain) {
struct irq_alloc_info info;
@@ -2145,21 +2017,36 @@ static int mp_alloc_timer_irq(int ioapic, int pin)
ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
info.devid = mpc_ioapic_id(ioapic);
info.ioapic.pin = pin;
- mutex_lock(&ioapic_mutex);
+ guard(mutex)(&ioapic_mutex);
irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
- mutex_unlock(&ioapic_mutex);
}
return irq;
}
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry;
+
+ for_each_irq_pin(entry, data->irq_2_pin) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ return;
+ }
+ }
+
+ /* Old apic/pin didn't exist, so just add a new one */
+ add_pin_to_irq_node(data, node, newapic, newpin);
+}
+
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
* is so screwy. Thanks to Brian Perkins for testing/hacking this beast
* fanatically on his truly buggy board.
- *
- * FIXME: really need to revamp this for all platforms.
*/
static inline void __init check_timer(void)
{
@@ -2197,9 +2084,8 @@ static inline void __init check_timer(void)
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
- "apic1=%d pin1=%d apic2=%d pin2=%d\n",
- cfg->vector, apic1, pin1, apic2, pin2);
+ pr_info("..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
+ cfg->vector, apic1, pin1, apic2, pin2);
/*
* Some BIOS writers are clueless and report the ExtINTA
@@ -2209,7 +2095,7 @@ static inline void __init check_timer(void)
* 8259A.
*/
if (pin1 == -1) {
- panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC");
+ panic_if_irq_remap(FW_BUG "Timer not connected to IO-APIC");
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
@@ -2243,13 +2129,10 @@ static inline void __init check_timer(void)
panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC");
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
- apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
- "8254 timer not connected to IO-APIC\n");
+ pr_err("..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
- apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
- "(IRQ0) through the 8259A ...\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "..... (found apic %d pin %d) ...\n", apic2, pin2);
+ pr_info("...trying to set up timer (IRQ0) through the 8259A ...\n");
+ pr_info("..... (found apic %d pin %d) ...\n", apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
@@ -2258,7 +2141,7 @@ static inline void __init check_timer(void)
irq_domain_activate_irq(irq_data, false);
legacy_pic->unmask(0);
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
+ pr_info("....... works.\n");
goto out;
}
/*
@@ -2266,26 +2149,24 @@ static inline void __init check_timer(void)
*/
legacy_pic->mask(0);
clear_IO_APIC_pin(apic2, pin2);
- apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
+ pr_info("....... failed.\n");
}
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as Virtual Wire IRQ...\n");
+ pr_info("...trying to set up timer as Virtual Wire IRQ...\n");
lapic_register_intr(0);
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
legacy_pic->unmask(0);
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ pr_info("..... works.\n");
goto out;
}
legacy_pic->mask(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
- apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
+ pr_info("..... failed.\n");
- apic_printk(APIC_QUIET, KERN_INFO
- "...trying to set up timer as ExtINT IRQ...\n");
+ pr_info("...trying to set up timer as ExtINT IRQ...\n");
legacy_pic->init(0);
legacy_pic->make_irq(0);
@@ -2295,14 +2176,15 @@ static inline void __init check_timer(void)
unlock_ExtINT_logic();
if (timer_irq_works()) {
- apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
+ pr_info("..... works.\n");
goto out;
}
- apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
- if (apic_is_x2apic_enabled())
- apic_printk(APIC_QUIET, KERN_INFO
- "Perhaps problem with the pre-enabled x2apic mode\n"
- "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
+
+	pr_info("..... failed :(.\n");
+ if (apic_is_x2apic_enabled()) {
+ pr_info("Perhaps problem with the pre-enabled x2apic mode\n"
+ "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
+ }
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
out:
@@ -2330,11 +2212,11 @@ out:
static int mp_irqdomain_create(int ioapic)
{
- struct irq_domain *parent;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
int hwirqs = mp_ioapic_pin_count(ioapic);
struct ioapic *ip = &ioapics[ioapic];
struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
- struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ struct irq_domain *parent;
struct fwnode_handle *fn;
struct irq_fwspec fwspec;
@@ -2354,7 +2236,7 @@ static int mp_irqdomain_create(int ioapic)
fwspec.param_count = 1;
fwspec.param[0] = mpc_ioapic_id(ioapic);
- parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY);
+ parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI);
if (!parent) {
if (!cfg->dev)
irq_domain_free_fwnode(fn);
@@ -2370,10 +2252,8 @@ static int mp_irqdomain_create(int ioapic)
return -ENOMEM;
}
- if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
- cfg->type == IOAPIC_DOMAIN_STRICT)
- ioapic_dynirq_base = max(ioapic_dynirq_base,
- gsi_cfg->gsi_end + 1);
+ if (cfg->type == IOAPIC_DOMAIN_LEGACY || cfg->type == IOAPIC_DOMAIN_STRICT)
+ ioapic_dynirq_base = max(ioapic_dynirq_base, gsi_cfg->gsi_end + 1);
return 0;
}
@@ -2400,13 +2280,11 @@ void __init setup_IO_APIC(void)
io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
- apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+ apic_pr_verbose("ENABLING IO-APIC IRQs\n");
for_each_ioapic(ioapic)
BUG_ON(mp_irqdomain_create(ioapic));
- /*
- * Set up IO-APIC IRQ routing.
- */
+ /* Set up IO-APIC IRQ routing. */
x86_init.mpparse.setup_ioapic_ids();
sync_Arb_IDs();
@@ -2420,16 +2298,14 @@ void __init setup_IO_APIC(void)
static void resume_ioapic_id(int ioapic_idx)
{
- unsigned long flags;
union IO_APIC_reg_00 reg_00;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
reg_00.raw = io_apic_read(ioapic_idx, 0);
if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) {
reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
io_apic_write(ioapic_idx, 0, reg_00.raw);
}
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void ioapic_resume(void)
@@ -2443,8 +2319,8 @@ static void ioapic_resume(void)
}
static struct syscore_ops ioapic_syscore_ops = {
- .suspend = save_ioapic_entries,
- .resume = ioapic_resume,
+ .suspend = save_ioapic_entries,
+ .resume = ioapic_resume,
};
static int __init ioapic_init_ops(void)
@@ -2459,15 +2335,13 @@ device_initcall(ioapic_init_ops);
static int io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
- unsigned long flags;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
reg_01.raw = io_apic_read(ioapic, 1);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- /* The register returns the maximum index redir index
- * supported, which is one less than the total number of redir
- * entries.
+ /*
+	 * The register returns the maximum redir index supported,
+ * which is one less than the total number of redir entries.
*/
return reg_01.bits.entries + 1;
}
@@ -2494,93 +2368,71 @@ unsigned int arch_dynirq_lower_bound(unsigned int from)
#ifdef CONFIG_X86_32
static int io_apic_get_unique_id(int ioapic, int apic_id)
{
+ static DECLARE_BITMAP(apic_id_map, MAX_LOCAL_APIC);
+ const u32 broadcast_id = 0xF;
union IO_APIC_reg_00 reg_00;
- static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
- physid_mask_t tmp;
- unsigned long flags;
int i = 0;
- /*
- * The P4 platform supports up to 256 APIC IDs on two separate APIC
- * buses (one for LAPICs, one for IOAPICs), where predecessors only
- * supports up to 16 on one shared APIC bus.
- *
- * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
- * advantage of new APIC bus architecture.
- */
+ /* Initialize the ID map */
+ if (bitmap_empty(apic_id_map, MAX_LOCAL_APIC))
+ copy_phys_cpu_present_map(apic_id_map);
- if (physids_empty(apic_id_map))
- apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(ioapic, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(ioapic, 0);
- if (apic_id >= get_physical_broadcast()) {
- printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
- "%d\n", ioapic, apic_id, reg_00.bits.ID);
+ if (apic_id >= broadcast_id) {
+ pr_warn("IOAPIC[%d]: Invalid apic_id %d, trying %d\n",
+ ioapic, apic_id, reg_00.bits.ID);
apic_id = reg_00.bits.ID;
}
- /*
- * Every APIC in a system must have a unique ID or we get lots of nice
- * 'stuck on smp_invalidate_needed IPI wait' messages.
- */
- if (apic->check_apicid_used(&apic_id_map, apic_id)) {
-
- for (i = 0; i < get_physical_broadcast(); i++) {
- if (!apic->check_apicid_used(&apic_id_map, i))
+ /* Every APIC in a system must have a unique ID */
+ if (test_bit(apic_id, apic_id_map)) {
+ for (i = 0; i < broadcast_id; i++) {
+ if (!test_bit(i, apic_id_map))
break;
}
- if (i == get_physical_broadcast())
+ if (i == broadcast_id)
panic("Max apic_id exceeded!\n");
- printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
- "trying %d\n", ioapic, apic_id, i);
-
+ pr_warn("IOAPIC[%d]: apic_id %d already used, trying %d\n", ioapic, apic_id, i);
apic_id = i;
}
- physid_set_mask_of_physid(apic_id, &tmp);
- physids_or(apic_id_map, apic_id_map, tmp);
+ set_bit(apic_id, apic_id_map);
if (reg_00.bits.ID != apic_id) {
reg_00.bits.ID = apic_id;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic, 0, reg_00.raw);
- reg_00.raw = io_apic_read(ioapic, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ }
/* Sanity check */
if (reg_00.bits.ID != apic_id) {
- pr_err("IOAPIC[%d]: Unable to change apic_id!\n",
- ioapic);
+ pr_err("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
return -1;
}
}
- apic_printk(APIC_VERBOSE, KERN_INFO
- "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+ apic_pr_verbose("IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
return apic_id;
}
static u8 io_apic_unique_id(int idx, u8 id)
{
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(boot_cpu_apic_version))
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && !APIC_XAPIC(boot_cpu_apic_version))
return io_apic_get_unique_id(idx, id);
- else
- return id;
+ return id;
}
#else
static u8 io_apic_unique_id(int idx, u8 id)
{
union IO_APIC_reg_00 reg_00;
DECLARE_BITMAP(used, 256);
- unsigned long flags;
u8 new_id;
int i;
@@ -2596,26 +2448,23 @@ static u8 io_apic_unique_id(int idx, u8 id)
* Read the current id from the ioapic and keep it if
* available.
*/
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(idx, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock)
+ reg_00.raw = io_apic_read(idx, 0);
+
new_id = reg_00.bits.ID;
if (!test_bit(new_id, used)) {
- apic_printk(APIC_VERBOSE, KERN_INFO
- "IOAPIC[%d]: Using reg apic_id %d instead of %d\n",
- idx, new_id, id);
+ apic_pr_verbose("IOAPIC[%d]: Using reg apic_id %d instead of %d\n",
+ idx, new_id, id);
return new_id;
}
- /*
- * Get the next free id and write it to the ioapic.
- */
+ /* Get the next free id and write it to the ioapic. */
new_id = find_first_zero_bit(used, 256);
reg_00.bits.ID = new_id;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(idx, 0, reg_00.raw);
- reg_00.raw = io_apic_read(idx, 0);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ scoped_guard (raw_spinlock_irqsave, &ioapic_lock) {
+ io_apic_write(idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(idx, 0);
+ }
/* Sanity check */
BUG_ON(reg_00.bits.ID != new_id);
@@ -2625,12 +2474,10 @@ static u8 io_apic_unique_id(int idx, u8 id)
static int io_apic_get_version(int ioapic)
{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
+ union IO_APIC_reg_01 reg_01;
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
reg_01.raw = io_apic_read(ioapic, 1);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return reg_01.bits.version;
}
@@ -2645,8 +2492,8 @@ static struct resource *ioapic_resources;
static struct resource * __init ioapic_setup_resources(void)
{
- unsigned long n;
struct resource *res;
+ unsigned long n;
char *mem;
int i;
@@ -2656,9 +2503,7 @@ static struct resource * __init ioapic_setup_resources(void)
n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
n *= nr_ioapics;
- mem = memblock_alloc(n, SMP_CACHE_BYTES);
- if (!mem)
- panic("%s: Failed to allocate %lu bytes\n", __func__, n);
+ mem = memblock_alloc_or_panic(n, SMP_CACHE_BYTES);
res = (void *)mem;
mem += sizeof(struct resource) * nr_ioapics;
@@ -2706,9 +2551,7 @@ void __init io_apic_init_mappings(void)
ioapic_phys = mpc_ioapic_addr(i);
#ifdef CONFIG_X86_32
if (!ioapic_phys) {
- printk(KERN_ERR
- "WARNING: bogus zero IO-APIC "
- "address found in MPTABLE, "
+ pr_err("WARNING: bogus zero IO-APIC address found in MPTABLE, "
"disabling IO/APIC support!\n");
smp_found_config = 0;
ioapic_is_disabled = true;
@@ -2719,17 +2562,13 @@ void __init io_apic_init_mappings(void)
#ifdef CONFIG_X86_32
fake_ioapic_page:
#endif
- ioapic_phys = (unsigned long)memblock_alloc(PAGE_SIZE,
+ ioapic_phys = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE,
PAGE_SIZE);
- if (!ioapic_phys)
- panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
- __func__, PAGE_SIZE, PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
io_apic_set_fixmap(idx, ioapic_phys);
- apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
- ioapic_phys);
+ apic_pr_verbose("mapped IOAPIC to %08lx (%08lx)\n",
+ __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), ioapic_phys);
idx++;
ioapic_res->start = ioapic_phys;
@@ -2740,13 +2579,12 @@ fake_ioapic_page:
void __init ioapic_insert_resources(void)
{
- int i;
struct resource *r = ioapic_resources;
+ int i;
if (!r) {
if (nr_ioapics > 0)
- printk(KERN_ERR
- "IO APIC resources couldn't be allocated.\n");
+ pr_err("IO APIC resources couldn't be allocated.\n");
return;
}
@@ -2766,11 +2604,12 @@ int mp_find_ioapic(u32 gsi)
/* Find the IOAPIC that manages this GSI. */
for_each_ioapic(i) {
struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
+
if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end)
return i;
}
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ pr_err("ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
return -1;
}
@@ -2809,12 +2648,10 @@ static int bad_ioapic_register(int idx)
static int find_free_ioapic_entry(void)
{
- int idx;
-
- for (idx = 0; idx < MAX_IO_APICS; idx++)
+ for (int idx = 0; idx < MAX_IO_APICS; idx++) {
if (ioapics[idx].nr_registers == 0)
return idx;
-
+ }
return MAX_IO_APICS;
}
@@ -2825,8 +2662,7 @@ static int find_free_ioapic_entry(void)
* @gsi_base: base of GSI associated with the IOAPIC
* @cfg: configuration information for the IOAPIC
*/
-int mp_register_ioapic(int id, u32 address, u32 gsi_base,
- struct ioapic_domain_cfg *cfg)
+int mp_register_ioapic(int id, u32 address, u32 gsi_base, struct ioapic_domain_cfg *cfg)
{
bool hotplug = !!ioapic_initialized;
struct mp_ioapic_gsi *gsi_cfg;
@@ -2837,12 +2673,13 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
pr_warn("Bogus (zero) I/O APIC address found, skipping!\n");
return -EINVAL;
}
- for_each_ioapic(ioapic)
+
+ for_each_ioapic(ioapic) {
if (ioapics[ioapic].mp_config.apicaddr == address) {
- pr_warn("address 0x%x conflicts with IOAPIC%d\n",
- address, ioapic);
+ pr_warn("address 0x%x conflicts with IOAPIC%d\n", address, ioapic);
return -EEXIST;
}
+ }
idx = find_free_ioapic_entry();
if (idx >= MAX_IO_APICS) {
@@ -2877,8 +2714,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
(gsi_end >= gsi_cfg->gsi_base &&
gsi_end <= gsi_cfg->gsi_end)) {
pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n",
- gsi_base, gsi_end,
- gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+ gsi_base, gsi_end, gsi_cfg->gsi_base, gsi_cfg->gsi_end);
clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
return -ENOSPC;
}
@@ -2912,8 +2748,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
ioapics[idx].nr_registers = entries;
pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
- idx, mpc_ioapic_id(idx),
- mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+ idx, mpc_ioapic_id(idx), mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
gsi_cfg->gsi_base, gsi_cfg->gsi_end);
return 0;
@@ -2924,11 +2759,13 @@ int mp_unregister_ioapic(u32 gsi_base)
int ioapic, pin;
int found = 0;
- for_each_ioapic(ioapic)
+ for_each_ioapic(ioapic) {
if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
found = 1;
break;
}
+ }
+
if (!found) {
pr_warn("can't find IOAPIC for GSI %d\n", gsi_base);
return -ENODEV;
@@ -2942,8 +2779,7 @@ int mp_unregister_ioapic(u32 gsi_base)
if (irq >= 0) {
data = irq_get_chip_data(irq);
if (data && data->count) {
- pr_warn("pin%d on IOAPIC%d is still in use.\n",
- pin, ioapic);
+ pr_warn("pin%d on IOAPIC%d is still in use.\n", pin, ioapic);
return -EBUSY;
}
}
@@ -2978,8 +2814,7 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
if (info && info->ioapic.valid) {
data->is_level = info->ioapic.is_level;
data->active_low = info->ioapic.active_low;
- } else if (__acpi_get_override_irq(gsi, &data->is_level,
- &data->active_low) < 0) {
+ } else if (__acpi_get_override_irq(gsi, &data->is_level, &data->active_low) < 0) {
/* PCI interrupts are always active low level triggered. */
data->is_level = true;
data->active_low = true;
@@ -3037,10 +2872,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
return -ENOMEM;
ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
- if (ret < 0) {
- kfree(data);
- return ret;
- }
+ if (ret < 0)
+ goto free_data;
INIT_LIST_HEAD(&data->irq_2_pin);
irq_data->hwirq = info->ioapic.pin;
@@ -3049,7 +2882,10 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
irq_data->chip_data = data;
mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
- add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+ if (!add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin)) {
+ ret = -ENOMEM;
+ goto free_irqs;
+ }
mp_preconfigure_entry(data);
mp_register_handler(virq, data->is_level);
@@ -3059,11 +2895,15 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
legacy_pic->mask(virq);
local_irq_restore(flags);
- apic_printk(APIC_VERBOSE, KERN_DEBUG
- "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n",
- ioapic, mpc_ioapic_id(ioapic), pin, virq,
- data->is_level, data->active_low);
+ apic_pr_verbose("IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n",
+ ioapic, mpc_ioapic_id(ioapic), pin, virq, data->is_level, data->active_low);
return 0;
+
+free_irqs:
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+free_data:
+ kfree(data);
+ return ret;
}
void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
@@ -3076,22 +2916,17 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
irq_data = irq_domain_get_irq_data(domain, virq);
if (irq_data && irq_data->chip_data) {
data = irq_data->chip_data;
- __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain),
- (int)irq_data->hwirq);
+ __remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq);
WARN_ON(!list_empty(&data->irq_2_pin));
kfree(irq_data->chip_data);
}
irq_domain_free_irqs_top(domain, virq, nr_irqs);
}
-int mp_irqdomain_activate(struct irq_domain *domain,
- struct irq_data *irq_data, bool reserve)
+int mp_irqdomain_activate(struct irq_domain *domain, struct irq_data *irq_data, bool reserve)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ioapic_lock, flags);
+ guard(raw_spinlock_irqsave)(&ioapic_lock);
ioapic_configure_entry(irq_data);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return 0;
}
@@ -3099,8 +2934,7 @@ void mp_irqdomain_deactivate(struct irq_domain *domain,
struct irq_data *irq_data)
{
/* It won't be called for IRQ with multiple IOAPIC pins associated */
- ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain),
- (int)irq_data->hwirq);
+ ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain), (int)irq_data->hwirq);
}
int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h
index 9ea6186ea88c..842fe28496be 100644
--- a/arch/x86/kernel/apic/local.h
+++ b/arch/x86/kernel/apic/local.h
@@ -16,8 +16,6 @@
/* X2APIC */
void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest);
u32 x2apic_get_apic_id(u32 id);
-u32 x2apic_set_apic_id(u32 id);
-u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb);
void x2apic_send_IPI_all(int vector);
void x2apic_send_IPI_allbutself(int vector);
@@ -63,9 +61,6 @@ void default_send_IPI_allbutself(int vector);
void default_send_IPI_all(int vector);
void default_send_IPI_self(int vector);
-bool default_apic_id_registered(void);
-bool default_check_apicid_used(physid_mask_t *map, u32 apicid);
-
#ifdef CONFIG_X86_32
void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector);
void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector);
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index d9651f15ae4f..66bc5d3e79db 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -184,7 +184,6 @@ static int x86_msi_prepare(struct irq_domain *domain, struct device *dev,
alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSI;
return 0;
case DOMAIN_BUS_PCI_DEVICE_MSIX:
- case DOMAIN_BUS_PCI_DEVICE_IMS:
alloc->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX;
return 0;
default:
@@ -215,6 +214,7 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
if (WARN_ON_ONCE(domain != real_parent))
return false;
info->chip->irq_set_affinity = msi_set_affinity;
+ info->chip->flags |= IRQCHIP_MOVE_DEFERRED;
break;
case DOMAIN_BUS_DMAR:
case DOMAIN_BUS_AMDVI:
@@ -229,10 +229,6 @@ static bool x86_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
case DOMAIN_BUS_PCI_DEVICE_MSI:
case DOMAIN_BUS_PCI_DEVICE_MSIX:
break;
- case DOMAIN_BUS_PCI_DEVICE_IMS:
- if (!(pops->supported_flags & MSI_FLAG_PCI_IMS))
- return false;
- break;
default:
WARN_ON_ONCE(1);
return false;
@@ -320,7 +316,7 @@ static struct irq_chip dmar_msi_controller = {
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_compose_msi_msg = dmar_msi_compose_msg,
.irq_write_msi_msg = dmar_msi_write_msg,
- .flags = IRQCHIP_SKIP_SET_WAKE |
+ .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED |
IRQCHIP_AFFINITY_PRE_STARTUP,
};
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 5eb3fbe472da..f75ee345c02d 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -18,11 +18,6 @@
#include "local.h"
-static u32 default_phys_pkg_id(u32 cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
static u32 default_get_apic_id(u32 x)
{
unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
@@ -43,18 +38,13 @@ static struct apic apic_default __ro_after_init = {
.name = "default",
.probe = probe_default,
- .apic_id_registered = default_apic_id_registered,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
- .check_apicid_used = default_check_apicid_used,
.init_apic_ldr = default_init_apic_ldr,
- .ioapic_phys_id_map = default_ioapic_phys_id_map,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .phys_pkg_id = default_phys_pkg_id,
.max_apic_id = 0xFE,
.get_apic_id = default_get_apic_id,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 319448d87b99..736f62812f5c 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -712,8 +712,8 @@ int __init arch_probe_nr_irqs(void)
{
int nr;
- if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
- nr_irqs = NR_VECTORS * nr_cpu_ids;
+ if (irq_get_nr_irqs() > NR_VECTORS * nr_cpu_ids)
+ irq_set_nr_irqs(NR_VECTORS * nr_cpu_ids);
nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
#if defined(CONFIG_PCI_MSI)
@@ -725,8 +725,8 @@ int __init arch_probe_nr_irqs(void)
else
nr += gsi_top * 16;
#endif
- if (nr < nr_irqs)
- nr_irqs = nr;
+ if (nr < irq_get_nr_irqs())
+ irq_set_nr_irqs(nr);
/*
* We don't know if PIC is present at this point so we need to do
@@ -738,8 +738,8 @@ int __init arch_probe_nr_irqs(void)
void lapic_assign_legacy_vector(unsigned int irq, bool replace)
{
/*
- * Use assign system here so it wont get accounted as allocated
- * and moveable in the cpu hotplug check and it prevents managed
+ * Use assign system here so it won't get accounted as allocated
+ * and movable in the cpu hotplug check and it prevents managed
* irq reservation from touching it.
*/
irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
@@ -965,7 +965,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
lockdep_assert_held(&vector_lock);
hlist_for_each_entry_safe(apicd, tmp, &cl->head, clist) {
- unsigned int irr, vector = apicd->prev_vector;
+ unsigned int vector = apicd->prev_vector;
/*
* Paranoia: Check if the vector that needs to be cleaned
@@ -979,8 +979,7 @@ static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
* fixup_irqs() was just called to scan IRR for set bits and
* forward them to new destination CPUs via IPIs.
*/
- irr = check_irr ? apic_read(APIC_IRR + (vector / 32 * 0x10)) : 0;
- if (irr & (1U << (vector % 32))) {
+ if (check_irr && is_vector_pending(vector)) {
pr_warn_once("Moved interrupt pending in old target APIC %u\n", apicd->irq);
rearm = true;
continue;
@@ -1036,7 +1035,8 @@ static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
add_timer_on(&cl->timer, cpu);
}
} else {
- apicd->prev_vector = 0;
+ pr_warn("IRQ %u schedule cleanup for offline CPU %u\n", apicd->irq, cpu);
+ free_moved_vector(apicd);
}
raw_spin_unlock(&vector_lock);
}
@@ -1073,6 +1073,7 @@ void irq_complete_move(struct irq_cfg *cfg)
*/
void irq_force_complete_move(struct irq_desc *desc)
{
+ unsigned int cpu = smp_processor_id();
struct apic_chip_data *apicd;
struct irq_data *irqd;
unsigned int vector;
@@ -1097,10 +1098,11 @@ void irq_force_complete_move(struct irq_desc *desc)
goto unlock;
/*
- * If prev_vector is empty, no action required.
+ * If prev_vector is empty or the descriptor is neither currently
+ * nor previously on the outgoing CPU no action required.
*/
vector = apicd->prev_vector;
- if (!vector)
+ if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
goto unlock;
/*
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index a8306089c91b..7db83212effb 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -178,13 +178,16 @@ static int x2apic_prepare_cpu(unsigned int cpu)
u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
u32 cluster = apic_cluster(phys_apicid);
u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+ int node = cpu_to_node(cpu);
x86_cpu_to_logical_apicid[cpu] = logical_apicid;
- if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0)
+ if (alloc_clustermask(cpu, cluster, node) < 0)
return -ENOMEM;
- if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
+
+ if (!zalloc_cpumask_var_node(&per_cpu(ipi_mask, cpu), GFP_KERNEL, node))
return -ENOMEM;
+
return 0;
}
@@ -227,21 +230,16 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.probe = x2apic_cluster_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = true,
.disable_esr = 0,
- .check_apicid_used = NULL,
.init_apic_ldr = init_x2apic_ldr,
- .ioapic_phys_id_map = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .phys_pkg_id = x2apic_phys_pkg_id,
.max_apic_id = UINT_MAX,
.x2apic_set_max_apicid = true,
.get_apic_id = x2apic_get_apic_id,
- .set_apic_id = x2apic_set_apic_id,
.calc_dest_apicid = x2apic_calc_apicid,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 558a4a8824f4..12d4c35547a6 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -129,34 +129,21 @@ u32 x2apic_get_apic_id(u32 id)
return id;
}
-u32 x2apic_set_apic_id(u32 id)
-{
- return id;
-}
-
-u32 x2apic_phys_pkg_id(u32 initial_apicid, int index_msb)
-{
- return initial_apicid >> index_msb;
-}
-
static struct apic apic_x2apic_phys __ro_after_init = {
.name = "physical x2apic",
.probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .phys_pkg_id = x2apic_phys_pkg_id,
.max_apic_id = UINT_MAX,
.x2apic_set_max_apicid = true,
.get_apic_id = x2apic_get_apic_id,
- .set_apic_id = x2apic_set_apic_id,
.calc_dest_apicid = apic_default_calc_apicid,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 1b0d7336a28f..7fef504ca508 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -241,54 +241,20 @@ static void __init uv_tsc_check_sync(void)
is_uv(UV3) ? sname.s3.field : \
undef)
-/* [Copied from arch/x86/kernel/cpu/topology.c:detect_extended_topology()] */
-
-#define SMT_LEVEL 0 /* Leaf 0xb SMT level */
-#define INVALID_TYPE 0 /* Leaf 0xb sub-leaf types */
-#define SMT_TYPE 1
-#define CORE_TYPE 2
-#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
-#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
-
-static void set_x2apic_bits(void)
-{
- unsigned int eax, ebx, ecx, edx, sub_index;
- unsigned int sid_shift;
-
- cpuid(0, &eax, &ebx, &ecx, &edx);
- if (eax < 0xb) {
- pr_info("UV: CPU does not have CPUID.11\n");
- return;
- }
-
- cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
- if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) {
- pr_info("UV: CPUID.11 not implemented\n");
- return;
- }
-
- sid_shift = BITS_SHIFT_NEXT_LEVEL(eax);
- sub_index = 1;
- do {
- cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
- if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
- sid_shift = BITS_SHIFT_NEXT_LEVEL(eax);
- break;
- }
- sub_index++;
- } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
-
- uv_cpuid.apicid_shift = 0;
- uv_cpuid.apicid_mask = (~(-1 << sid_shift));
- uv_cpuid.socketid_shift = sid_shift;
-}
-
static void __init early_get_apic_socketid_shift(void)
{
+ unsigned int sid_shift = topology_get_domain_shift(TOPO_PKG_DOMAIN);
+
if (is_uv2_hub() || is_uv3_hub())
uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
- set_x2apic_bits();
+ if (sid_shift) {
+ uv_cpuid.apicid_shift = 0;
+ uv_cpuid.apicid_mask = (~(-1 << sid_shift));
+ uv_cpuid.socketid_shift = sid_shift;
+ } else {
+ pr_info("UV: CPU does not have valid CPUID.11\n");
+ }
pr_info("UV: apicid_shift:%d apicid_mask:0x%x\n", uv_cpuid.apicid_shift, uv_cpuid.apicid_mask);
pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask);
@@ -779,21 +745,6 @@ static void uv_send_IPI_all(int vector)
uv_send_IPI_mask(cpu_online_mask, vector);
}
-static u32 set_apic_id(u32 id)
-{
- return id;
-}
-
-static unsigned int uv_read_apic_id(void)
-{
- return x2apic_get_apic_id(apic_read(APIC_ID));
-}
-
-static u32 uv_phys_pkg_id(u32 initial_apicid, int index_msb)
-{
- return uv_read_apic_id() >> index_msb;
-}
-
static int uv_probe(void)
{
return apic == &apic_x2apic_uv_x;
@@ -805,17 +756,14 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.probe = uv_probe,
.acpi_madt_oem_check = uv_acpi_madt_oem_check,
- .delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
- .phys_pkg_id = uv_phys_pkg_id,
.max_apic_id = UINT_MAX,
.get_apic_id = x2apic_get_apic_id,
- .set_apic_id = set_apic_id,
.calc_dest_apicid = apic_default_calc_apicid,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5934ee5bc087..b37ab1095707 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -420,7 +420,7 @@ static DEFINE_MUTEX(apm_mutex);
* This is for buggy BIOS's that refer to (real mode) segment 0x40
* even though they are called in protected mode.
*/
-static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092,
+static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(DESC_DATA32_BIOS,
(unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1);
static const char driver_version[] = "1.16ac"; /* no spaces */
@@ -1055,35 +1055,6 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
return APM_SUCCESS;
}
-#if 0
-static int apm_get_battery_status(u_short which, u_short *status,
- u_short *bat, u_short *life, u_short *nbat)
-{
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u32 esi;
-
- if (apm_info.connection_version < 0x0102) {
- /* pretend we only have one battery. */
- if (which != 1)
- return APM_BAD_DEVICE;
- *nbat = 1;
- return apm_get_power_status(status, bat, life);
- }
-
- if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
- &ebx, &ecx, &edx, &esi))
- return (eax >> 8) & 0xff;
- *status = ebx;
- *bat = ecx;
- *life = edx;
- *nbat = esi;
- return APM_SUCCESS;
-}
-#endif
-
/**
* apm_engage_power_management - enable PM on a device
* @device: identity of device
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 6913b372ccf7..a98020bf31bb 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -109,7 +109,7 @@ static void __used common(void)
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
OFFSET(X86_current_task, pcpu_hot, current_task);
-#ifdef CONFIG_CALL_DEPTH_TRACKING
+#ifdef CONFIG_MITIGATION_CALL_DEPTH_TRACKING
OFFSET(X86_call_depth, pcpu_hot, call_depth);
#endif
#if IS_ENABLED(CONFIG_CRYPTO_ARIA_AESNI_AVX_X86_64)
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c
index e9ad518a5003..8418a892d195 100644
--- a/arch/x86/kernel/callthunks.c
+++ b/arch/x86/kernel/callthunks.c
@@ -24,6 +24,8 @@
static int __initdata_or_module debug_callthunks;
+#define MAX_PATCH_LEN (255-1)
+
#define prdbg(fmt, args...) \
do { \
if (debug_callthunks) \
@@ -42,8 +44,8 @@ DEFINE_PER_CPU(u64, __x86_call_count);
DEFINE_PER_CPU(u64, __x86_ret_count);
DEFINE_PER_CPU(u64, __x86_stuffs_count);
DEFINE_PER_CPU(u64, __x86_ctxsw_count);
-EXPORT_SYMBOL_GPL(__x86_ctxsw_count);
-EXPORT_SYMBOL_GPL(__x86_call_count);
+EXPORT_PER_CPU_SYMBOL_GPL(__x86_ctxsw_count);
+EXPORT_PER_CPU_SYMBOL_GPL(__x86_call_count);
#endif
extern s32 __call_sites[], __call_sites_end[];
@@ -137,14 +139,15 @@ static bool skip_addr(void *dest)
return true;
#endif
#ifdef CONFIG_KEXEC_CORE
+# ifdef CONFIG_X86_64
+ if (dest >= (void *)__relocate_kernel_start &&
+ dest < (void *)__relocate_kernel_end)
+ return true;
+# else
if (dest >= (void *)relocate_kernel &&
dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE)
return true;
-#endif
-#ifdef CONFIG_XEN
- if (dest >= (void *)hypercall_page &&
- dest < (void*)hypercall_page + PAGE_SIZE)
- return true;
+# endif
#endif
return false;
}
@@ -179,10 +182,14 @@ static const u8 nops[] = {
static void *patch_dest(void *dest, bool direct)
{
unsigned int tsize = SKL_TMPL_SIZE;
+ u8 insn_buff[MAX_PATCH_LEN];
u8 *pad = dest - tsize;
+ memcpy(insn_buff, skl_call_thunk_template, tsize);
+ apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize);
+
/* Already patched? */
- if (!bcmp(pad, skl_call_thunk_template, tsize))
+ if (!bcmp(pad, insn_buff, tsize))
return pad;
/* Ensure there are nops */
@@ -192,9 +199,9 @@ static void *patch_dest(void *dest, bool direct)
}
if (direct)
- memcpy(pad, skl_call_thunk_template, tsize);
+ memcpy(pad, insn_buff, tsize);
else
- text_poke_copy_locked(pad, skl_call_thunk_template, tsize, true);
+ text_poke_copy_locked(pad, insn_buff, tsize, true);
return pad;
}
@@ -233,14 +240,13 @@ patch_call_sites(s32 *start, s32 *end, const struct core_text *ct)
}
static __init_or_module void
-patch_paravirt_call_sites(struct paravirt_patch_site *start,
- struct paravirt_patch_site *end,
- const struct core_text *ct)
+patch_alt_call_sites(struct alt_instr *start, struct alt_instr *end,
+ const struct core_text *ct)
{
- struct paravirt_patch_site *p;
+ struct alt_instr *a;
- for (p = start; p < end; p++)
- patch_call(p->instr, ct);
+ for (a = start; a < end; a++)
+ patch_call((void *)&a->instr_offset + a->instr_offset, ct);
}
static __init_or_module void
@@ -248,7 +254,7 @@ callthunks_setup(struct callthunk_sites *cs, const struct core_text *ct)
{
prdbg("Patching call sites %s\n", ct->name);
patch_call_sites(cs->call_start, cs->call_end, ct);
- patch_paravirt_call_sites(cs->pv_start, cs->pv_end, ct);
+ patch_alt_call_sites(cs->alt_start, cs->alt_end, ct);
prdbg("Patching call sites done%s\n", ct->name);
}
@@ -257,8 +263,8 @@ void __init callthunks_patch_builtin_calls(void)
struct callthunk_sites cs = {
.call_start = __call_sites,
.call_end = __call_sites_end,
- .pv_start = __parainstructions,
- .pv_end = __parainstructions_end
+ .alt_start = __alt_instructions,
+ .alt_end = __alt_instructions_end
};
if (!cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
@@ -291,20 +297,26 @@ void *callthunks_translate_call_dest(void *dest)
static bool is_callthunk(void *addr)
{
unsigned int tmpl_size = SKL_TMPL_SIZE;
- void *tmpl = skl_call_thunk_template;
+ u8 insn_buff[MAX_PATCH_LEN];
unsigned long dest;
+ u8 *pad;
dest = roundup((unsigned long)addr, CONFIG_FUNCTION_ALIGNMENT);
if (!thunks_initialized || skip_addr((void *)dest))
return false;
- return !bcmp((void *)(dest - tmpl_size), tmpl, tmpl_size);
+ pad = (void *)(dest - tmpl_size);
+
+ memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
+ apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size);
+
+ return !bcmp(pad, insn_buff, tmpl_size);
}
-int x86_call_depth_emit_accounting(u8 **pprog, void *func)
+int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip)
{
unsigned int tmpl_size = SKL_TMPL_SIZE;
- void *tmpl = skl_call_thunk_template;
+ u8 insn_buff[MAX_PATCH_LEN];
if (!thunks_initialized)
return 0;
@@ -313,7 +325,10 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func)
if (func && is_callthunk(func))
return 0;
- memcpy(*pprog, tmpl, tmpl_size);
+ memcpy(insn_buff, skl_call_thunk_template, tmpl_size);
+ apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size);
+
+ memcpy(*pprog, insn_buff, tmpl_size);
*pprog += tmpl_size;
return tmpl_size;
}
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index d2c732a34e5d..303bf74d175b 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -81,6 +81,34 @@ static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code)
static __ro_after_init bool ibt_fatal = true;
+/*
+ * By definition, all missing-ENDBRANCH #CPs are a result of WFE && !ENDBR.
+ *
+ * For the kernel IBT no ENDBR selftest where #CPs are deliberately triggered,
+ * the WFE state of the interrupted context needs to be cleared to let execution
+ * continue. Otherwise when the CPU resumes from the instruction that just
+ * caused the previous #CP, another missing-ENDBRANCH #CP is raised and the CPU
+ * enters a dead loop.
+ *
+ * This is not a problem with IDT because it doesn't preserve WFE and IRET doesn't
+ * set WFE. But FRED provides space on the entry stack (in an expanded CS area)
+ * to save and restore the WFE state, thus the WFE state is no longer clobbered,
+ * so software must clear it.
+ */
+static void ibt_clear_fred_wfe(struct pt_regs *regs)
+{
+ /*
+ * No need to do any FRED checks.
+ *
+ * For IDT event delivery, the high-order 48 bits of CS are pushed
+ * as 0s into the stack, and later IRET ignores these bits.
+ *
+ * For FRED, a test to check if fred_cs.wfe is set would be dropped
+ * by compilers.
+ */
+ regs->fred_cs.wfe = 0;
+}
+
static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code)
{
if ((error_code & CP_EC) != CP_ENDBR) {
@@ -90,6 +118,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code)
if (unlikely(regs->ip == (unsigned long)&ibt_selftest_noendbr)) {
regs->ax = 0;
+ ibt_clear_fred_wfe(regs);
return;
}
@@ -97,6 +126,7 @@ static void do_kernel_cp_fault(struct pt_regs *regs, unsigned long error_code)
if (!ibt_fatal) {
printk(KERN_DEFAULT CUT_HERE);
__warn(__FILE__, __LINE__, (void *)regs->ip, TAINT_WARN, regs, NULL);
+ ibt_clear_fred_wfe(regs);
return;
}
BUG();
diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c
index 8674a5c0c031..e6bf78fac146 100644
--- a/arch/x86/kernel/cfi.c
+++ b/arch/x86/kernel/cfi.c
@@ -4,10 +4,10 @@
*
* Copyright (C) 2022 Google LLC
*/
-#include <asm/cfi.h>
+#include <linux/string.h>
+#include <linux/cfi.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
-#include <linux/string.h>
/*
* Returns the target address and the expected type when regs->ip points
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 93eabf544031..4efdf5c2efc8 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -17,7 +17,8 @@ KMSAN_SANITIZE_common.o := n
# As above, instrumenting secondary CPU boot code causes boot hangs.
KCSAN_SANITIZE_common.o := n
-obj-y := cacheinfo.o scattered.o topology.o
+obj-y := cacheinfo.o scattered.o
+obj-y += topology_common.o topology_ext.o topology_amd.o
obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
@@ -25,14 +26,16 @@ obj-y += bugs.o
obj-y += aperfmperf.o
obj-y += cpuid-deps.o
obj-y += umwait.o
+obj-y += capflags.o powerflags.o
-obj-$(CONFIG_PROC_FS) += proc.o
-obj-y += capflags.o powerflags.o
+obj-$(CONFIG_X86_LOCAL_APIC) += topology.o
-obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
+obj-$(CONFIG_PROC_FS) += proc.o
+
+obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
ifdef CONFIG_CPU_SUP_INTEL
-obj-y += intel.o intel_pconfig.o tsx.o
-obj-$(CONFIG_PM) += intel_epb.o
+obj-y += intel.o tsx.o
+obj-$(CONFIG_PM) += intel_epb.o
endif
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o
@@ -56,8 +59,10 @@ obj-$(CONFIG_ACRN_GUEST) += acrn.o
obj-$(CONFIG_DEBUG_FS) += debugfs.o
+obj-$(CONFIG_X86_BUS_LOCK_DETECT) += bus_lock.o
+
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^
+ cmd_mkcapflags = $(CONFIG_SHELL) $(src)/mkcapflags.sh $@ $^
cpufeature = $(src)/../../include/asm/cpufeatures.h
vmxfeature = $(src)/../../include/asm/vmxfeatures.h
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index bfeb18fad63f..2c5b51aad91a 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -26,8 +26,8 @@ static u32 __init acrn_detect(void)
static void __init acrn_init_platform(void)
{
- /* Setup the IDT for ACRN hypervisor callback */
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback);
+ /* Install system interrupt handler for ACRN hypervisor callback */
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback);
x86_platform.calibrate_tsc = acrn_get_tsc_khz;
x86_platform.calibrate_cpu = acrn_get_tsc_khz;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f322ebd053a9..54194f5995de 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -13,6 +13,7 @@
#include <asm/apic.h>
#include <asm/cacheinfo.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
#include <asm/spec-ctrl.h>
#include <asm/smp.h>
#include <asm/numa.h>
@@ -20,6 +21,7 @@
#include <asm/delay.h>
#include <asm/debugreg.h>
#include <asm/resctrl.h>
+#include <asm/sev.h>
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
@@ -27,94 +29,6 @@
#include "cpu.h"
-/*
- * nodes_per_socket: Stores the number of nodes per socket.
- * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
- * Node Identifiers[10:8]
- */
-static u32 nodes_per_socket = 1;
-
-/*
- * AMD errata checking
- *
- * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
- * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
- * have an OSVW id assigned, which it takes as first argument. Both take a
- * variable number of family-specific model-stepping ranges created by
- * AMD_MODEL_RANGE().
- *
- * Example:
- *
- * const int amd_erratum_319[] =
- * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
- * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
- * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
- */
-
-#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 }
-#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 }
-#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \
- ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end))
-#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff)
-#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff)
-#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff)
-
-static const int amd_erratum_400[] =
- AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
- AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
-
-static const int amd_erratum_383[] =
- AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
-
-/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
-static const int amd_erratum_1054[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
-
-static const int amd_zenbleed[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf),
- AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf),
- AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf),
- AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf));
-
-static const int amd_div0[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf),
- AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf));
-
-static const int amd_erratum_1485[] =
- AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf),
- AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf));
-
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
-{
- int osvw_id = *erratum++;
- u32 range;
- u32 ms;
-
- if (osvw_id >= 0 && osvw_id < 65536 &&
- cpu_has(cpu, X86_FEATURE_OSVW)) {
- u64 osvw_len;
-
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
- if (osvw_id < osvw_len) {
- u64 osvw_bits;
-
- rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
- osvw_bits);
- return osvw_bits & (1ULL << (osvw_id & 0x3f));
- }
- }
-
- /* OSVW unavailable or ID unknown, match family-model-stepping range */
- ms = (cpu->x86_model << 4) | cpu->x86_stepping;
- while ((range = *erratum++))
- if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
- (ms >= AMD_MODEL_RANGE_START(range)) &&
- (ms <= AMD_MODEL_RANGE_END(range)))
- return true;
-
- return false;
-}
-
static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
{
u32 gprs[8] = { 0 };
@@ -381,97 +295,6 @@ static int nearby_node(int apicid)
}
#endif
-/*
- * Fix up topo::core_id for pre-F17h systems to be in the
- * [0 .. cores_per_node - 1] range. Not really needed but
- * kept so as not to break existing setups.
- */
-static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
-{
- u32 cus_per_node;
-
- if (c->x86 >= 0x17)
- return;
-
- cus_per_node = c->x86_max_cores / nodes_per_socket;
- c->topo.core_id %= cus_per_node;
-}
-
-/*
- * Fixup core topology information for
- * (1) AMD multi-node processors
- * Assumption: Number of cores in each internal node is the same.
- * (2) AMD processors supporting compute units
- */
-static void amd_get_topology(struct cpuinfo_x86 *c)
-{
- /* get information required for multi-node processors */
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
- int err;
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-
- c->topo.die_id = ecx & 0xff;
-
- if (c->x86 == 0x15)
- c->topo.cu_id = ebx & 0xff;
-
- if (c->x86 >= 0x17) {
- c->topo.core_id = ebx & 0xff;
-
- if (smp_num_siblings > 1)
- c->x86_max_cores /= smp_num_siblings;
- }
-
- /*
- * In case leaf B is available, use it to derive
- * topology information.
- */
- err = detect_extended_topology(c);
- if (!err)
- c->x86_coreid_bits = get_count_order(c->x86_max_cores);
-
- cacheinfo_amd_init_llc_id(c);
-
- } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
- u64 value;
-
- rdmsrl(MSR_FAM10H_NODE_ID, value);
- c->topo.die_id = value & 7;
- c->topo.llc_id = c->topo.die_id;
- } else
- return;
-
- if (nodes_per_socket > 1) {
- set_cpu_cap(c, X86_FEATURE_AMD_DCM);
- legacy_fixup_core_id(c);
- }
-}
-
-/*
- * On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
- * Assumes number of cores is a power of two.
- */
-static void amd_detect_cmp(struct cpuinfo_x86 *c)
-{
- unsigned bits;
-
- bits = c->x86_coreid_bits;
- /* Low order bits define the core id (index of core in socket) */
- c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->topo.pkg_id = c->topo.initial_apicid >> bits;
- /* use socket ID also for last level cache */
- c->topo.llc_id = c->topo.die_id = c->topo.pkg_id;
-}
-
-u32 amd_get_nodes_per_socket(void)
-{
- return nodes_per_socket;
-}
-EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket);
-
static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
@@ -523,29 +346,30 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-static void early_init_amd_mc(struct cpuinfo_x86 *c)
+static void bsp_determine_snp(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
- unsigned bits, ecx;
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level < 0x80000008)
- return;
-
- ecx = cpuid_ecx(0x80000008);
-
- c->x86_max_cores = (ecx & 0xff) + 1;
+#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+ cc_vendor = CC_VENDOR_AMD;
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
-
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
+ if (cpu_has(c, X86_FEATURE_SEV_SNP)) {
+ /*
+ * RMP table entry format is not architectural and is defined by the
+ * per-processor PPR. Restrict SNP support on the known CPU models
+ * for which the RMP table entry format is currently defined or for
+ * processors which support the architecturally defined RMPREAD
+ * instruction.
+ */
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
+ (cpu_feature_enabled(X86_FEATURE_ZEN3) ||
+ cpu_feature_enabled(X86_FEATURE_ZEN4) ||
+ cpu_feature_enabled(X86_FEATURE_RMPREAD)) &&
+ snp_probe_rmptable_info()) {
+ cc_platform_set(CC_ATTR_HOST_SEV_SNP);
+ } else {
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
+ cc_platform_clear(CC_ATTR_HOST_SEV_SNP);
+ }
}
-
- c->x86_coreid_bits = bits;
#endif
}
@@ -581,18 +405,6 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_MWAITX))
use_mwaitx_delay();
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
- u32 ecx;
-
- ecx = cpuid_ecx(0x8000001e);
- __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1;
- } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
- u64 value;
-
- rdmsrl(MSR_FAM10H_NODE_ID, value);
- __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1;
- }
-
if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
!boot_cpu_has(X86_FEATURE_VIRT_SSBD) &&
c->x86 >= 0x15 && c->x86 <= 0x17) {
@@ -616,6 +428,62 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
}
resctrl_cpu_detect(c);
+
+ /* Figure out Zen generations: */
+ switch (c->x86) {
+ case 0x17:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x50 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN1);
+ break;
+ case 0x30 ... 0x4f:
+ case 0x60 ... 0x7f:
+ case 0x90 ... 0x91:
+ case 0xa0 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN2);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x19:
+ switch (c->x86_model) {
+ case 0x00 ... 0x0f:
+ case 0x20 ... 0x5f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN3);
+ break;
+ case 0x10 ... 0x1f:
+ case 0x60 ... 0xaf:
+ setup_force_cpu_cap(X86_FEATURE_ZEN4);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ case 0x1a:
+ switch (c->x86_model) {
+ case 0x00 ... 0x2f:
+ case 0x40 ... 0x4f:
+ case 0x60 ... 0x7f:
+ setup_force_cpu_cap(X86_FEATURE_ZEN5);
+ break;
+ default:
+ goto warn;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ bsp_determine_snp(c);
+ return;
+
+warn:
+ WARN_ONCE(1, "Family 0x%x, model: 0x%x??\n", c->x86, c->x86_model);
}
static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
@@ -630,8 +498,8 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
* SME feature (set in scattered.c).
* If the kernel has not enabled SME via any means then
* don't advertise the SME feature.
- * For SEV: If BIOS has not enabled SEV then don't advertise the
- * SEV and SEV_ES feature (set in scattered.c).
+ * For SEV: If BIOS has not enabled SEV then don't advertise SEV and
+ * any additional functionality based on it.
*
* In all cases, since support for SME and SEV requires long mode,
* don't advertise the feature under CONFIG_X86_32.
@@ -666,16 +534,14 @@ clear_all:
clear_sev:
setup_clear_cpu_cap(X86_FEATURE_SEV);
setup_clear_cpu_cap(X86_FEATURE_SEV_ES);
+ setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
}
}
static void early_init_amd(struct cpuinfo_x86 *c)
{
- u64 value;
u32 dummy;
- early_init_amd_mc(c);
-
if (c->x86 >= 0xf)
set_cpu_cap(c, X86_FEATURE_K8);
@@ -739,34 +605,8 @@ static void early_init_amd(struct cpuinfo_x86 *c)
if (c->x86 == 0x16 && c->x86_model <= 0xf)
msr_set_bit(MSR_AMD64_LS_CFG, 15);
- /*
- * Check whether the machine is affected by erratum 400. This is
- * used to select the proper idle routine and to enable the check
- * whether the machine is affected in arch_post_acpi_init(), which
- * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
- */
- if (cpu_has_amd_erratum(c, amd_erratum_400))
- set_cpu_bug(c, X86_BUG_AMD_E400);
-
early_detect_mem_encrypt(c);
- /* Re-enable TopologyExtensions if switched off by BIOS */
- if (c->x86 == 0x15 &&
- (c->x86_model >= 0x10 && c->x86_model <= 0x6f) &&
- !cpu_has(c, X86_FEATURE_TOPOEXT)) {
-
- if (msr_set_bit(0xc0011005, 54) > 0) {
- rdmsrl(0xc0011005, value);
- if (value & BIT_64(54)) {
- set_cpu_cap(c, X86_FEATURE_TOPOEXT);
- pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
- }
- }
- }
-
- if (cpu_has(c, X86_FEATURE_TOPOEXT))
- smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
-
if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) {
if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB))
setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE);
@@ -814,6 +654,16 @@ static void init_amd_k8(struct cpuinfo_x86 *c)
msr_set_bit(MSR_K7_HWCR, 6);
#endif
set_cpu_bug(c, X86_BUG_SWAPGS_FENCE);
+
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x41 ||
+ (c->x86_model == 0x41 && c->x86_stepping >= 0x2))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
}
static void init_amd_gh(struct cpuinfo_x86 *c)
@@ -847,8 +697,17 @@ static void init_amd_gh(struct cpuinfo_x86 *c)
*/
msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
- if (cpu_has_amd_erratum(c, amd_erratum_383))
- set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+
+ /*
+ * Check models and steppings affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the
+ * check whether the machine is affected in arch_post_acpi_subsys_init()
+ * which sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (c->x86_model > 0x2 ||
+ (c->x86_model == 0x2 && c->x86_stepping >= 0x1))
+ setup_force_cpu_bug(X86_BUG_AMD_E400);
}
static void init_amd_ln(struct cpuinfo_x86 *c)
@@ -941,9 +800,33 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}
+static const struct x86_cpu_id erratum_1386_microcode[] = {
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
+ X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
+};
+
+static void fix_erratum_1386(struct cpuinfo_x86 *c)
+{
+ /*
+ * Work around Erratum 1386. The XSAVES instruction malfunctions in
+ * certain circumstances on Zen1/2 uarch, and not all parts have had
+ * updated microcode at the time of writing (March 2023).
+ *
+ * Affected parts all have no supervisor XSAVE states, meaning that
+ * the XSAVEC instruction (which works fine) is equivalent.
+ *
+ * Clear the feature flag only on microcode revisions which
+ * don't have the fix.
+ */
+ if (x86_match_min_microcode_rev(erratum_1386_microcode))
+ return;
+
+ clear_cpu_cap(c, X86_FEATURE_XSAVES);
+}
+
void init_spectral_chicken(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_CPU_UNRET_ENTRY
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
u64 value;
/*
@@ -951,34 +834,27 @@ void init_spectral_chicken(struct cpuinfo_x86 *c)
*
* This suppresses speculation from the middle of a basic block, i.e. it
* suppresses non-branch predictions.
- *
- * We use STIBP as a heuristic to filter out Zen2 from the rest of F17H
*/
- if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has(c, X86_FEATURE_AMD_STIBP)) {
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) {
value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT;
wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value);
}
}
#endif
- /*
- * Work around Erratum 1386. The XSAVES instruction malfunctions in
- * certain circumstances on Zen1/2 uarch, and not all parts have had
- * updated microcode at the time of writing (March 2023).
- *
- * Affected parts all have no supervisor XSAVE states, meaning that
- * the XSAVEC instruction (which works fine) is equivalent.
- */
- clear_cpu_cap(c, X86_FEATURE_XSAVES);
}
-static void init_amd_zn(struct cpuinfo_x86 *c)
+static void init_amd_zen_common(void)
{
- set_cpu_cap(c, X86_FEATURE_ZEN);
-
+ setup_force_cpu_cap(X86_FEATURE_ZEN);
#ifdef CONFIG_NUMA
node_reclaim_distance = 32;
#endif
+}
+
+static void init_amd_zen1(struct cpuinfo_x86 *c)
+{
+ fix_erratum_1386(c);
/* Fix up CPUID bits, but only if not virtualised. */
if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
@@ -986,15 +862,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
/* Erratum 1076: CPB feature bit not being set in CPUID. */
if (!cpu_has(c, X86_FEATURE_CPB))
set_cpu_cap(c, X86_FEATURE_CPB);
-
- /*
- * Zen3 (Fam19 model < 0x10) parts are not susceptible to
- * Branch Type Confusion, but predate the allocation of the
- * BTC_NO bit.
- */
- if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO))
- set_cpu_cap(c, X86_FEATURE_BTC_NO);
}
+
+ pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
+ setup_force_cpu_bug(X86_BUG_DIV0);
}
static bool cpu_has_zenbleed_microcode(void)
@@ -1002,11 +873,11 @@ static bool cpu_has_zenbleed_microcode(void)
u32 good_rev = 0;
switch (boot_cpu_data.x86_model) {
- case 0x30 ... 0x3f: good_rev = 0x0830107a; break;
- case 0x60 ... 0x67: good_rev = 0x0860010b; break;
- case 0x68 ... 0x6f: good_rev = 0x08608105; break;
- case 0x70 ... 0x7f: good_rev = 0x08701032; break;
- case 0xa0 ... 0xaf: good_rev = 0x08a00008; break;
+ case 0x30 ... 0x3f: good_rev = 0x0830107b; break;
+ case 0x60 ... 0x67: good_rev = 0x0860010c; break;
+ case 0x68 ... 0x6f: good_rev = 0x08608107; break;
+ case 0x70 ... 0x7f: good_rev = 0x08701033; break;
+ case 0xa0 ... 0xaf: good_rev = 0x08a00009; break;
default:
return false;
@@ -1018,11 +889,8 @@ static bool cpu_has_zenbleed_microcode(void)
return true;
}
-static void zenbleed_check(struct cpuinfo_x86 *c)
+static void zen2_zenbleed_check(struct cpuinfo_x86 *c)
{
- if (!cpu_has_amd_erratum(c, amd_zenbleed))
- return;
-
if (cpu_has(c, X86_FEATURE_HYPERVISOR))
return;
@@ -1037,6 +905,47 @@ static void zenbleed_check(struct cpuinfo_x86 *c)
}
}
+static void init_amd_zen2(struct cpuinfo_x86 *c)
+{
+ init_spectral_chicken(c);
+ fix_erratum_1386(c);
+ zen2_zenbleed_check(c);
+}
+
+static void init_amd_zen3(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) {
+ /*
+ * Zen3 (Fam19 model < 0x10) parts are not susceptible to
+ * Branch Type Confusion, but predate the allocation of the
+ * BTC_NO bit.
+ */
+ if (!cpu_has(c, X86_FEATURE_BTC_NO))
+ set_cpu_cap(c, X86_FEATURE_BTC_NO);
+ }
+}
+
+static void init_amd_zen4(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+
+ /*
+ * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE
+ * in some BIOS versions but they can lead to random host reboots.
+ */
+ switch (c->x86_model) {
+ case 0x18 ... 0x1f:
+ case 0x60 ... 0x7f:
+ clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD);
+ break;
+ }
+}
+
+static void init_amd_zen5(struct cpuinfo_x86 *c)
+{
+}
+
static void init_amd(struct cpuinfo_x86 *c)
{
u64 vm_cr;
@@ -1056,9 +965,6 @@ static void init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_FSRM))
set_cpu_cap(c, X86_FEATURE_FSRS);
- /* get apicid instead of initial apic id from cpuid */
- c->topo.apicid = read_apic_id();
-
/* K6s reports MCEs but don't actually have all the MSRs */
if (c->x86 < 6)
clear_cpu_cap(c, X86_FEATURE_MCE);
@@ -1072,12 +978,27 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
case 0x16: init_amd_jg(c); break;
- case 0x17: init_spectral_chicken(c);
- fallthrough;
- case 0x19: init_amd_zn(c); break;
}
/*
+ * Save up on some future enablement work and do common Zen
+ * settings.
+ */
+ if (c->x86 >= 0x17)
+ init_amd_zen_common();
+
+ if (boot_cpu_has(X86_FEATURE_ZEN1))
+ init_amd_zen1(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN2))
+ init_amd_zen2(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN3))
+ init_amd_zen3(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN4))
+ init_amd_zen4(c);
+ else if (boot_cpu_has(X86_FEATURE_ZEN5))
+ init_amd_zen5(c);
+
+ /*
* Enable workaround for FXSAVE leak on CPUs
* without a XSaveErPtr feature
*/
@@ -1086,8 +1007,6 @@ static void init_amd(struct cpuinfo_x86 *c)
cpu_detect_cache_sizes(c);
- amd_detect_cmp(c);
- amd_get_topology(c);
srat_detect_node(c);
init_amd_cacheinfo(c);
@@ -1136,7 +1055,7 @@ static void init_amd(struct cpuinfo_x86 *c)
* Counter May Be Inaccurate".
*/
if (cpu_has(c, X86_FEATURE_IRPERF) &&
- !cpu_has_amd_erratum(c, amd_erratum_1054))
+ (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f))
msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
check_null_seg_clears_base(c);
@@ -1150,18 +1069,10 @@ static void init_amd(struct cpuinfo_x86 *c)
*/
if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
cpu_has(c, X86_FEATURE_AUTOIBRS))
- WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS));
-
- zenbleed_check(c);
+ WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS) < 0);
- if (cpu_has_amd_erratum(c, amd_div0)) {
- pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n");
- setup_force_cpu_bug(X86_BUG_DIV0);
- }
-
- if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
- cpu_has_amd_erratum(c, amd_erratum_1485))
- msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+ /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
#ifdef CONFIG_X86_32
@@ -1295,27 +1206,11 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr)
}
EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
-u32 amd_get_highest_perf(void)
-{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
- (c->x86_model >= 0x70 && c->x86_model < 0x80)))
- return 166;
-
- if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
- (c->x86_model >= 0x40 && c->x86_model < 0x70)))
- return 166;
-
- return 255;
-}
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
-
static void zenbleed_check_cpu(void *unused)
{
struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
- zenbleed_check(c);
+ zen2_zenbleed_check(c);
}
void amd_check_microcode(void)
@@ -1323,16 +1218,6 @@ void amd_check_microcode(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return;
- on_each_cpu(zenbleed_check_cpu, NULL, 1);
-}
-
-/*
- * Issue a DIV 0/1 insn to clear any division data from previous DIV
- * operations.
- */
-void noinstr amd_clear_divider(void)
-{
- asm volatile(ALTERNATIVE("", "div %2\n\t", X86_BUG_DIV0)
- :: "a" (0), "d" (0), "r" (1));
+ if (cpu_feature_enabled(X86_FEATURE_ZEN2))
+ on_each_cpu(zenbleed_check_cpu, NULL, 1);
}
-EXPORT_SYMBOL_GPL(amd_clear_divider);
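For readers following the Zenbleed hunks above: the check boils down to comparing the boot CPU's running microcode revision against a per-model minimum revision. Below is a minimal user-space sketch of that comparison; the model ranges and revisions come from the updated table at the top of this diff, the family/model/revision arguments are stand-ins for the kernel's cpuinfo_x86 fields, and the case-range syntax is the same GCC extension the kernel itself relies on.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch: mirror of the per-model "good" microcode revision table above. */
static bool zen2_has_zenbleed_microcode(uint8_t family, uint8_t model,
					uint32_t ucode_rev)
{
	uint32_t good_rev;

	if (family != 0x17)		/* Zen2 parts are family 0x17 */
		return false;

	switch (model) {
	case 0x30 ... 0x3f: good_rev = 0x0830107b; break;
	case 0x60 ... 0x67: good_rev = 0x0860010c; break;
	case 0x68 ... 0x6f: good_rev = 0x08608107; break;
	case 0x70 ... 0x7f: good_rev = 0x08701033; break;
	case 0xa0 ... 0xaf: good_rev = 0x08a00009; break;
	default:            return false;
	}

	return ucode_rev >= good_rev;
}

int main(void)
{
	/* Example values only; in practice read family/model/microcode
	 * from /proc/cpuinfo or CPUID. */
	printf("fixed microcode: %d\n",
	       zen2_has_zenbleed_microcode(0x17, 0x71, 0x08701034));
	return 0;
}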
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index fdbb5f07448f..f642de2ebdac 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -124,25 +124,24 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
return true;
}
-#define X86_MATCH(model) \
- X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+#define X86_MATCH(vfm) \
+ X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
- X86_MATCH(XEON_PHI_KNL),
- X86_MATCH(XEON_PHI_KNM),
+ X86_MATCH(INTEL_XEON_PHI_KNL),
+ X86_MATCH(INTEL_XEON_PHI_KNM),
{}
};
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
- X86_MATCH(SKYLAKE_X),
+ X86_MATCH(INTEL_SKYLAKE_X),
{}
};
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
- X86_MATCH(ATOM_GOLDMONT),
- X86_MATCH(ATOM_GOLDMONT_D),
- X86_MATCH(ATOM_GOLDMONT_PLUS),
+ X86_MATCH(INTEL_ATOM_GOLDMONT),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_D),
+ X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
{}
};
@@ -307,7 +306,7 @@ static void freq_invariance_enable(void)
WARN_ON_ONCE(1);
return;
}
- static_branch_enable(&arch_scale_freq_key);
+ static_branch_enable_cpuslocked(&arch_scale_freq_key);
register_freq_invariance_syscore_ops();
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
}
@@ -324,8 +323,10 @@ static void __init bp_init_freq_invariance(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return;
- if (intel_set_max_freq_ratio())
+ if (intel_set_max_freq_ratio()) {
+ guard(cpus_read_lock)();
freq_invariance_enable();
+ }
}
static void disable_freq_invariance_workfn(struct work_struct *work)
@@ -346,10 +347,91 @@ static DECLARE_WORK(disable_freq_invariance_work,
disable_freq_invariance_workfn);
DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
+
+static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
+
+struct arch_hybrid_cpu_scale {
+ unsigned long capacity;
+ unsigned long freq_ratio;
+};
+
+static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
+
+/**
+ * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
+ *
+ * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
+ * initialize it and set the static key controlling its code paths.
+ *
+ * Must be called before arch_set_cpu_capacity().
+ */
+bool arch_enable_hybrid_capacity_scale(void)
+{
+ int cpu;
+
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
+ return true;
+ }
+
+ arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
+ if (!arch_cpu_scale)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
+ per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
+ }
+
+ static_branch_enable(&arch_hybrid_cap_scale_key);
+
+ pr_info("Hybrid CPU capacity scaling enabled\n");
+
+ return true;
+}
+
+/**
+ * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
+ * @cpu: Target CPU.
+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
+ * @max_cap: System-wide maximum CPU capacity.
+ * @cap_freq: Frequency of @cpu corresponding to @cap.
+ * @base_freq: Frequency of @cpu at which MPERF counts.
+ *
+ * The units in which @cap and @max_cap are expressed do not matter, so long
+ * as they are consistent, because the former is effectively divided by the
+ * latter. Analogously for @cap_freq and @base_freq.
+ *
+ * After calling this function for all CPUs, call arch_rebuild_sched_domains()
+ * to let the scheduler know that capacity-aware scheduling can be used going
+ * forward.
+ */
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+ unsigned long cap_freq, unsigned long base_freq)
+{
+ if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
+ div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
+ WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
+ div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
+ } else {
+ WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
+ }
+}
+
+unsigned long arch_scale_cpu_capacity(int cpu)
+{
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
+
+ return SCHED_CAPACITY_SCALE;
+}
+EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
static void scale_freq_tick(u64 acnt, u64 mcnt)
{
- u64 freq_scale;
+ u64 freq_scale, freq_ratio;
if (!arch_scale_freq_invariant())
return;
@@ -357,7 +439,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt)
if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
goto error;
- if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+ if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+ freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+ else
+ freq_ratio = arch_max_freq_ratio;
+
+ if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
goto error;
freq_scale = div64_u64(acnt, mcnt);
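A quick illustration of the fixed-point math that arch_set_cpu_capacity() performs above, as a stand-alone sketch with made-up numbers (SCHED_CAPACITY_SHIFT is 10 in the scheduler, so the scale is 1024; div_u64() is modeled here with plain 64-bit division):

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10	/* SCHED_CAPACITY_SCALE == 1024 */

int main(void)
{
	/* Hypothetical small core: 70% of the biggest core's capacity,
	 * max frequency 3.8 GHz, base (MPERF) frequency 2.4 GHz, in kHz. */
	uint64_t cap = 700, max_cap = 1000;
	uint64_t cap_freq = 3800000, base_freq = 2400000;

	uint64_t capacity   = (cap << SCHED_CAPACITY_SHIFT) / max_cap;
	uint64_t freq_ratio = (cap_freq << SCHED_CAPACITY_SHIFT) / base_freq;

	/* capacity ~= 716/1024, freq_ratio ~= 1621/1024 */
	printf("capacity=%llu freq_ratio=%llu\n",
	       (unsigned long long)capacity, (unsigned long long)freq_ratio);
	return 0;
}

With these example numbers the CPU would report a capacity of roughly 716/1024 and scale its APERF/MPERF ratio by about 1.58x relative to its base frequency in scale_freq_tick().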
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bb0ab8466b91..a5d0998d7604 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -26,7 +26,7 @@
#include <asm/msr.h>
#include <asm/vmx.h>
#include <asm/paravirt.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/e820/api.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
@@ -56,11 +56,13 @@ EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
/* The current value of the SPEC_CTRL MSR with task-specific bits set */
DEFINE_PER_CPU(u64, x86_spec_ctrl_current);
-EXPORT_SYMBOL_GPL(x86_spec_ctrl_current);
+EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current);
u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB;
EXPORT_SYMBOL_GPL(x86_pred_cmd);
+static u64 __ro_after_init x86_arch_cap_msr;
+
static DEFINE_MUTEX(spec_ctrl_mutex);
void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk;
@@ -111,9 +113,6 @@ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
/* Control unconditional IBPB in switch_mm() */
DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
-/* Control MDS CPU buffer clear before returning to user space */
-DEFINE_STATIC_KEY_FALSE(mds_user_clear);
-EXPORT_SYMBOL_GPL(mds_user_clear);
/* Control MDS CPU buffer clear before idling (halt, mwait) */
DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
EXPORT_SYMBOL_GPL(mds_idle_clear);
@@ -147,6 +146,8 @@ void __init cpu_select_mitigations(void)
x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK;
}
+ x86_arch_cap_msr = x86_read_arch_cap_msr();
+
/* Select the proper CPU mitigations before patching alternatives: */
spectre_v1_select_mitigation();
spectre_v2_select_mitigation();
@@ -232,7 +233,8 @@ static void x86_amd_ssb_disable(void)
#define pr_fmt(fmt) "MDS: " fmt
/* Default mitigation for MDS-affected CPUs */
-static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
+static enum mds_mitigations mds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MDS) ? MDS_MITIGATION_FULL : MDS_MITIGATION_OFF;
static bool mds_nosmt __ro_after_init = false;
static const char * const mds_strings[] = {
@@ -252,7 +254,7 @@ static void __init mds_select_mitigation(void)
if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
mds_mitigation = MDS_MITIGATION_VMWERV;
- static_branch_enable(&mds_user_clear);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
(mds_nosmt || cpu_mitigations_auto_nosmt()))
@@ -292,7 +294,8 @@ enum taa_mitigations {
};
/* Default mitigation for TAA-affected CPUs */
-static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
+static enum taa_mitigations taa_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_TAA) ? TAA_MITIGATION_VERW : TAA_MITIGATION_OFF;
static bool taa_nosmt __ro_after_init;
static const char * const taa_strings[] = {
@@ -304,8 +307,6 @@ static const char * const taa_strings[] = {
static void __init taa_select_mitigation(void)
{
- u64 ia32_cap;
-
if (!boot_cpu_has_bug(X86_BUG_TAA)) {
taa_mitigation = TAA_MITIGATION_OFF;
return;
@@ -344,9 +345,8 @@ static void __init taa_select_mitigation(void)
* On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode
* update is required.
*/
- ia32_cap = x86_read_arch_cap_msr();
- if ( (ia32_cap & ARCH_CAP_MDS_NO) &&
- !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR))
+ if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR))
taa_mitigation = TAA_MITIGATION_UCODE_NEEDED;
/*
@@ -356,7 +356,7 @@ static void __init taa_select_mitigation(void)
* For guests that can't determine whether the correct microcode is
* present on host, enable the mitigation for UCODE_NEEDED as well.
*/
- static_branch_enable(&mds_user_clear);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
if (taa_nosmt || cpu_mitigations_auto_nosmt())
cpu_smt_disable(false);
@@ -393,7 +393,8 @@ enum mmio_mitigations {
};
/* Default mitigation for Processor MMIO Stale Data vulnerabilities */
-static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW;
+static enum mmio_mitigations mmio_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_MMIO_STALE_DATA) ? MMIO_MITIGATION_VERW : MMIO_MITIGATION_OFF;
static bool mmio_nosmt __ro_after_init = false;
static const char * const mmio_strings[] = {
@@ -404,8 +405,6 @@ static const char * const mmio_strings[] = {
static void __init mmio_select_mitigation(void)
{
- u64 ia32_cap;
-
if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) ||
boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) ||
cpu_mitigations_off()) {
@@ -416,15 +415,20 @@ static void __init mmio_select_mitigation(void)
if (mmio_mitigation == MMIO_MITIGATION_OFF)
return;
- ia32_cap = x86_read_arch_cap_msr();
-
/*
* Enable CPU buffer clear mitigation for host and VMM, if also affected
* by MDS or TAA. Otherwise, enable mitigation for VMM only.
*/
if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) &&
boot_cpu_has(X86_FEATURE_RTM)))
- static_branch_enable(&mds_user_clear);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+
+ /*
+ * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW-based
+ * mitigations; disable the KVM-only mitigation in that case.
+ */
+ if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF))
+ static_branch_disable(&mmio_stale_data_clear);
else
static_branch_enable(&mmio_stale_data_clear);
@@ -433,7 +437,7 @@ static void __init mmio_select_mitigation(void)
* be propagated to uncore buffers, clearing the Fill buffers on idle
* is required irrespective of SMT state.
*/
- if (!(ia32_cap & ARCH_CAP_FBSDP_NO))
+ if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO))
static_branch_enable(&mds_idle_clear);
/*
@@ -443,10 +447,10 @@ static void __init mmio_select_mitigation(void)
* FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS
* affected systems.
*/
- if ((ia32_cap & ARCH_CAP_FB_CLEAR) ||
+ if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) ||
(boot_cpu_has(X86_FEATURE_MD_CLEAR) &&
boot_cpu_has(X86_FEATURE_FLUSH_L1D) &&
- !(ia32_cap & ARCH_CAP_MDS_NO)))
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)))
mmio_mitigation = MMIO_MITIGATION_VERW;
else
mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED;
@@ -477,6 +481,57 @@ static int __init mmio_stale_data_parse_cmdline(char *str)
early_param("mmio_stale_data", mmio_stale_data_parse_cmdline);
#undef pr_fmt
+#define pr_fmt(fmt) "Register File Data Sampling: " fmt
+
+enum rfds_mitigations {
+ RFDS_MITIGATION_OFF,
+ RFDS_MITIGATION_VERW,
+ RFDS_MITIGATION_UCODE_NEEDED,
+};
+
+/* Default mitigation for Register File Data Sampling */
+static enum rfds_mitigations rfds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_VERW : RFDS_MITIGATION_OFF;
+
+static const char * const rfds_strings[] = {
+ [RFDS_MITIGATION_OFF] = "Vulnerable",
+ [RFDS_MITIGATION_VERW] = "Mitigation: Clear Register File",
+ [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
+};
+
+static void __init rfds_select_mitigation(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) {
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ return;
+ }
+ if (rfds_mitigation == RFDS_MITIGATION_OFF)
+ return;
+
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF);
+ else
+ rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED;
+}
+
+static __init int rfds_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!boot_cpu_has_bug(X86_BUG_RFDS))
+ return 0;
+
+ if (!strcmp(str, "off"))
+ rfds_mitigation = RFDS_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+
+ return 0;
+}
+early_param("reg_file_data_sampling", rfds_parse_cmdline);
+
+#undef pr_fmt
#define pr_fmt(fmt) "" fmt
static void __init md_clear_update_mitigation(void)
@@ -484,12 +539,12 @@ static void __init md_clear_update_mitigation(void)
if (cpu_mitigations_off())
return;
- if (!static_key_enabled(&mds_user_clear))
+ if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF))
goto out;
/*
- * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data
- * mitigation, if necessary.
+ * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO
+ * Stale Data mitigation, if necessary.
*/
if (mds_mitigation == MDS_MITIGATION_OFF &&
boot_cpu_has_bug(X86_BUG_MDS)) {
@@ -501,11 +556,19 @@ static void __init md_clear_update_mitigation(void)
taa_mitigation = TAA_MITIGATION_VERW;
taa_select_mitigation();
}
- if (mmio_mitigation == MMIO_MITIGATION_OFF &&
- boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
+ /*
+ * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear
+ * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state.
+ */
+ if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) {
mmio_mitigation = MMIO_MITIGATION_VERW;
mmio_select_mitigation();
}
+ if (rfds_mitigation == RFDS_MITIGATION_OFF &&
+ boot_cpu_has_bug(X86_BUG_RFDS)) {
+ rfds_mitigation = RFDS_MITIGATION_VERW;
+ rfds_select_mitigation();
+ }
out:
if (boot_cpu_has_bug(X86_BUG_MDS))
pr_info("MDS: %s\n", mds_strings[mds_mitigation]);
@@ -515,6 +578,8 @@ out:
pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]);
else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN))
pr_info("MMIO Stale Data: Unknown: No mitigations\n");
+ if (boot_cpu_has_bug(X86_BUG_RFDS))
+ pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]);
}
static void __init md_clear_select_mitigation(void)
@@ -522,11 +587,12 @@ static void __init md_clear_select_mitigation(void)
mds_select_mitigation();
taa_select_mitigation();
mmio_select_mitigation();
+ rfds_select_mitigation();
/*
- * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update
- * and print their mitigation after MDS, TAA and MMIO Stale Data
- * mitigation selection is done.
+ * As these mitigations are inter-related and rely on the VERW instruction
+ * to clear the microarchitectural buffers, update and print their status
+ * after mitigation selection is done for each of these vulnerabilities.
*/
md_clear_update_mitigation();
}
@@ -542,7 +608,8 @@ enum srbds_mitigations {
SRBDS_MITIGATION_HYPERVISOR,
};
-static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL;
+static enum srbds_mitigations srbds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF;
static const char * const srbds_strings[] = {
[SRBDS_MITIGATION_OFF] = "Vulnerable",
@@ -593,8 +660,6 @@ void update_srbds_msr(void)
static void __init srbds_select_mitigation(void)
{
- u64 ia32_cap;
-
if (!boot_cpu_has_bug(X86_BUG_SRBDS))
return;
@@ -603,8 +668,7 @@ static void __init srbds_select_mitigation(void)
* are only exposed to SRBDS when TSX is enabled or when CPU is affected
* by Processor MMIO Stale Data vulnerability.
*/
- ia32_cap = x86_read_arch_cap_msr();
- if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
+ if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) &&
!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA))
srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
@@ -671,11 +735,8 @@ enum gds_mitigations {
GDS_MITIGATION_HYPERVISOR,
};
-#if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION)
-static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE;
-#else
-static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL;
-#endif
+static enum gds_mitigations gds_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_FULL : GDS_MITIGATION_OFF;
static const char * const gds_strings[] = {
[GDS_MITIGATION_OFF] = "Vulnerable",
@@ -747,7 +808,7 @@ static void __init gds_select_mitigation(void)
/* Will verify below that mitigation _can_ be disabled */
/* No microcode */
- if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) {
+ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) {
if (gds_mitigation == GDS_MITIGATION_FORCE) {
/*
* This only needs to be done on the boot CPU so do it
@@ -811,7 +872,8 @@ enum spectre_v1_mitigation {
};
static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init =
- SPECTRE_V1_MITIGATION_AUTO;
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V1) ?
+ SPECTRE_V1_MITIGATION_AUTO : SPECTRE_V1_MITIGATION_NONE;
static const char * const spectre_v1_strings[] = {
[SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers",
@@ -926,7 +988,7 @@ static const char * const retbleed_strings[] = {
static enum retbleed_mitigation retbleed_mitigation __ro_after_init =
RETBLEED_MITIGATION_NONE;
static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init =
- RETBLEED_CMD_AUTO;
+ IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF;
static int __ro_after_init retbleed_nosmt = false;
@@ -982,10 +1044,10 @@ static void __init retbleed_select_mitigation(void)
return;
case RETBLEED_CMD_UNRET:
- if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) {
+ if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) {
retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
} else {
- pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n");
+ pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n");
goto do_cmd_auto;
}
break;
@@ -994,24 +1056,24 @@ static void __init retbleed_select_mitigation(void)
if (!boot_cpu_has(X86_FEATURE_IBPB)) {
pr_err("WARNING: CPU does not support IBPB.\n");
goto do_cmd_auto;
- } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) {
+ } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
} else {
- pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
goto do_cmd_auto;
}
break;
case RETBLEED_CMD_STUFF:
- if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) &&
+ if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) &&
spectre_v2_enabled == SPECTRE_V2_RETPOLINE) {
retbleed_mitigation = RETBLEED_MITIGATION_STUFF;
} else {
- if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING))
+ if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING))
pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n");
else
- pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n");
+ pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n");
goto do_cmd_auto;
}
@@ -1021,9 +1083,10 @@ do_cmd_auto:
case RETBLEED_CMD_AUTO:
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) {
- if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY))
+ if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY))
retbleed_mitigation = RETBLEED_MITIGATION_UNRET;
- else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB))
+ else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) &&
+ boot_cpu_has(X86_FEATURE_IBPB))
retbleed_mitigation = RETBLEED_MITIGATION_IBPB;
}
@@ -1054,6 +1117,22 @@ do_cmd_auto:
setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
mitigate_smt = true;
+
+ /*
+ * IBPB on entry already obviates the need for software-based
+ * untraining, so clear those feature bits in case some other
+ * mitigation like SRSO has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+
+ /*
+ * There is no need for RSB filling: entry_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+
break;
case RETBLEED_MITIGATION_STUFF:
@@ -1102,7 +1181,7 @@ static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init =
static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init =
SPECTRE_V2_USER_NONE;
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
static bool spectre_v2_bad_module;
bool retpoline_module_ok(bool has_retpoline)
@@ -1386,17 +1465,18 @@ static void __init spec_v2_print_cond(const char *reason, bool secure)
static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
{
- enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
+ enum spectre_v2_mitigation_cmd cmd;
char arg[20];
int ret, i;
+ cmd = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? SPECTRE_V2_CMD_AUTO : SPECTRE_V2_CMD_NONE;
if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
cpu_mitigations_off())
return SPECTRE_V2_CMD_NONE;
ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
if (ret < 0)
- return SPECTRE_V2_CMD_AUTO;
+ return cmd;
for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
if (!match_option(arg, ret, mitigation_options[i].option))
@@ -1406,8 +1486,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
}
if (i >= ARRAY_SIZE(mitigation_options)) {
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
- return SPECTRE_V2_CMD_AUTO;
+ pr_err("unknown option (%s). Switching to default mode\n", arg);
+ return cmd;
}
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
@@ -1415,7 +1495,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC ||
cmd == SPECTRE_V2_CMD_EIBRS_LFENCE ||
cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) &&
- !IS_ENABLED(CONFIG_RETPOLINE)) {
+ !IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
pr_err("%s selected but not compiled in. Switching to AUTO select\n",
mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
@@ -1438,7 +1518,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
return SPECTRE_V2_CMD_AUTO;
}
- if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) {
+ if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY)) {
pr_err("%s selected but not compiled in. Switching to AUTO select\n",
mitigation_options[i].option);
return SPECTRE_V2_CMD_AUTO;
@@ -1469,7 +1549,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
{
- if (!IS_ENABLED(CONFIG_RETPOLINE)) {
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE)) {
pr_err("Kernel not compiled with retpoline; no mitigation available!");
return SPECTRE_V2_NONE;
}
@@ -1477,20 +1557,25 @@ static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void)
return SPECTRE_V2_RETPOLINE;
}
+static bool __ro_after_init rrsba_disabled;
+
/* Disable in-kernel use of non-RSB RET predictors */
static void __init spec_ctrl_disable_kernel_rrsba(void)
{
- u64 ia32_cap;
+ if (rrsba_disabled)
+ return;
- if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
+ if (!(x86_arch_cap_msr & ARCH_CAP_RRSBA)) {
+ rrsba_disabled = true;
return;
+ }
- ia32_cap = x86_read_arch_cap_msr();
+ if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL))
+ return;
- if (ia32_cap & ARCH_CAP_RRSBA) {
- x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
- update_spec_ctrl(x86_spec_ctrl_base);
- }
+ x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ rrsba_disabled = true;
}
static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
@@ -1540,6 +1625,80 @@ static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_
dump_stack();
}
+/*
+ * Set BHI_DIS_S to prevent indirect branches in the kernel from being
+ * influenced by branch history in userspace. Not needed if BHI_NO is set.
+ */
+static bool __init spec_ctrl_bhi_dis(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BHI_CTRL))
+ return false;
+
+ x86_spec_ctrl_base |= SPEC_CTRL_BHI_DIS_S;
+ update_spec_ctrl(x86_spec_ctrl_base);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_HW);
+
+ return true;
+}
+
+enum bhi_mitigations {
+ BHI_MITIGATION_OFF,
+ BHI_MITIGATION_ON,
+ BHI_MITIGATION_VMEXIT_ONLY,
+};
+
+static enum bhi_mitigations bhi_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF;
+
+static int __init spectre_bhi_parse_cmdline(char *str)
+{
+ if (!str)
+ return -EINVAL;
+
+ if (!strcmp(str, "off"))
+ bhi_mitigation = BHI_MITIGATION_OFF;
+ else if (!strcmp(str, "on"))
+ bhi_mitigation = BHI_MITIGATION_ON;
+ else if (!strcmp(str, "vmexit"))
+ bhi_mitigation = BHI_MITIGATION_VMEXIT_ONLY;
+ else
+ pr_err("Ignoring unknown spectre_bhi option (%s)", str);
+
+ return 0;
+}
+early_param("spectre_bhi", spectre_bhi_parse_cmdline);
+
+static void __init bhi_select_mitigation(void)
+{
+ if (bhi_mitigation == BHI_MITIGATION_OFF)
+ return;
+
+ /* Retpoline mitigates against BHI unless the CPU has RRSBA behavior */
+ if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) {
+ spec_ctrl_disable_kernel_rrsba();
+ if (rrsba_disabled)
+ return;
+ }
+
+ /* Mitigate in hardware if supported */
+ if (spec_ctrl_bhi_dis())
+ return;
+
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return;
+
+ if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) {
+ pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
+ return;
+ }
+
+ pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n");
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP);
+ setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT);
+}
+
static void __init spectre_v2_select_mitigation(void)
{
enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
@@ -1564,7 +1723,7 @@ static void __init spectre_v2_select_mitigation(void)
break;
}
- if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) &&
+ if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) &&
boot_cpu_has_bug(X86_BUG_RETBLEED) &&
retbleed_cmd != RETBLEED_CMD_OFF &&
retbleed_cmd != RETBLEED_CMD_STUFF &&
@@ -1651,6 +1810,9 @@ static void __init spectre_v2_select_mitigation(void)
mode == SPECTRE_V2_RETPOLINE)
spec_ctrl_disable_kernel_rrsba();
+ if (boot_cpu_has(X86_BUG_BHI))
+ bhi_select_mitigation();
+
spectre_v2_enabled = mode;
pr_info("%s\n", spectre_v2_strings[mode]);
@@ -1765,8 +1927,6 @@ static void update_indir_branch_cond(void)
/* Update the static key controlling the MDS CPU buffer clear in idle */
static void update_mds_branch_idle(void)
{
- u64 ia32_cap = x86_read_arch_cap_msr();
-
/*
* Enable the idle clearing if SMT is active on CPUs which are
* affected only by MSBDS and not any other MDS variant.
@@ -1781,7 +1941,7 @@ static void update_mds_branch_idle(void)
if (sched_smt_active()) {
static_branch_enable(&mds_idle_clear);
} else if (mmio_mitigation == MMIO_MITIGATION_OFF ||
- (ia32_cap & ARCH_CAP_FBSDP_NO)) {
+ (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) {
static_branch_disable(&mds_idle_clear);
}
}
@@ -1880,10 +2040,12 @@ static const struct {
static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
{
- enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO;
+ enum ssb_mitigation_cmd cmd;
char arg[20];
int ret, i;
+ cmd = IS_ENABLED(CONFIG_MITIGATION_SSB) ?
+ SPEC_STORE_BYPASS_CMD_AUTO : SPEC_STORE_BYPASS_CMD_NONE;
if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
cpu_mitigations_off()) {
return SPEC_STORE_BYPASS_CMD_NONE;
@@ -1891,7 +2053,7 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
arg, sizeof(arg));
if (ret < 0)
- return SPEC_STORE_BYPASS_CMD_AUTO;
+ return cmd;
for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) {
if (!match_option(arg, ret, ssb_mitigation_options[i].option))
@@ -1902,8 +2064,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
}
if (i >= ARRAY_SIZE(ssb_mitigation_options)) {
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
- return SPEC_STORE_BYPASS_CMD_AUTO;
+ pr_err("unknown option (%s). Switching to default mode\n", arg);
+ return cmd;
}
}
@@ -2230,7 +2392,8 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation);
#define pr_fmt(fmt) "L1TF: " fmt
/* Default mitigation for L1TF-affected CPUs */
-enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH;
+enum l1tf_mitigations l1tf_mitigation __ro_after_init =
+ IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF;
#if IS_ENABLED(CONFIG_KVM_INTEL)
EXPORT_SYMBOL_GPL(l1tf_mitigation);
#endif
@@ -2256,20 +2419,20 @@ static void override_cache_bits(struct cpuinfo_x86 *c)
if (c->x86 != 6)
return;
- switch (c->x86_model) {
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_SANDYBRIDGE:
- case INTEL_FAM6_IVYBRIDGE:
- case INTEL_FAM6_HASWELL:
- case INTEL_FAM6_HASWELL_L:
- case INTEL_FAM6_HASWELL_G:
- case INTEL_FAM6_BROADWELL:
- case INTEL_FAM6_BROADWELL_G:
- case INTEL_FAM6_SKYLAKE_L:
- case INTEL_FAM6_SKYLAKE:
- case INTEL_FAM6_KABYLAKE_L:
- case INTEL_FAM6_KABYLAKE:
+ switch (c->x86_vfm) {
+ case INTEL_NEHALEM:
+ case INTEL_WESTMERE:
+ case INTEL_SANDYBRIDGE:
+ case INTEL_IVYBRIDGE:
+ case INTEL_HASWELL:
+ case INTEL_HASWELL_L:
+ case INTEL_HASWELL_G:
+ case INTEL_BROADWELL:
+ case INTEL_BROADWELL_G:
+ case INTEL_SKYLAKE_L:
+ case INTEL_SKYLAKE:
+ case INTEL_KABYLAKE_L:
+ case INTEL_KABYLAKE:
if (c->x86_cache_bits < 44)
c->x86_cache_bits = 44;
break;
@@ -2410,10 +2573,9 @@ static void __init srso_select_mitigation(void)
{
bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE);
- if (cpu_mitigations_off())
- return;
-
- if (!boot_cpu_has_bug(X86_BUG_SRSO)) {
+ if (!boot_cpu_has_bug(X86_BUG_SRSO) ||
+ cpu_mitigations_off() ||
+ srso_cmd == SRSO_CMD_OFF) {
if (boot_cpu_has(X86_FEATURE_SBPB))
x86_pred_cmd = PRED_CMD_SBPB;
return;
@@ -2444,11 +2606,6 @@ static void __init srso_select_mitigation(void)
}
switch (srso_cmd) {
- case SRSO_CMD_OFF:
- if (boot_cpu_has(X86_FEATURE_SBPB))
- x86_pred_cmd = PRED_CMD_SBPB;
- return;
-
case SRSO_CMD_MICROCODE:
if (has_microcode) {
srso_mitigation = SRSO_MITIGATION_MICROCODE;
@@ -2457,7 +2614,10 @@ static void __init srso_select_mitigation(void)
break;
case SRSO_CMD_SAFE_RET:
- if (IS_ENABLED(CONFIG_CPU_SRSO)) {
+ if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO))
+ goto ibpb_on_vmexit;
+
+ if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) {
/*
* Enable the return thunk for generated code
* like ftrace, static_call, etc.
@@ -2477,30 +2637,56 @@ static void __init srso_select_mitigation(void)
else
srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED;
} else {
- pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
+ pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n");
}
break;
case SRSO_CMD_IBPB:
- if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) {
+ if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
if (has_microcode) {
setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+ setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
srso_mitigation = SRSO_MITIGATION_IBPB;
+
+ /*
+ * IBPB on entry already obviates the need for software-based
+ * untraining, so clear those feature bits in case some other
+ * mitigation like Retbleed has selected them.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_UNRET);
+ setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+
+ /*
+ * There is no need for RSB filling: entry_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
}
} else {
- pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
}
break;
+ibpb_on_vmexit:
case SRSO_CMD_IBPB_ON_VMEXIT:
- if (IS_ENABLED(CONFIG_CPU_SRSO)) {
- if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) {
+ if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) {
+ if (has_microcode) {
setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+
+ /*
+ * There is no need for RSB filling: entry_ibpb() ensures
+ * all predictions, including the RSB, are invalidated,
+ * regardless of IBPB implementation.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
}
} else {
- pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");
- }
+ pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n");
+ }
+ break;
+ default:
break;
}
@@ -2615,6 +2801,11 @@ static ssize_t mmio_stale_data_show_state(char *buf)
sched_smt_active() ? "vulnerable" : "disabled");
}
+static ssize_t rfds_show_state(char *buf)
+{
+ return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]);
+}
+
static char *stibp_state(void)
{
if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) &&
@@ -2623,15 +2814,15 @@ static char *stibp_state(void)
switch (spectre_v2_user_stibp) {
case SPECTRE_V2_USER_NONE:
- return ", STIBP: disabled";
+ return "; STIBP: disabled";
case SPECTRE_V2_USER_STRICT:
- return ", STIBP: forced";
+ return "; STIBP: forced";
case SPECTRE_V2_USER_STRICT_PREFERRED:
- return ", STIBP: always-on";
+ return "; STIBP: always-on";
case SPECTRE_V2_USER_PRCTL:
case SPECTRE_V2_USER_SECCOMP:
if (static_key_enabled(&switch_to_cond_stibp))
- return ", STIBP: conditional";
+ return "; STIBP: conditional";
}
return "";
}
@@ -2640,10 +2831,10 @@ static char *ibpb_state(void)
{
if (boot_cpu_has(X86_FEATURE_IBPB)) {
if (static_key_enabled(&switch_mm_always_ibpb))
- return ", IBPB: always-on";
+ return "; IBPB: always-on";
if (static_key_enabled(&switch_mm_cond_ibpb))
- return ", IBPB: conditional";
- return ", IBPB: disabled";
+ return "; IBPB: conditional";
+ return "; IBPB: disabled";
}
return "";
}
@@ -2653,14 +2844,32 @@ static char *pbrsb_eibrs_state(void)
if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) ||
boot_cpu_has(X86_FEATURE_RSB_VMEXIT))
- return ", PBRSB-eIBRS: SW sequence";
+ return "; PBRSB-eIBRS: SW sequence";
else
- return ", PBRSB-eIBRS: Vulnerable";
+ return "; PBRSB-eIBRS: Vulnerable";
} else {
- return ", PBRSB-eIBRS: Not affected";
+ return "; PBRSB-eIBRS: Not affected";
}
}
+static const char *spectre_bhi_state(void)
+{
+ if (!boot_cpu_has_bug(X86_BUG_BHI))
+ return "; BHI: Not affected";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_HW))
+ return "; BHI: BHI_DIS_S";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP))
+ return "; BHI: SW loop, KVM: SW loop";
+ else if (boot_cpu_has(X86_FEATURE_RETPOLINE) &&
+ !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) &&
+ rrsba_disabled)
+ return "; BHI: Retpoline";
+ else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT))
+ return "; BHI: Vulnerable, KVM: SW loop";
+
+ return "; BHI: Vulnerable";
+}
+
static ssize_t spectre_v2_show_state(char *buf)
{
if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
@@ -2673,13 +2882,15 @@ static ssize_t spectre_v2_show_state(char *buf)
spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
- return sysfs_emit(buf, "%s%s%s%s%s%s%s\n",
+ return sysfs_emit(buf, "%s%s%s%s%s%s%s%s\n",
spectre_v2_strings[spectre_v2_enabled],
ibpb_state(),
- boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
+ boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? "; IBRS_FW" : "",
stibp_state(),
- boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? "; RSB filling" : "",
pbrsb_eibrs_state(),
+ spectre_bhi_state(),
+ /* this should always be at the end */
spectre_v2_module_string());
}
@@ -2774,6 +2985,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_GDS:
return gds_show_state(buf);
+ case X86_BUG_RFDS:
+ return rfds_show_state(buf);
+
default:
break;
}
@@ -2848,4 +3062,14 @@ ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *bu
{
return cpu_show_common(dev, attr, buf, X86_BUG_GDS);
}
+
+ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpu_show_common(dev, attr, buf, X86_BUG_RFDS);
+}
#endif
+
+void __warn_thunk(void)
+{
+ WARN_ONCE(1, "Unpatched return thunk in use. This should not happen!\n");
+}
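The bugs.c changes above add BHI state to the spectre_v2 reporting and a new RFDS entry under the CPU vulnerabilities directory. Below is a small sketch of how the result can be inspected from user space; the reg_file_data_sampling attribute name is assumed from the cpu_show_reg_file_data_sampling() hook and the usual /sys/devices/system/cpu/vulnerabilities/ naming, since the DEVICE_ATTR wiring itself is outside this diff.

#include <stdio.h>

static void show(const char *path)
{
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(line, sizeof(line), f))
		printf("%s: %s", path, line);	/* line already ends in '\n' */
	fclose(f);
}

int main(void)
{
	/* Shows e.g. "; BHI: BHI_DIS_S" appended to the spectre_v2 string. */
	show("/sys/devices/system/cpu/vulnerabilities/spectre_v2");
	/* Shows one of the rfds_strings[] values, e.g.
	 * "Mitigation: Clear Register File". */
	show("/sys/devices/system/cpu/vulnerabilities/reg_file_data_sampling");
	return 0;
}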
diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c
new file mode 100644
index 000000000000..6cba85c79d42
--- /dev/null
+++ b/arch/x86/kernel/cpu/bus_lock.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "x86/split lock detection: " fmt
+
+#include <linux/semaphore.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/cpuhotplug.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cmdline.h>
+#include <asm/traps.h>
+#include <asm/cpu.h>
+
+enum split_lock_detect_state {
+ sld_off = 0,
+ sld_warn,
+ sld_fatal,
+ sld_ratelimit,
+};
+
+/*
+ * Default to sld_off because most systems do not support split lock detection.
+ * sld_state_setup() will switch this to sld_warn on systems that support
+ * split lock/bus lock detect, unless there is a command line override.
+ */
+static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
+static u64 msr_test_ctrl_cache __ro_after_init;
+
+/*
+ * With a name like MSR_TEST_CTL it should go without saying, but don't touch
+ * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it
+ * on CPUs that do not support SLD can cause fireworks, even when writing '0'.
+ */
+static bool cpu_model_supports_sld __ro_after_init;
+
+static const struct {
+ const char *option;
+ enum split_lock_detect_state state;
+} sld_options[] __initconst = {
+ { "off", sld_off },
+ { "warn", sld_warn },
+ { "fatal", sld_fatal },
+ { "ratelimit:", sld_ratelimit },
+};
+
+static struct ratelimit_state bld_ratelimit;
+
+static unsigned int sysctl_sld_mitigate = 1;
+static DEFINE_SEMAPHORE(buslock_sem, 1);
+
+#ifdef CONFIG_PROC_SYSCTL
+static const struct ctl_table sld_sysctls[] = {
+ {
+ .procname = "split_lock_mitigate",
+ .data = &sysctl_sld_mitigate,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+};
+
+static int __init sld_mitigate_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sld_sysctls);
+ return 0;
+}
+
+late_initcall(sld_mitigate_sysctl_init);
+#endif
+
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+ int len = strlen(opt), ratelimit;
+
+ if (strncmp(arg, opt, len))
+ return false;
+
+ /*
+ * Min ratelimit is 1 bus lock/sec.
+ * Max ratelimit is 1000 bus locks/sec.
+ */
+ if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
+ ratelimit > 0 && ratelimit <= 1000) {
+ ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
+ ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
+ return true;
+ }
+
+ return len == arglen;
+}
+
+static bool split_lock_verify_msr(bool on)
+{
+ u64 ctrl, tmp;
+
+ if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl))
+ return false;
+ if (on)
+ ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ else
+ ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+ if (wrmsrl_safe(MSR_TEST_CTRL, ctrl))
+ return false;
+ rdmsrl(MSR_TEST_CTRL, tmp);
+ return ctrl == tmp;
+}
+
+static void __init sld_state_setup(void)
+{
+ enum split_lock_detect_state state = sld_warn;
+ char arg[20];
+ int i, ret;
+
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ return;
+
+ ret = cmdline_find_option(boot_command_line, "split_lock_detect",
+ arg, sizeof(arg));
+ if (ret >= 0) {
+ for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
+ if (match_option(arg, ret, sld_options[i].option)) {
+ state = sld_options[i].state;
+ break;
+ }
+ }
+ }
+ sld_state = state;
+}
+
+static void __init __split_lock_setup(void)
+{
+ if (!split_lock_verify_msr(false)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ if (!split_lock_verify_msr(true)) {
+ pr_info("MSR access failed: Disabled\n");
+ return;
+ }
+
+ /* Restore the MSR to its cached value. */
+ wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+ setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
+}
+
+/*
+ * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
+ * is not implemented as one thread could undo the setting of the other
+ * thread immediately after dropping the lock anyway.
+ */
+static void sld_update_msr(bool on)
+{
+ u64 test_ctrl_val = msr_test_ctrl_cache;
+
+ if (on)
+ test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
+
+ wrmsrl(MSR_TEST_CTRL, test_ctrl_val);
+}
+
+void split_lock_init(void)
+{
+ /*
+ * In sld_ratelimit mode, rate limiting is handled by the #DB handler
+ * for bus locks and #AC for split lock is disabled.
+ */
+ if (sld_state == sld_ratelimit) {
+ split_lock_verify_msr(false);
+ return;
+ }
+
+ if (cpu_model_supports_sld)
+ split_lock_verify_msr(sld_state != sld_off);
+}
+
+static void __split_lock_reenable_unlock(struct work_struct *work)
+{
+ sld_update_msr(true);
+ up(&buslock_sem);
+}
+
+static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock);
+
+static void __split_lock_reenable(struct work_struct *work)
+{
+ sld_update_msr(true);
+}
+static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable);
+
+/*
+ * If a CPU goes offline with pending delayed work to re-enable split lock
+ * detection then the delayed work will be executed on some other CPU. That
+ * handles releasing the buslock_sem, but because it executes on a
+ * different CPU probably won't re-enable split lock detection. This is a
+ * problem on HT systems since the sibling CPU on the same core may then be
+ * left running with split lock detection disabled.
+ *
+ * Unconditionally re-enable detection here.
+ */
+static int splitlock_cpu_offline(unsigned int cpu)
+{
+ sld_update_msr(true);
+
+ return 0;
+}
+
+static void split_lock_warn(unsigned long ip)
+{
+ struct delayed_work *work;
+ int cpu;
+
+ if (!current->reported_split_lock)
+ pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, ip);
+ current->reported_split_lock = 1;
+
+ if (sysctl_sld_mitigate) {
+ /*
+ * Misery factor #1:
+ * sleep 10ms before the split-locked instruction is retried.
+ */
+ if (msleep_interruptible(10) > 0)
+ return;
+ /*
+ * Misery factor #2:
+ * only allow one core at a time to run with split lock detection disabled.
+ */
+ if (down_interruptible(&buslock_sem) == -EINTR)
+ return;
+ work = &sl_reenable_unlock;
+ } else {
+ work = &sl_reenable;
+ }
+
+ cpu = get_cpu();
+ schedule_delayed_work_on(cpu, work, 2);
+
+ /* Disable split lock detection on this CPU to make progress */
+ sld_update_msr(false);
+ put_cpu();
+}
+
+bool handle_guest_split_lock(unsigned long ip)
+{
+ if (sld_state == sld_warn) {
+ split_lock_warn(ip);
+ return true;
+ }
+
+ pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
+ current->comm, current->pid,
+ sld_state == sld_fatal ? "fatal" : "bogus", ip);
+
+ current->thread.error_code = 0;
+ current->thread.trap_nr = X86_TRAP_AC;
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ return false;
+}
+EXPORT_SYMBOL_GPL(handle_guest_split_lock);
+
+void bus_lock_init(void)
+{
+ u64 val;
+
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ return;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
+
+ if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ (sld_state == sld_warn || sld_state == sld_fatal)) ||
+ sld_state == sld_off) {
+ /*
+ * Warn and fatal are handled by #AC for split lock if #AC for
+ * split lock is supported.
+ */
+ val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
+ } else {
+ val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+ }
+
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
+}
+
+bool handle_user_split_lock(struct pt_regs *regs, long error_code)
+{
+ if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
+ return false;
+ split_lock_warn(regs->ip);
+ return true;
+}
+
+void handle_bus_lock(struct pt_regs *regs)
+{
+ switch (sld_state) {
+ case sld_off:
+ break;
+ case sld_ratelimit:
+ /* Enforce no more than bld_ratelimit bus locks/sec. */
+ while (!__ratelimit(&bld_ratelimit))
+ msleep(20);
+ /* Warn on the bus lock. */
+ fallthrough;
+ case sld_warn:
+ pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, regs->ip);
+ break;
+ case sld_fatal:
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ break;
+ }
+}
+
+/*
+ * CPU models that are known to have the per-core split-lock detection
+ * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
+ */
+static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_L, 0),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, 0),
+ {}
+};
+
+static void __init split_lock_setup(struct cpuinfo_x86 *c)
+{
+ const struct x86_cpu_id *m;
+ u64 ia32_core_caps;
+
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return;
+
+ /* Check for CPUs that have support but do not enumerate it: */
+ m = x86_match_cpu(split_lock_cpu_ids);
+ if (m)
+ goto supported;
+
+ if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
+ return;
+
+ /*
+ * Not all bits in MSR_IA32_CORE_CAPS are architectural, but
+ * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set
+ * it have split lock detection.
+ */
+ rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
+ if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
+ goto supported;
+
+ /* CPU is not in the model list and does not have the MSR bit: */
+ return;
+
+supported:
+ cpu_model_supports_sld = true;
+ __split_lock_setup();
+}
+
+static void sld_state_show(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return;
+
+ switch (sld_state) {
+ case sld_off:
+ pr_info("disabled\n");
+ break;
+ case sld_warn:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
+ if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
+ pr_warn("No splitlock CPU offline handler\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: warning on user-space bus_locks\n");
+ }
+ break;
+ case sld_fatal:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
+ boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
+ " from non-WB" : "");
+ }
+ break;
+ case sld_ratelimit:
+ if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
+ break;
+ }
+}
+
+void __init sld_setup(struct cpuinfo_x86 *c)
+{
+ split_lock_setup(c);
+ sld_state_setup();
+ sld_state_show();
+}
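For testing the handlers in the new bus_lock.c: a split lock is a locked read-modify-write whose operand straddles a cache-line boundary. The user-space sketch below provokes one on purpose; with split_lock_detect=warn it should trigger the ratelimited message in split_lock_warn(), and with =fatal the process should receive SIGBUS. The buffer layout and the compiler builtin used here are illustrative, not taken from the kernel tree.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* 128 bytes aligned to a 64-byte cache line. */
	static _Alignas(64) unsigned char buf[128];

	/*
	 * Point a 4-byte counter at offset 62 so it crosses the boundary
	 * between the first and second cache line (intentionally misaligned).
	 */
	volatile uint32_t *split = (volatile uint32_t *)(buf + 62);

	/* A locked RMW on a line-crossing operand asserts the bus lock. */
	__sync_fetch_and_add(split, 1);

	printf("split-locked increment survived, value=%u\n", (unsigned)*split);
	return 0;
}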
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index c131c412db89..a6c6bccfa8b8 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -178,8 +178,6 @@ struct _cpuid4_info_regs {
struct amd_northbridge *nb;
};
-static unsigned short num_cache_leaves;
-
/* AMD doesn't have CPUID4. Emulate it here to report the same
information to the user. This makes some assumptions about the machine:
L2 not shared, no SMT etc. that is currently true on AMD CPUs.
@@ -301,7 +299,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
eax->split.type = types[leaf];
eax->split.level = levels[leaf];
eax->split.num_threads_sharing = 0;
- eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
+ eax->split.num_cores_on_die = topology_num_cores_per_package();
if (assoc == 0xffff)
@@ -595,7 +593,7 @@ static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
if (index < 3)
return;
- node = topology_die_id(smp_processor_id());
+ node = topology_amd_node_id(smp_processor_id());
this_leaf->nb = node_to_amd_nb(node);
if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
amd_calc_l3_indices(this_leaf->nb);
@@ -661,7 +659,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
return i;
}
-void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c)
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id)
{
/*
* We may have multiple LLCs if L3 caches exist, so check if we
@@ -672,7 +670,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c)
if (c->x86 < 0x17) {
/* LLC is at the node level. */
- c->topo.llc_id = c->topo.die_id;
+ c->topo.llc_id = die_id;
} else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
/*
* LLC is at the core complex level.
@@ -717,20 +715,23 @@ void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c)
void init_amd_cacheinfo(struct cpuinfo_x86 *c)
{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
- num_cache_leaves = find_num_cache_leaves(c);
+ ci->num_leaves = find_num_cache_leaves(c);
} else if (c->extended_cpuid_level >= 0x80000006) {
if (cpuid_edx(0x80000006) & 0xf000)
- num_cache_leaves = 4;
+ ci->num_leaves = 4;
else
- num_cache_leaves = 3;
+ ci->num_leaves = 3;
}
}
void init_hygon_cacheinfo(struct cpuinfo_x86 *c)
{
- num_cache_leaves = find_num_cache_leaves(c);
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
+
+ ci->num_leaves = find_num_cache_leaves(c);
}
void init_intel_cacheinfo(struct cpuinfo_x86 *c)
@@ -740,21 +741,21 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index);
if (c->cpuid_level > 3) {
- static int is_initialized;
-
- if (is_initialized == 0) {
- /* Init num_cache_leaves from boot CPU */
- num_cache_leaves = find_num_cache_leaves(c);
- is_initialized++;
- }
+ /*
+ * There should be at least one leaf. A non-zero value means
+ * that the number of leaves has been initialized.
+ */
+ if (!ci->num_leaves)
+ ci->num_leaves = find_num_cache_leaves(c);
/*
* Whenever possible use cpuid(4), deterministic cache
* parameters cpuid leaf to find the cache details
*/
- for (i = 0; i < num_cache_leaves; i++) {
+ for (i = 0; i < ci->num_leaves; i++) {
struct _cpuid4_info_regs this_leaf = {};
int retval;
@@ -790,14 +791,14 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
* Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for
* trace cache
*/
- if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
+ if ((!ci->num_leaves || c->x86 == 15) && c->cpuid_level > 1) {
/* supports eax=2 call */
int j, n;
unsigned int regs[4];
unsigned char *dp = (unsigned char *)regs;
int only_trace = 0;
- if (num_cache_leaves != 0 && c->x86 == 15)
+ if (ci->num_leaves && c->x86 == 15)
only_trace = 1;
/* Number of times to iterate */
@@ -807,7 +808,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
/* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
+ for (j = 0 ; j < 4 ; j++)
if (regs[j] & (1 << 31))
regs[j] = 0;
@@ -991,14 +992,12 @@ static void ci_leaf_init(struct cacheinfo *this_leaf,
int init_cache_level(unsigned int cpu)
{
- struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
- if (!num_cache_leaves)
+ /* There should be at least one leaf. */
+ if (!ci->num_leaves)
return -ENOENT;
- if (!this_cpu_ci)
- return -EINVAL;
- this_cpu_ci->num_levels = 3;
- this_cpu_ci->num_leaves = num_cache_leaves;
+
return 0;
}
@@ -1118,15 +1117,16 @@ static void cache_cpu_init(void)
unsigned long flags;
local_irq_save(flags);
- cache_disable();
- if (memory_caching_control & CACHE_MTRR)
+ if (memory_caching_control & CACHE_MTRR) {
+ cache_disable();
mtrr_generic_set_state();
+ cache_enable();
+ }
if (memory_caching_control & CACHE_PAT)
pat_cpu_init();
- cache_enable();
local_irq_restore(flags);
}
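One of the cacheinfo.c fixes above widens the CPUID leaf 2 validity check from three registers to all four (j < 4): bit 31 of each of EAX/EBX/ECX/EDX marks that register's descriptor bytes as invalid. Below is a stand-alone sketch of the same walk, using the compiler's <cpuid.h> helper instead of the kernel's cpuid() wrapper.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int regs[4];
	int j;

	if (!__get_cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]))
		return 1;	/* CPUID leaf 2 not supported */

	/* Bit 31 set means that register holds no valid descriptors. */
	for (j = 0; j < 4; j++)
		if (regs[j] & (1u << 31))
			regs[j] = 0;

	/* The low byte of EAX is the iteration count, not a descriptor. */
	regs[0] &= ~0xffu;

	for (j = 0; j < 4; j++)
		printf("reg[%d] = 0x%08x\n", j, regs[j]);

	return 0;
}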
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 345f7d905db6..a3b55db35c96 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -128,10 +128,6 @@ static void init_centaur(struct cpuinfo_x86 *c)
#endif
early_init_centaur(c);
init_intel_cacheinfo(c);
- detect_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
if (c->cpuid_level > 9) {
unsigned int eax = cpuid_eax(10);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b14fc8c1c953..7cce91b19fb2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -29,6 +29,7 @@
#include <asm/alternative.h>
#include <asm/cmdline.h>
+#include <asm/cpuid.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/doublefault.h>
@@ -61,19 +62,38 @@
#include <asm/microcode.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
+#include <asm/fred.h>
#include <asm/uv/uv.h>
#include <asm/ia32.h>
#include <asm/set_memory.h>
#include <asm/traps.h>
#include <asm/sev.h>
+#include <asm/tdx.h>
+#include <asm/posted_intr.h>
+#include <asm/runtime-const.h>
#include "cpu.h"
+DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
+EXPORT_PER_CPU_SYMBOL(cpu_info);
+
u32 elf_hwcap2 __read_mostly;
/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-EXPORT_SYMBOL(smp_num_siblings);
+unsigned int __max_threads_per_core __ro_after_init = 1;
+EXPORT_SYMBOL(__max_threads_per_core);
+
+unsigned int __max_dies_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__max_dies_per_package);
+
+unsigned int __max_logical_packages __ro_after_init = 1;
+EXPORT_SYMBOL(__max_logical_packages);
+
+unsigned int __num_cores_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_cores_per_package);
+
+unsigned int __num_threads_per_package __ro_after_init = 1;
+EXPORT_SYMBOL(__num_threads_per_package);
static struct ppin_info {
int feature;
@@ -97,17 +117,17 @@ static const struct x86_cpu_id ppin_cpuids[] = {
X86_MATCH_FEATURE(X86_FEATURE_INTEL_PPIN, &ppin_info[X86_VENDOR_INTEL]),
/* Legacy models without CPUID enumeration */
- X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
- X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_IVYBRIDGE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_HASWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_BROADWELL_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SKYLAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_ICELAKE_D, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &ppin_info[X86_VENDOR_INTEL]),
+ X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &ppin_info[X86_VENDOR_INTEL]),
{}
};
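The PPIN table above now keys on packed vendor/family/model (VFM) values via X86_MATCH_VFM() rather than the old family-6-only macro. The sketch below only illustrates the general idea of such packing; the shift positions and vendor value are assumptions, not the kernel's actual VFM encoding:

#include <stdint.h>
#include <stdio.h>

/* Illustrative packing only; layout and values are assumptions. */
#define VFM_MAKE(vendor, family, model) \
	(((uint32_t)(vendor) << 16) | ((uint32_t)(family) << 8) | (model))

int main(void)
{
	uint32_t cpu_vfm   = VFM_MAKE(0 /* Intel */, 6, 0x3a);
	uint32_t match_vfm = VFM_MAKE(0 /* Intel */, 6, 0x3a);

	/* One integer compare covers vendor, family and model at once. */
	printf("match: %d\n", cpu_vfm == match_vfm);
	return 0;
}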
@@ -150,7 +170,7 @@ static void ppin_init(struct cpuinfo_x86 *c)
}
clear_ppin:
- clear_cpu_cap(c, info->feature);
+ setup_clear_cpu_cap(info->feature);
}
static void default_init(struct cpuinfo_x86 *c)
@@ -188,45 +208,37 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
* TLS descriptors are currently at a different place compared to i386.
* Hopefully nobody expects them at a fixed place (Wine?)
*/
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE64, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA64, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA64 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE64 | DESC_USER, 0, 0xfffff),
#else
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff),
- [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(DESC_CODE32, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(DESC_CODE32 | DESC_USER, 0, 0xfffff),
+ [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(DESC_DATA32 | DESC_USER, 0, 0xfffff),
/*
* Segments used for calling PnP BIOS have byte granularity.
* The code and data segments have fixed 64k limits;
* the transfer segment sizes are set at run time.
*/
- /* 32-bit code */
- [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0),
- /* 16-bit data */
- [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0),
+ [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0xffff),
+ [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
+ [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(DESC_DATA16, 0, 0),
/*
* The APM segments have byte granularity and their bases
* are set at run time. All have 64k limits.
*/
- /* 32-bit code */
- [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff),
- /* 16-bit code */
- [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff),
- /* data */
- [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff),
-
- [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
+ [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(DESC_CODE32_BIOS, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(DESC_CODE16, 0, 0xffff),
+ [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(DESC_DATA32_BIOS, 0, 0xffff),
+
+ [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
+ [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(DESC_DATA32, 0, 0xfffff),
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
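The descriptor table above swaps raw 16-bit type words for named DESC_* constants. As a reader aid (not kernel code), the sketch below decodes a few of the replaced raw values per the SDM segment-descriptor layout: the low byte is the access byte (present, DPL, S, code/data), the top nibble holds granularity, D/B and L:

#include <stdint.h>
#include <stdio.h>

static void decode(uint16_t v)
{
	uint8_t access = v & 0xff;         /* P, DPL, S, type */
	uint8_t nibble = (v >> 12) & 0xf;  /* G, D/B, L, AVL  */

	printf("0x%04x: P=%u DPL=%u code=%u G=%u D/B=%u L=%u\n", v,
	       (access >> 7) & 1, (access >> 5) & 3, (access >> 3) & 1,
	       (nibble >> 3) & 1, (nibble >> 2) & 1, (nibble >> 1) & 1);
}

int main(void)
{
	decode(0xc09b);   /* old KERNEL32_CS value, now DESC_CODE32 */
	decode(0xa09b);   /* old KERNEL_CS value,   now DESC_CODE64 */
	decode(0xc093);   /* old KERNEL_DS value,   now DESC_DATA64 */
	return 0;
}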
@@ -265,21 +277,13 @@ static int __init x86_noinvpcid_setup(char *s)
}
early_param("noinvpcid", x86_noinvpcid_setup);
-#ifdef CONFIG_X86_32
-static int cachesize_override = -1;
-static int disable_x86_serial_nr = 1;
-
-static int __init cachesize_setup(char *str)
-{
- get_option(&str, &cachesize_override);
- return 1;
-}
-__setup("cachesize=", cachesize_setup);
-
/* Standard macro to see if a specific flag is changeable */
-static inline int flag_is_changeable_p(u32 flag)
+static inline bool flag_is_changeable_p(unsigned long flag)
{
- u32 f1, f2;
+ unsigned long f1, f2;
+
+ if (!IS_ENABLED(CONFIG_X86_32))
+ return true;
/*
* Cyrix and IDT cpus allow disabling of CPUID
@@ -302,11 +306,22 @@ static inline int flag_is_changeable_p(u32 flag)
: "=&r" (f1), "=&r" (f2)
: "ir" (flag));
- return ((f1^f2) & flag) != 0;
+ return (f1 ^ f2) & flag;
}
+#ifdef CONFIG_X86_32
+static int cachesize_override = -1;
+static int disable_x86_serial_nr = 1;
+
+static int __init cachesize_setup(char *str)
+{
+ get_option(&str, &cachesize_override);
+ return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
/* Probe for the CPUID instruction */
-int have_cpuid_p(void)
+bool have_cpuid_p(void)
{
return flag_is_changeable_p(X86_EFLAGS_ID);
}
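With this change flag_is_changeable_p() short-circuits to true on 64-bit, where CPUID always exists, and only runs the EFLAGS toggle on 32-bit. The probe flips EFLAGS.ID (bit 21) and checks whether the flip sticks across a popf/pushf round trip. A userspace re-creation of that probe, mirroring the kernel's asm and offered only as a sketch:

#include <stdbool.h>
#include <stdio.h>

#define X86_EFLAGS_ID (1UL << 21)

/* Userspace sketch of the EFLAGS.ID probe used by flag_is_changeable_p(). */
static bool cpuid_present(void)
{
	unsigned long f1, f2;

	asm volatile("pushf\n\t"
		     "pushf\n\t"
		     "pop %0\n\t"
		     "mov %0, %1\n\t"
		     "xor %2, %0\n\t"
		     "push %0\n\t"
		     "popf\n\t"
		     "pushf\n\t"
		     "pop %0\n\t"
		     "popf\n\t"
		     : "=&r" (f1), "=&r" (f2)
		     : "ir" (X86_EFLAGS_ID));

	return (f1 ^ f2) & X86_EFLAGS_ID;
}

int main(void)
{
	printf("cpuid: %s\n", cpuid_present() ? "yes" : "no");
	return 0;
}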
@@ -338,10 +353,6 @@ static int __init x86_serial_nr_setup(char *s)
}
__setup("serialnumber", x86_serial_nr_setup);
#else
-static inline int flag_is_changeable_p(u32 flag)
-{
- return 1;
-}
static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
}
@@ -389,9 +400,8 @@ out:
}
/* These bits should not change their value after CPU init is finished. */
-static const unsigned long cr4_pinned_mask =
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
- X86_CR4_FSGSBASE | X86_CR4_CET;
+static const unsigned long cr4_pinned_mask = X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
+ X86_CR4_FSGSBASE | X86_CR4_CET | X86_CR4_FRED;
static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
static unsigned long cr4_pinned_bits __ro_after_init;
@@ -627,9 +637,9 @@ struct cpuid_dependent_feature {
static const struct cpuid_dependent_feature
cpuid_dependent_features[] = {
- { X86_FEATURE_MWAIT, 0x00000005 },
- { X86_FEATURE_DCA, 0x00000009 },
- { X86_FEATURE_XSAVE, 0x0000000d },
+ { X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT },
+ { X86_FEATURE_DCA, CPUID_LEAF_DCA },
+ { X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE },
{ 0, 0 }
};
@@ -797,19 +807,6 @@ static void get_model_name(struct cpuinfo_x86 *c)
*(s + 1) = '\0';
}
-void detect_num_cpu_cores(struct cpuinfo_x86 *c)
-{
- unsigned int eax, ebx, ecx, edx;
-
- c->x86_max_cores = 1;
- if (!IS_ENABLED(CONFIG_SMP) || c->cpuid_level < 4)
- return;
-
- cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
- if (eax & 0x1f)
- c->x86_max_cores = (eax >> 26) + 1;
-}
-
void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, ebx, ecx, edx, l2size;
@@ -871,52 +868,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
}
-int detect_ht_early(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- u32 eax, ebx, ecx, edx;
-
- if (!cpu_has(c, X86_FEATURE_HT))
- return -1;
-
- if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
- return -1;
-
- if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
- return -1;
-
- cpuid(1, &eax, &ebx, &ecx, &edx);
-
- smp_num_siblings = (ebx & 0xff0000) >> 16;
- if (smp_num_siblings == 1)
- pr_info_once("CPU0: Hyper-Threading is disabled\n");
-#endif
- return 0;
-}
-
-void detect_ht(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- int index_msb, core_bits;
-
- if (detect_ht_early(c) < 0)
- return;
-
- index_msb = get_count_order(smp_num_siblings);
- c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb);
-
- smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
- index_msb = get_count_order(smp_num_siblings);
-
- core_bits = get_count_order(c->x86_max_cores);
-
- c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid, index_msb) &
- ((1 << core_bits) - 1);
-#endif
-}
-
-static void get_cpu_vendor(struct cpuinfo_x86 *c)
+void get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
int i;
@@ -1103,18 +1055,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
void get_cpu_address_sizes(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
- bool vp_bits_from_cpuid = true;
if (!cpu_has(c, X86_FEATURE_CPUID) ||
- (c->extended_cpuid_level < 0x80000008))
- vp_bits_from_cpuid = false;
-
- if (vp_bits_from_cpuid) {
- cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
-
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- } else {
+ (c->extended_cpuid_level < 0x80000008)) {
if (IS_ENABLED(CONFIG_X86_64)) {
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -1128,14 +1071,23 @@ void get_cpu_address_sizes(struct cpuinfo_x86 *c)
cpu_has(c, X86_FEATURE_PSE36))
c->x86_phys_bits = 36;
}
+ } else {
+ cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
+
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+
+ /* Provide a sane default if not enumerated: */
+ if (!c->x86_clflush_size)
+ c->x86_clflush_size = 32;
}
+
c->x86_cache_bits = c->x86_phys_bits;
c->x86_cache_alignment = c->x86_clflush_size;
}
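The reordered branch above prefers CPUID leaf 0x80000008 when it is available and falls back to the static defaults otherwise. For reference, EAX of that leaf encodes the physical address width in bits 7:0 and the linear address width in bits 15:8; a tiny decode with a made-up sample value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t eax = 0x00003028;   /* hypothetical: 48 virt / 40 phys */

	printf("virt_bits=%u phys_bits=%u\n",
	       (eax >> 8) & 0xff, eax & 0xff);
	return 0;
}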
static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_32
int i;
/*
@@ -1156,7 +1108,6 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
break;
}
}
-#endif
}
#define NO_SPECULATION BIT(0)
@@ -1170,12 +1121,13 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#define NO_SPECTRE_V2 BIT(8)
#define NO_MMIO BIT(9)
#define NO_EIBRS_PBRSB BIT(10)
+#define NO_BHI BIT(11)
#define VULNWL(vendor, family, model, whitelist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
-#define VULNWL_INTEL(model, whitelist) \
- VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
+#define VULNWL_INTEL(vfm, whitelist) \
+ X86_MATCH_VFM(vfm, whitelist)
#define VULNWL_AMD(family, whitelist) \
VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
@@ -1192,32 +1144,32 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
VULNWL(VORTEX, 6, X86_MODEL_ANY, NO_SPECULATION),
/* Intel Family 6 */
- VULNWL_INTEL(TIGERLAKE, NO_MMIO),
- VULNWL_INTEL(TIGERLAKE_L, NO_MMIO),
- VULNWL_INTEL(ALDERLAKE, NO_MMIO),
- VULNWL_INTEL(ALDERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_TIGERLAKE_L, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE, NO_MMIO),
+ VULNWL_INTEL(INTEL_ALDERLAKE_L, NO_MMIO),
- VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(CORE_YONAH, NO_SSB),
+ VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB),
- VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY),
+ VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT),
- VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_INTEL(INTEL_ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
/*
* Technically, swapgs isn't serializing on AMD (despite it previously
@@ -1227,33 +1179,31 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
* good enough for our purposes.
*/
- VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB),
- VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB),
- VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_L, NO_EIBRS_PBRSB),
+ VULNWL_INTEL(INTEL_ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB),
/* AMD Family 0xf - 0x12 */
- VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
- VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO),
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_BHI),
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
- VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
- VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB),
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
+ VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO | NO_EIBRS_PBRSB | NO_BHI),
/* Zhaoxin Family 7 */
- VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO),
- VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO),
+ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
+ VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS | NO_MMIO | NO_BHI),
{}
};
#define VULNBL(vendor, family, model, blacklist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)
-#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
- X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
- INTEL_FAM6_##model, steppings, \
- X86_FEATURE_ANY, issues)
+#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \
+ X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues)
#define VULNBL_AMD(family, blacklist) \
VULNBL(AMD, family, X86_MODEL_ANY, blacklist)
@@ -1274,42 +1224,54 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define SRSO BIT(5)
/* CPU is affected by GDS */
#define GDS BIT(6)
+/* CPU is affected by Register File Data Sampling */
+#define RFDS BIT(7)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
- VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(HASWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
- VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
- VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
- VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS),
- VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS),
- VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
- VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO),
- VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS),
+ VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO),
+ VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
+ VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS),
+ VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED),
+ VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS),
+ VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS),
+ VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
+ VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS),
+ VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS),
+ VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS),
VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),
VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO),
VULNBL_HYGON(0x18, RETBLEED | SMT_RSB | SRSO),
VULNBL_AMD(0x19, SRSO),
+ VULNBL_AMD(0x1a, SRSO),
{}
};
@@ -1322,28 +1284,46 @@ static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long whi
u64 x86_read_arch_cap_msr(void)
{
- u64 ia32_cap = 0;
+ u64 x86_arch_cap_msr = 0;
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr);
- return ia32_cap;
+ return x86_arch_cap_msr;
}
-static bool arch_cap_mmio_immune(u64 ia32_cap)
+static bool arch_cap_mmio_immune(u64 x86_arch_cap_msr)
{
- return (ia32_cap & ARCH_CAP_FBSDP_NO &&
- ia32_cap & ARCH_CAP_PSDP_NO &&
- ia32_cap & ARCH_CAP_SBDR_SSDP_NO);
+ return (x86_arch_cap_msr & ARCH_CAP_FBSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_PSDP_NO &&
+ x86_arch_cap_msr & ARCH_CAP_SBDR_SSDP_NO);
+}
+
+static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr)
+{
+ /* The "immunity" bit trumps everything else: */
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_NO)
+ return false;
+
+ /*
+ * VMMs set ARCH_CAP_RFDS_CLEAR for processors not in the blacklist to
+	 * indicate that mitigation is needed because the guest is running on
+	 * vulnerable hardware or may migrate to such hardware:
+ */
+ if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR)
+ return true;
+
+ /* Only consult the blacklist when there is no enumeration: */
+ return cpu_matches(cpu_vuln_blacklist, RFDS);
}
static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
{
- u64 ia32_cap = x86_read_arch_cap_msr();
+ u64 x86_arch_cap_msr = x86_read_arch_cap_msr();
/* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
- !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
+ !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO))
setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION))
@@ -1355,23 +1335,28 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
- !(ia32_cap & ARCH_CAP_SSB_NO) &&
+ !(x86_arch_cap_msr & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
/*
* AMD's AutoIBRS is equivalent to Intel's eIBRS - use the Intel feature
* flag and protect from vendor-specific bugs via the whitelist.
+ *
+ * Don't use AutoIBRS when SNP is enabled because it degrades host
+ * userspace indirect branch performance.
*/
- if ((ia32_cap & ARCH_CAP_IBRS_ALL) || cpu_has(c, X86_FEATURE_AUTOIBRS)) {
+ if ((x86_arch_cap_msr & ARCH_CAP_IBRS_ALL) ||
+ (cpu_has(c, X86_FEATURE_AUTOIBRS) &&
+ !cpu_feature_enabled(X86_FEATURE_SEV_SNP))) {
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
if (!cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) &&
- !(ia32_cap & ARCH_CAP_PBRSB_NO))
+ !(x86_arch_cap_msr & ARCH_CAP_PBRSB_NO))
setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB);
}
if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
- !(ia32_cap & ARCH_CAP_MDS_NO)) {
+ !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)) {
setup_force_cpu_bug(X86_BUG_MDS);
if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY))
setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
@@ -1390,9 +1375,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* The TSX_CTRL check alone is not sufficient when the microcode update
* is not present or when running as a guest that doesn't get TSX_CTRL.
*/
- if (!(ia32_cap & ARCH_CAP_TAA_NO) &&
+ if (!(x86_arch_cap_msr & ARCH_CAP_TAA_NO) &&
(cpu_has(c, X86_FEATURE_RTM) ||
- (ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
+ (x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)))
setup_force_cpu_bug(X86_BUG_TAA);
/*
@@ -1418,7 +1403,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist,
* nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits.
*/
- if (!arch_cap_mmio_immune(ia32_cap)) {
+ if (!arch_cap_mmio_immune(x86_arch_cap_msr)) {
if (cpu_matches(cpu_vuln_blacklist, MMIO))
setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA);
else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO))
@@ -1426,7 +1411,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
}
if (!cpu_has(c, X86_FEATURE_BTC_NO)) {
- if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (ia32_cap & ARCH_CAP_RSBA))
+ if (cpu_matches(cpu_vuln_blacklist, RETBLEED) || (x86_arch_cap_msr & ARCH_CAP_RSBA))
setup_force_cpu_bug(X86_BUG_RETBLEED);
}
@@ -1444,15 +1429,28 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
* disabling AVX2. The only way to do this in HW is to clear XCR0[2],
* which means that AVX will be disabled.
*/
- if (cpu_matches(cpu_vuln_blacklist, GDS) && !(ia32_cap & ARCH_CAP_GDS_NO) &&
+ if (cpu_matches(cpu_vuln_blacklist, GDS) && !(x86_arch_cap_msr & ARCH_CAP_GDS_NO) &&
boot_cpu_has(X86_FEATURE_AVX))
setup_force_cpu_bug(X86_BUG_GDS);
+ if (vulnerable_to_rfds(x86_arch_cap_msr))
+ setup_force_cpu_bug(X86_BUG_RFDS);
+
+	/* When virtualized, eIBRS could be hidden; assume vulnerable. */
+ if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) &&
+ !cpu_matches(cpu_vuln_whitelist, NO_BHI) &&
+ (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) ||
+ boot_cpu_has(X86_FEATURE_HYPERVISOR)))
+ setup_force_cpu_bug(X86_BUG_BHI);
+
+ if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET))
+ setup_force_cpu_bug(X86_BUG_IBPB_NO_RET);
+
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
/* Rogue Data Cache Load? No! */
- if (ia32_cap & ARCH_CAP_RDCL_NO)
+ if (x86_arch_cap_msr & ARCH_CAP_RDCL_NO)
return;
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
@@ -1515,6 +1513,11 @@ static void __init cpu_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "nousershstk"))
setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK);
+	/* Minimize the window between detecting FRED and disabling it when "fred=on" is not given. */
+ arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg));
+ if (arglen != 2 || strncmp(arg, "on", 2))
+ setup_clear_cpu_cap(X86_FEATURE_FRED);
+
arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
if (arglen <= 0)
return;
@@ -1594,10 +1597,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
if (have_cpuid_p()) {
cpu_detect(c);
get_cpu_vendor(c);
+ intel_unlock_cpuid_leafs(c);
get_cpu_cap(c);
setup_force_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
cpu_parse_early_param();
+ cpu_init_topology(c);
+
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
@@ -1608,10 +1615,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
this_cpu->c_bsp_init(c);
} else {
setup_clear_cpu_cap(X86_FEATURE_CPUID);
+ get_cpu_address_sizes(c);
+ cpu_init_topology(c);
}
- get_cpu_address_sizes(c);
-
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
cpu_set_bug_bits(c);
@@ -1644,15 +1651,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
detect_nopl();
}
-void __init early_cpu_init(void)
+void __init init_cpu_devs(void)
{
const struct cpu_dev *const *cdev;
int count = 0;
-#ifdef CONFIG_PROCESSOR_SELECT
- pr_info("KERNEL supported cpus:\n");
-#endif
-
for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
const struct cpu_dev *cpudev = *cdev;
@@ -1660,20 +1663,30 @@ void __init early_cpu_init(void)
break;
cpu_devs[count] = cpudev;
count++;
+ }
+}
+void __init early_cpu_init(void)
+{
#ifdef CONFIG_PROCESSOR_SELECT
- {
- unsigned int j;
-
- for (j = 0; j < 2; j++) {
- if (!cpudev->c_ident[j])
- continue;
- pr_info(" %s %s\n", cpudev->c_vendor,
- cpudev->c_ident[j]);
- }
- }
+ unsigned int i, j;
+
+ pr_info("KERNEL supported cpus:\n");
#endif
+
+ init_cpu_devs();
+
+#ifdef CONFIG_PROCESSOR_SELECT
+ for (i = 0; i < X86_VENDOR_NUM && cpu_devs[i]; i++) {
+ for (j = 0; j < 2; j++) {
+ if (!cpu_devs[i]->c_ident[j])
+ continue;
+ pr_info(" %s %s\n", cpu_devs[i]->c_vendor,
+ cpu_devs[i]->c_ident[j]);
+ }
}
+#endif
+
early_identify_cpu(&boot_cpu_data);
}
@@ -1750,23 +1763,11 @@ static void generic_identify(struct cpuinfo_x86 *c)
cpu_detect(c);
get_cpu_vendor(c);
-
+ intel_unlock_cpuid_leafs(c);
get_cpu_cap(c);
get_cpu_address_sizes(c);
- if (c->cpuid_level >= 0x00000001) {
- c->topo.initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
-#ifdef CONFIG_X86_32
-# ifdef CONFIG_SMP
- c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
-# else
- c->topo.apicid = c->topo.initial_apicid;
-# endif
-#endif
- c->topo.pkg_id = c->topo.initial_apicid;
- }
-
get_model_name(c); /* Default name */
/*
@@ -1788,29 +1789,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
}
/*
- * Validate that ACPI/mptables have the same information about the
- * effective APIC id and update the package map.
- */
-static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned int cpu = smp_processor_id();
- u32 apicid;
-
- apicid = apic->cpu_present_to_apicid(cpu);
-
- if (apicid != c->topo.apicid) {
- pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
- cpu, apicid, c->topo.initial_apicid);
- }
- BUG_ON(topology_update_package_map(c->topo.pkg_id, cpu));
- BUG_ON(topology_update_die_map(c->topo.die_id, cpu));
-#else
- c->topo.logical_pkg_id = 0;
-#endif
-}
-
-/*
* This does the hard work of actually picking apart the CPU stuff...
*/
static void identify_cpu(struct cpuinfo_x86 *c)
@@ -1823,11 +1801,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
c->x86_model = c->x86_stepping = 0; /* So far unknown... */
c->x86_vendor_id[0] = '\0'; /* Unset */
c->x86_model_id[0] = '\0'; /* Unset */
- c->x86_max_cores = 1;
- c->x86_coreid_bits = 0;
- c->topo.cu_id = 0xff;
- c->topo.llc_id = BAD_APICID;
- c->topo.l2c_id = BAD_APICID;
#ifdef CONFIG_X86_64
c->x86_clflush_size = 64;
c->x86_phys_bits = 36;
@@ -1846,15 +1819,19 @@ static void identify_cpu(struct cpuinfo_x86 *c)
generic_identify(c);
+ cpu_parse_topology(c);
+
if (this_cpu->c_identify)
this_cpu->c_identify(c);
/* Clear/Set all flags overridden by options, after probe */
apply_forced_caps(c);
-#ifdef CONFIG_X86_64
- c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
-#endif
+ /*
+ * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
+ * Hygon will clear it in ->c_init() below.
+ */
+ set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
/*
* Vendor-specific initialization. In this section we
@@ -1869,6 +1846,8 @@ static void identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_init)
this_cpu->c_init(c);
+ bus_lock_init();
+
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
@@ -1903,10 +1882,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
c->x86, c->x86_model);
}
-#ifdef CONFIG_X86_64
- detect_ht(c);
-#endif
-
x86_init_rdrand(c);
setup_pku(c);
setup_cet(c);
@@ -1938,11 +1913,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Init Machine Check Exception if available. */
mcheck_cpu_init(c);
- select_idle_routine(c);
-
-#ifdef CONFIG_NUMA
numa_add_cpu(smp_processor_id());
-#endif
}
/*
@@ -1987,6 +1958,7 @@ static __init void identify_boot_cpu(void)
setup_cr_pinning();
tsx_init();
+ tdx_init();
lkgs_init();
}
@@ -1997,7 +1969,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_32
enable_sep_cpu();
#endif
- validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
update_srbds_msr();
if (boot_cpu_has_bug(X86_BUG_GDS))
@@ -2049,6 +2020,7 @@ DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = {
.top_of_stack = TOP_OF_INIT_STACK,
};
EXPORT_PER_CPU_SYMBOL(pcpu_hot);
+EXPORT_PER_CPU_SYMBOL(const_pcpu_hot);
#ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
@@ -2066,10 +2038,8 @@ static void wrmsrl_cstar(unsigned long val)
wrmsrl(MSR_CSTAR, val);
}
-/* May not be marked __init: used by software suspend */
-void syscall_init(void)
+static inline void idt_syscall_init(void)
{
- wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
if (ia32_enabled()) {
@@ -2103,12 +2073,31 @@ void syscall_init(void)
X86_EFLAGS_AC|X86_EFLAGS_ID);
}
+/* May not be marked __init: used by software suspend */
+void syscall_init(void)
+{
+ /* The default user and kernel segments */
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+
+ /*
+	 * Except for the IA32_STAR MSR, there is no need to set up the
+	 * SYSCALL and SYSENTER MSRs for FRED, because FRED uses the ring 3
+	 * FRED entrypoint for SYSCALL and SYSENTER, and ERETU is the only
+	 * legitimate instruction to return to ring 3 (both sysexit and
+	 * sysret cause #UD when FRED is enabled).
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ idt_syscall_init();
+}
+
#else /* CONFIG_X86_64 */
#ifdef CONFIG_STACKPROTECTOR
DEFINE_PER_CPU(unsigned long, __stack_chk_guard);
+#ifndef CONFIG_SMP
EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
#endif
+#endif
#endif /* CONFIG_X86_64 */
@@ -2198,7 +2187,7 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss)
* Setup everything needed to handle exceptions from the IDT, including the IST
* exceptions which use paranoid_entry().
*/
-void cpu_init_exception_handling(void)
+void cpu_init_exception_handling(bool boot_cpu)
{
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
int cpu = raw_smp_processor_id();
@@ -2206,8 +2195,9 @@ void cpu_init_exception_handling(void)
/* paranoid_entry() gets the CPU number from the GDT */
setup_getcpu(cpu);
- /* IST vectors need TSS to be set up. */
- tss_setup_ist(tss);
+ /* For IDT mode, IST vectors need to be set in TSS. */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ tss_setup_ist(tss);
tss_setup_io_bitmap(tss);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
@@ -2216,8 +2206,23 @@ void cpu_init_exception_handling(void)
/* GHCB needs to be setup to handle #VC. */
setup_ghcb();
- /* Finally load the IDT */
- load_current_idt();
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ /* The boot CPU has enabled FRED during early boot */
+ if (!boot_cpu)
+ cpu_init_fred_exceptions();
+
+ cpu_init_fred_rsps();
+ } else {
+ load_current_idt();
+ }
+}
+
+void __init cpu_init_replace_early_idt(void)
+{
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ cpu_init_fred_exceptions();
+ else
+ idt_setup_early_pf();
}
/*
@@ -2252,6 +2257,8 @@ void cpu_init(void)
barrier();
x2apic_setup();
+
+ intel_posted_msi_init();
}
mmgrab(&init_mm);
@@ -2340,13 +2347,17 @@ void arch_smt_update(void)
void __init arch_cpu_finalize_init(void)
{
+ struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
+
identify_boot_cpu();
+ select_idle_routine();
+
/*
* identify_boot_cpu() initialized SMT support information, let the
* core code know.
*/
- cpu_smt_set_num_threads(smp_num_siblings, smp_num_siblings);
+ cpu_smt_set_num_threads(__max_threads_per_core, __max_threads_per_core);
if (!IS_ENABLED(CONFIG_SMP)) {
pr_info("CPU: ");
@@ -2376,9 +2387,25 @@ void __init arch_cpu_finalize_init(void)
fpu__init_system();
fpu__init_cpu();
+ /*
+	 * Ensure that accesses to the boot CPU's per-CPU cpuinfo see the
+	 * initial boot CPU configuration.
+ */
+ *c = boot_cpu_data;
+ c->initialized = true;
+
alternative_instructions();
if (IS_ENABLED(CONFIG_X86_64)) {
+ unsigned long USER_PTR_MAX = TASK_SIZE_MAX;
+
+ /*
+ * Enable this when LAM is gated on LASS support
+ if (cpu_feature_enabled(X86_FEATURE_LAM))
+ USER_PTR_MAX = (1ul << 63) - PAGE_SIZE;
+ */
+ runtime_const_init(ptr, USER_PTR_MAX);
+
/*
* Make sure the first 2MB area is not mapped by huge pages
* There are typically fixed size MTRRs in there and overlapping
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 885281ae79a5..1beccefbaff9 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -2,6 +2,11 @@
#ifndef ARCH_X86_CPU_H
#define ARCH_X86_CPU_H
+#include <asm/cpu.h>
+#include <asm/topology.h>
+
+#include "topology.h"
+
/* attempt to consolidate cpu attributes */
struct cpu_dev {
const char *c_vendor;
@@ -56,9 +61,11 @@ extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state;
extern void __init tsx_init(void);
void tsx_ap_init(void);
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c);
#else
static inline void tsx_init(void) { }
static inline void tsx_ap_init(void) { }
+static inline void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c) { }
#endif /* CONFIG_CPU_SUP_INTEL */
extern void init_spectral_chicken(struct cpuinfo_x86 *c);
@@ -71,14 +78,9 @@ extern void init_intel_cacheinfo(struct cpuinfo_x86 *c);
extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
extern void init_hygon_cacheinfo(struct cpuinfo_x86 *c);
-extern void detect_num_cpu_cores(struct cpuinfo_x86 *c);
-extern int detect_extended_topology_early(struct cpuinfo_x86 *c);
-extern int detect_extended_topology(struct cpuinfo_x86 *c);
-extern int detect_ht_early(struct cpuinfo_x86 *c);
-extern void detect_ht(struct cpuinfo_x86 *c);
extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
-void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c);
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id);
void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
@@ -96,4 +98,5 @@ static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode)
mode == SPECTRE_V2_EIBRS_RETPOLINE ||
mode == SPECTRE_V2_EIBRS_LFENCE;
}
+
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index e462c1d3800a..df838e3bdbe0 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -44,7 +44,11 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_F16C, X86_FEATURE_XMM2, },
{ X86_FEATURE_AES, X86_FEATURE_XMM2 },
{ X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_GFNI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX },
{ X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX },
{ X86_FEATURE_AVX2, X86_FEATURE_AVX, },
{ X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
{ X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
@@ -56,9 +60,6 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
{ X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
- { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
- { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
- { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
{ X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
@@ -82,6 +83,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_XFD, X86_FEATURE_XGETBV1 },
{ X86_FEATURE_AMX_TILE, X86_FEATURE_XFD },
{ X86_FEATURE_SHSTK, X86_FEATURE_XSAVES },
+ { X86_FEATURE_FRED, X86_FEATURE_LKGS },
{}
};
@@ -112,6 +114,9 @@ static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
if (WARN_ON(feature >= MAX_FEATURE_BITS))
return;
+ if (boot_cpu_has(feature))
+ WARN_ON(alternatives_patched);
+
clear_feature(c, feature);
/* Collect all features to disable, handling dependencies */
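The cpuid_deps[] entries added above feed the same dependency walk as before: clearing a parent feature (say AVX) must also clear everything that transitively depends on it (VAES, VPCLMULQDQ, AVX2, the AVX-512 family, and so on). A standalone sketch of that walk, with invented types and names rather than the kernel API:

#include <stdbool.h>

struct dep { int feature; int depends; };

/* Invented helper: clear 'feature', then keep sweeping the table until no
 * set feature is left whose dependency has been cleared. */
static void clear_feature_and_deps(const struct dep *deps, int ndeps,
				   bool *caps, int feature)
{
	bool changed;

	caps[feature] = false;
	do {
		changed = false;
		for (int i = 0; i < ndeps; i++) {
			if (caps[deps[i].depends] || !caps[deps[i].feature])
				continue;
			caps[deps[i].feature] = false;
			changed = true;
		}
	} while (changed);
}

enum { AVX, VAES, AVX2, NFEAT };

int main(void)
{
	const struct dep deps[] = { { VAES, AVX }, { AVX2, AVX } };
	bool caps[NFEAT] = { true, true, true };

	clear_feature_and_deps(deps, 2, caps, AVX);
	return caps[VAES] || caps[AVX2];   /* 0: both were cleared */
}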
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 9651275aecd1..dfec2c61e354 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -153,8 +153,8 @@ static void geode_configure(void)
u8 ccr3;
local_irq_save(flags);
- /* Suspend on halt power saving and enable #SUSP pin */
- setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
+ /* Suspend on halt power saving */
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08);
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c
index 0c179d684b3b..cacfd3f6abef 100644
--- a/arch/x86/kernel/cpu/debugfs.c
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -5,6 +5,8 @@
#include <asm/apic.h>
#include <asm/processor.h>
+#include "cpu.h"
+
static int cpu_debug_show(struct seq_file *m, void *p)
{
unsigned long cpu = (unsigned long)m->private;
@@ -20,13 +22,18 @@ static int cpu_debug_show(struct seq_file *m, void *p)
seq_printf(m, "die_id: %u\n", c->topo.die_id);
seq_printf(m, "cu_id: %u\n", c->topo.cu_id);
seq_printf(m, "core_id: %u\n", c->topo.core_id);
+ seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
+ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
- seq_printf(m, "max_cores: %u\n", c->x86_max_cores);
- seq_printf(m, "max_die_per_pkg: %u\n", __max_die_per_package);
- seq_printf(m, "smp_num_siblings: %u\n", smp_num_siblings);
+ seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);
+ seq_printf(m, "amd_nodes_per_pkg: %u\n", topology_amd_nodes_per_pkg());
+ seq_printf(m, "num_threads: %u\n", __num_threads_per_package);
+ seq_printf(m, "num_cores: %u\n", __num_cores_per_package);
+ seq_printf(m, "max_dies_per_pkg: %u\n", __max_dies_per_package);
+ seq_printf(m, "max_threads_per_core:%u\n", __max_threads_per_core);
return 0;
}
@@ -42,12 +49,48 @@ static const struct file_operations dfs_cpu_ops = {
.release = single_release,
};
+static int dom_debug_show(struct seq_file *m, void *p)
+{
+ static const char *domain_names[TOPO_MAX_DOMAIN] = {
+ [TOPO_SMT_DOMAIN] = "Thread",
+ [TOPO_CORE_DOMAIN] = "Core",
+ [TOPO_MODULE_DOMAIN] = "Module",
+ [TOPO_TILE_DOMAIN] = "Tile",
+ [TOPO_DIE_DOMAIN] = "Die",
+ [TOPO_DIEGRP_DOMAIN] = "DieGrp",
+ [TOPO_PKG_DOMAIN] = "Package",
+ };
+ unsigned int dom, nthreads = 1;
+
+ for (dom = 0; dom < TOPO_MAX_DOMAIN; dom++) {
+ nthreads *= x86_topo_system.dom_size[dom];
+ seq_printf(m, "domain: %-10s shift: %u dom_size: %5u max_threads: %5u\n",
+ domain_names[dom], x86_topo_system.dom_shifts[dom],
+ x86_topo_system.dom_size[dom], nthreads);
+ }
+ return 0;
+}
+
+static int dom_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dom_debug_show, inode->i_private);
+}
+
+static const struct file_operations dfs_dom_ops = {
+ .open = dom_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static __init int cpu_init_debugfs(void)
{
struct dentry *dir, *base = debugfs_create_dir("topo", arch_debugfs_dir);
unsigned long id;
char name[24];
+ debugfs_create_file("domains", 0444, base, NULL, &dfs_dom_ops);
+
dir = debugfs_create_dir("cpus", base);
for_each_possible_cpu(id) {
sprintf(name, "%lu", id);
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 03851240c3e3..4a4118784c13 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -72,6 +72,8 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD);
if (ept & VMX_EPT_1GB_PAGE_BIT)
c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB);
+ if (ept & VMX_EPT_PAGE_WALK_5_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_5LEVEL);
/* Synthetic APIC features that are aggregates of multiple features. */
if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
@@ -186,7 +188,7 @@ update_caps:
update_sgx:
if (!(msr & FEAT_CTL_SGX_ENABLED)) {
if (enable_sgx_kvm || enable_sgx_driver)
- pr_err_once("SGX disabled by BIOS.\n");
+ pr_err_once("SGX disabled or unsupported by BIOS.\n");
clear_cpu_cap(c, X86_FEATURE_SGX);
return;
}
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index 6f247d66758d..c5191b06f9f2 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -18,14 +18,6 @@
#include "cpu.h"
-#define APICID_SOCKET_ID_BIT 6
-
-/*
- * nodes_per_socket: Stores the number of nodes per socket.
- * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8]
- */
-static u32 nodes_per_socket = 1;
-
#ifdef CONFIG_NUMA
/*
* To workaround broken NUMA config. Read the comment in
@@ -49,80 +41,6 @@ static int nearby_node(int apicid)
}
#endif
-static void hygon_get_topology_early(struct cpuinfo_x86 *c)
-{
- if (cpu_has(c, X86_FEATURE_TOPOEXT))
- smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
-}
-
-/*
- * Fixup core topology information for
- * (1) Hygon multi-node processors
- * Assumption: Number of cores in each internal node is the same.
- * (2) Hygon processors supporting compute units
- */
-static void hygon_get_topology(struct cpuinfo_x86 *c)
-{
- /* get information required for multi-node processors */
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
- int err;
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-
- c->topo.die_id = ecx & 0xff;
-
- c->topo.core_id = ebx & 0xff;
-
- if (smp_num_siblings > 1)
- c->x86_max_cores /= smp_num_siblings;
-
- /*
- * In case leaf B is available, use it to derive
- * topology information.
- */
- err = detect_extended_topology(c);
- if (!err)
- c->x86_coreid_bits = get_count_order(c->x86_max_cores);
-
- /*
- * Socket ID is ApicId[6] for the processors with model <= 0x3
- * when running on host.
- */
- if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && c->x86_model <= 0x3)
- c->topo.pkg_id = c->topo.apicid >> APICID_SOCKET_ID_BIT;
-
- cacheinfo_hygon_init_llc_id(c);
- } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
- u64 value;
-
- rdmsrl(MSR_FAM10H_NODE_ID, value);
- c->topo.die_id = value & 7;
- c->topo.llc_id = c->topo.die_id;
- } else
- return;
-
- if (nodes_per_socket > 1)
- set_cpu_cap(c, X86_FEATURE_AMD_DCM);
-}
-
-/*
- * On Hygon setup the lower bits of the APIC id distinguish the cores.
- * Assumes number of cores is a power of two.
- */
-static void hygon_detect_cmp(struct cpuinfo_x86 *c)
-{
- unsigned int bits;
-
- bits = c->x86_coreid_bits;
- /* Low order bits define the core id (index of core in socket) */
- c->topo.core_id = c->topo.initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->topo.pkg_id = c->topo.initial_apicid >> bits;
- /* Use package ID also for last level cache */
- c->topo.llc_id = c->topo.die_id = c->topo.pkg_id;
-}
-
static void srat_detect_node(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_NUMA
@@ -173,32 +91,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-static void early_init_hygon_mc(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned int bits, ecx;
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level < 0x80000008)
- return;
-
- ecx = cpuid_ecx(0x80000008);
-
- c->x86_max_cores = (ecx & 0xff) + 1;
-
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
-
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
- }
-
- c->x86_coreid_bits = bits;
-#endif
-}
-
static void bsp_init_hygon(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
@@ -212,18 +104,6 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_MWAITX))
use_mwaitx_delay();
- if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
- u32 ecx;
-
- ecx = cpuid_ecx(0x8000001e);
- __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1;
- } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
- u64 value;
-
- rdmsrl(MSR_FAM10H_NODE_ID, value);
- __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1;
- }
-
if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
!boot_cpu_has(X86_FEATURE_VIRT_SSBD)) {
/*
@@ -242,8 +122,6 @@ static void early_init_hygon(struct cpuinfo_x86 *c)
{
u32 dummy;
- early_init_hygon_mc(c);
-
set_cpu_cap(c, X86_FEATURE_K8);
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
@@ -284,8 +162,6 @@ static void early_init_hygon(struct cpuinfo_x86 *c)
* we can set it unconditionally.
*/
set_cpu_cap(c, X86_FEATURE_VMMCALL);
-
- hygon_get_topology_early(c);
}
static void init_hygon(struct cpuinfo_x86 *c)
@@ -302,9 +178,6 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- /* get apicid instead of initial apic id from cpuid */
- c->topo.apicid = read_apic_id();
-
/*
* XXX someone from Hygon needs to confirm this DTRT
*
@@ -316,8 +189,6 @@ static void init_hygon(struct cpuinfo_x86 *c)
cpu_detect_cache_sizes(c);
- hygon_detect_cmp(c);
- hygon_get_topology(c);
srat_detect_node(c);
init_hygon_cacheinfo(c);
@@ -354,6 +225,9 @@ static void init_hygon(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
check_null_seg_clears_base(c);
+
+ /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
+ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
}
static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index a927a8fc9624..134368a3f4b1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -7,13 +7,9 @@
#include <linux/smp.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
-#include <linux/semaphore.h>
#include <linux/thread_info.h>
#include <linux/init.h>
#include <linux/uaccess.h>
-#include <linux/workqueue.h>
-#include <linux/delay.h>
-#include <linux/cpuhotplug.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
@@ -24,8 +20,6 @@
#include <asm/hwcap2.h>
#include <asm/elf.h>
#include <asm/cpu_device_id.h>
-#include <asm/cmdline.h>
-#include <asm/traps.h>
#include <asm/resctrl.h>
#include <asm/numa.h>
#include <asm/thermal.h>
@@ -41,28 +35,6 @@
#include <asm/apic.h>
#endif
-enum split_lock_detect_state {
- sld_off = 0,
- sld_warn,
- sld_fatal,
- sld_ratelimit,
-};
-
-/*
- * Default to sld_off because most systems do not support split lock detection.
- * sld_state_setup() will switch this to sld_warn on systems that support
- * split lock/bus lock detect, unless there is a command line override.
- */
-static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
-static u64 msr_test_ctrl_cache __ro_after_init;
-
-/*
- * With a name like MSR_TEST_CTL it should go without saying, but don't touch
- * MSR_TEST_CTL unless the CPU is one of the whitelisted models. Writing it
- * on CPUs that do not support SLD can cause fireworks, even when writing '0'.
- */
-static bool cpu_model_supports_sld __ro_after_init;
-
/*
* Processors which have self-snooping capability can handle conflicting
* memory type across CPUs by snooping its own cache. However, there exists
@@ -72,19 +44,19 @@ static bool cpu_model_supports_sld __ro_after_init;
*/
static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
{
- switch (c->x86_model) {
- case INTEL_FAM6_CORE_YONAH:
- case INTEL_FAM6_CORE2_MEROM:
- case INTEL_FAM6_CORE2_MEROM_L:
- case INTEL_FAM6_CORE2_PENRYN:
- case INTEL_FAM6_CORE2_DUNNINGTON:
- case INTEL_FAM6_NEHALEM:
- case INTEL_FAM6_NEHALEM_G:
- case INTEL_FAM6_NEHALEM_EP:
- case INTEL_FAM6_NEHALEM_EX:
- case INTEL_FAM6_WESTMERE:
- case INTEL_FAM6_WESTMERE_EP:
- case INTEL_FAM6_SANDYBRIDGE:
+ switch (c->x86_vfm) {
+ case INTEL_CORE_YONAH:
+ case INTEL_CORE2_MEROM:
+ case INTEL_CORE2_MEROM_L:
+ case INTEL_CORE2_PENRYN:
+ case INTEL_CORE2_DUNNINGTON:
+ case INTEL_NEHALEM:
+ case INTEL_NEHALEM_G:
+ case INTEL_NEHALEM_EP:
+ case INTEL_NEHALEM_EX:
+ case INTEL_WESTMERE:
+ case INTEL_WESTMERE_EP:
+ case INTEL_SANDYBRIDGE:
setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
}
}
@@ -106,9 +78,9 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
*/
if (c->x86 != 6)
return;
- switch (c->x86_model) {
- case INTEL_FAM6_XEON_PHI_KNL:
- case INTEL_FAM6_XEON_PHI_KNM:
+ switch (c->x86_vfm) {
+ case INTEL_XEON_PHI_KNL:
+ case INTEL_XEON_PHI_KNM:
break;
default:
return;
@@ -134,32 +106,32 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
* - Release note from 20180108 microcode release
*/
struct sku_microcode {
- u8 model;
+ u32 vfm;
u8 stepping;
u32 microcode;
};
static const struct sku_microcode spectre_bad_microcodes[] = {
- { INTEL_FAM6_KABYLAKE, 0x0B, 0x80 },
- { INTEL_FAM6_KABYLAKE, 0x0A, 0x80 },
- { INTEL_FAM6_KABYLAKE, 0x09, 0x80 },
- { INTEL_FAM6_KABYLAKE_L, 0x0A, 0x80 },
- { INTEL_FAM6_KABYLAKE_L, 0x09, 0x80 },
- { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e },
- { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c },
- { INTEL_FAM6_BROADWELL, 0x04, 0x28 },
- { INTEL_FAM6_BROADWELL_G, 0x01, 0x1b },
- { INTEL_FAM6_BROADWELL_D, 0x02, 0x14 },
- { INTEL_FAM6_BROADWELL_D, 0x03, 0x07000011 },
- { INTEL_FAM6_BROADWELL_X, 0x01, 0x0b000025 },
- { INTEL_FAM6_HASWELL_L, 0x01, 0x21 },
- { INTEL_FAM6_HASWELL_G, 0x01, 0x18 },
- { INTEL_FAM6_HASWELL, 0x03, 0x23 },
- { INTEL_FAM6_HASWELL_X, 0x02, 0x3b },
- { INTEL_FAM6_HASWELL_X, 0x04, 0x10 },
- { INTEL_FAM6_IVYBRIDGE_X, 0x04, 0x42a },
+ { INTEL_KABYLAKE, 0x0B, 0x80 },
+ { INTEL_KABYLAKE, 0x0A, 0x80 },
+ { INTEL_KABYLAKE, 0x09, 0x80 },
+ { INTEL_KABYLAKE_L, 0x0A, 0x80 },
+ { INTEL_KABYLAKE_L, 0x09, 0x80 },
+ { INTEL_SKYLAKE_X, 0x03, 0x0100013e },
+ { INTEL_SKYLAKE_X, 0x04, 0x0200003c },
+ { INTEL_BROADWELL, 0x04, 0x28 },
+ { INTEL_BROADWELL_G, 0x01, 0x1b },
+ { INTEL_BROADWELL_D, 0x02, 0x14 },
+ { INTEL_BROADWELL_D, 0x03, 0x07000011 },
+ { INTEL_BROADWELL_X, 0x01, 0x0b000025 },
+ { INTEL_HASWELL_L, 0x01, 0x21 },
+ { INTEL_HASWELL_G, 0x01, 0x18 },
+ { INTEL_HASWELL, 0x03, 0x23 },
+ { INTEL_HASWELL_X, 0x02, 0x3b },
+ { INTEL_HASWELL_X, 0x04, 0x10 },
+ { INTEL_IVYBRIDGE_X, 0x04, 0x42a },
/* Observed in the wild */
- { INTEL_FAM6_SANDYBRIDGE_X, 0x06, 0x61b },
- { INTEL_FAM6_SANDYBRIDGE_X, 0x07, 0x712 },
+ { INTEL_SANDYBRIDGE_X, 0x06, 0x61b },
+ { INTEL_SANDYBRIDGE_X, 0x07, 0x712 },
};
static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
@@ -173,29 +145,70 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_HYPERVISOR))
return false;
- if (c->x86 != 6)
- return false;
-
for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
- if (c->x86_model == spectre_bad_microcodes[i].model &&
+ if (c->x86_vfm == spectre_bad_microcodes[i].vfm &&
c->x86_stepping == spectre_bad_microcodes[i].stepping)
return (c->microcode <= spectre_bad_microcodes[i].microcode);
}
return false;
}
-static void early_init_intel(struct cpuinfo_x86 *c)
+#define MSR_IA32_TME_ACTIVATE 0x982
+
+/* Helpers to access TME_ACTIVATE MSR */
+#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
+#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
+
+#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
+
+static void detect_tme_early(struct cpuinfo_x86 *c)
{
- u64 misc_enable;
+ u64 tme_activate;
+ int keyid_bits;
- /* Unmask CPUID levels if masked: */
- if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
- MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
- c->cpuid_level = cpuid_eax(0);
- get_cpu_cap(c);
- }
+ rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
+
+ if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
+ pr_info_once("x86/tme: not enabled by BIOS\n");
+ clear_cpu_cap(c, X86_FEATURE_TME);
+ return;
}
+ pr_info_once("x86/tme: enabled by BIOS\n");
+ keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
+ if (!keyid_bits)
+ return;
+
+ /*
+ * KeyID bits are set by BIOS and can be present regardless
+ * of whether the kernel is using them. They effectively lower
+ * the number of physical address bits.
+ *
+ * Update cpuinfo_x86::x86_phys_bits accordingly.
+ */
+ c->x86_phys_bits -= keyid_bits;
+ pr_info_once("x86/mktme: BIOS enabled: x86_phys_bits reduced by %d\n",
+ keyid_bits);
+}
+
+void intel_unlock_cpuid_leafs(struct cpuinfo_x86 *c)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return;
+
+ if (c->x86 < 6 || (c->x86 == 6 && c->x86_model < 0xd))
+ return;
+
+ /*
+ * The BIOS can have limited CPUID to leaf 2, which breaks feature
+ * enumeration. Unlock it and update the maximum leaf info.
+ */
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE, MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0)
+ c->cpuid_level = cpuid_eax(0);
+}
+
+static void early_init_intel(struct cpuinfo_x86 *c)
+{
+ u64 misc_enable;
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
(c->x86 == 0x6 && c->x86_model >= 0x0e))
@@ -228,7 +241,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* need the microcode to have already been loaded... so if it is
* not, recommend a BIOS update and disable large pages.
*/
- if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_stepping <= 2 &&
+ if (c->x86_vfm == INTEL_ATOM_BONNELL && c->x86_stepping <= 2 &&
c->microcode < 0x20e) {
pr_warn("Atom PSE erratum detected, BIOS microcode update recommended\n");
clear_cpu_cap(c, X86_FEATURE_PSE);
@@ -260,30 +273,28 @@ static void early_init_intel(struct cpuinfo_x86 *c)
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
- if (c->x86 == 6) {
- switch (c->x86_model) {
- case INTEL_FAM6_ATOM_SALTWELL_MID:
- case INTEL_FAM6_ATOM_SALTWELL_TABLET:
- case INTEL_FAM6_ATOM_SILVERMONT_MID:
- case INTEL_FAM6_ATOM_AIRMONT_NP:
- set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
- break;
- default:
- break;
- }
+ switch (c->x86_vfm) {
+ case INTEL_ATOM_SALTWELL_MID:
+ case INTEL_ATOM_SALTWELL_TABLET:
+ case INTEL_ATOM_SILVERMONT_MID:
+ case INTEL_ATOM_AIRMONT_NP:
+ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3);
+ break;
}
/*
- * There is a known erratum on Pentium III and Core Solo
- * and Core Duo CPUs.
- * " Page with PAT set to WC while associated MTRR is UC
- * may consolidate to UC "
- * Because of this erratum, it is better to stick with
- * setting WC in MTRR rather than using PAT on these CPUs.
+ * PAT is broken on early family 6 CPUs, the last of which
+ * is "Yonah" where the erratum is named "AN7":
*
- * Enable PAT WC only on P4, Core 2 or later CPUs.
+ * Page with PAT (Page Attribute Table) Set to USWC
+ * (Uncacheable Speculative Write Combine) While
+ * Associated MTRR (Memory Type Range Register) Is UC
+ * (Uncacheable) May Consolidate to UC
+ *
+ * Disable PAT and fall back to MTRR on these CPUs.
*/
- if (c->x86 == 6 && c->x86_model < 15)
+ if (c->x86_vfm >= INTEL_PENTIUM_PRO &&
+ c->x86_vfm <= INTEL_CORE_YONAH)
clear_cpu_cap(c, X86_FEATURE_PAT);
/*
@@ -309,7 +320,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
* to be modified.
*/
- if (c->x86 == 5 && c->x86_model == 9) {
+ if (c->x86_vfm == INTEL_QUARK_X1000) {
pr_info("Disabling PGE capability bit\n");
setup_clear_cpu_cap(X86_FEATURE_PGE);
}
@@ -317,11 +328,11 @@ static void early_init_intel(struct cpuinfo_x86 *c)
check_memory_type_self_snoop_errata(c);
/*
- * Get the number of SMT siblings early from the extended topology
- * leaf, if available. Otherwise try the legacy SMT detection.
+ * Adjust the number of physical bits early because it affects the
+ * valid bits of the MTRR mask registers.
*/
- if (detect_extended_topology_early(c) < 0)
- detect_ht_early(c);
+ if (cpu_has(c, X86_FEATURE_TME))
+ detect_tme_early(c);
}
static void bsp_init_intel(struct cpuinfo_x86 *c)
@@ -482,90 +493,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-#define MSR_IA32_TME_ACTIVATE 0x982
-
-/* Helpers to access TME_ACTIVATE MSR */
-#define TME_ACTIVATE_LOCKED(x) (x & 0x1)
-#define TME_ACTIVATE_ENABLED(x) (x & 0x2)
-
-#define TME_ACTIVATE_POLICY(x) ((x >> 4) & 0xf) /* Bits 7:4 */
-#define TME_ACTIVATE_POLICY_AES_XTS_128 0
-
-#define TME_ACTIVATE_KEYID_BITS(x) ((x >> 32) & 0xf) /* Bits 35:32 */
-
-#define TME_ACTIVATE_CRYPTO_ALGS(x) ((x >> 48) & 0xffff) /* Bits 63:48 */
-#define TME_ACTIVATE_CRYPTO_AES_XTS_128 1
-
-/* Values for mktme_status (SW only construct) */
-#define MKTME_ENABLED 0
-#define MKTME_DISABLED 1
-#define MKTME_UNINITIALIZED 2
-static int mktme_status = MKTME_UNINITIALIZED;
-
-static void detect_tme(struct cpuinfo_x86 *c)
-{
- u64 tme_activate, tme_policy, tme_crypto_algs;
- int keyid_bits = 0, nr_keyids = 0;
- static u64 tme_activate_cpu0 = 0;
-
- rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate);
-
- if (mktme_status != MKTME_UNINITIALIZED) {
- if (tme_activate != tme_activate_cpu0) {
- /* Broken BIOS? */
- pr_err_once("x86/tme: configuration is inconsistent between CPUs\n");
- pr_err_once("x86/tme: MKTME is not usable\n");
- mktme_status = MKTME_DISABLED;
-
- /* Proceed. We may need to exclude bits from x86_phys_bits. */
- }
- } else {
- tme_activate_cpu0 = tme_activate;
- }
-
- if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) {
- pr_info_once("x86/tme: not enabled by BIOS\n");
- mktme_status = MKTME_DISABLED;
- return;
- }
-
- if (mktme_status != MKTME_UNINITIALIZED)
- goto detect_keyid_bits;
-
- pr_info("x86/tme: enabled by BIOS\n");
-
- tme_policy = TME_ACTIVATE_POLICY(tme_activate);
- if (tme_policy != TME_ACTIVATE_POLICY_AES_XTS_128)
- pr_warn("x86/tme: Unknown policy is active: %#llx\n", tme_policy);
-
- tme_crypto_algs = TME_ACTIVATE_CRYPTO_ALGS(tme_activate);
- if (!(tme_crypto_algs & TME_ACTIVATE_CRYPTO_AES_XTS_128)) {
- pr_err("x86/mktme: No known encryption algorithm is supported: %#llx\n",
- tme_crypto_algs);
- mktme_status = MKTME_DISABLED;
- }
-detect_keyid_bits:
- keyid_bits = TME_ACTIVATE_KEYID_BITS(tme_activate);
- nr_keyids = (1UL << keyid_bits) - 1;
- if (nr_keyids) {
- pr_info_once("x86/mktme: enabled by BIOS\n");
- pr_info_once("x86/mktme: %d KeyIDs available\n", nr_keyids);
- } else {
- pr_info_once("x86/mktme: disabled by BIOS\n");
- }
-
- if (mktme_status == MKTME_UNINITIALIZED) {
- /* MKTME is usable */
- mktme_status = MKTME_ENABLED;
- }
-
- /*
- * KeyID bits effectively lower the number of physical address
- * bits. Update cpuinfo_x86::x86_phys_bits accordingly.
- */
- c->x86_phys_bits -= keyid_bits;
-}
-
static void init_cpuid_fault(struct cpuinfo_x86 *c)
{
u64 msr;
@@ -594,33 +521,12 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
}
-static void split_lock_init(void);
-static void bus_lock_init(void);
-
static void init_intel(struct cpuinfo_x86 *c)
{
early_init_intel(c);
intel_workarounds(c);
- /*
- * Detect the extended topology information if available. This
- * will reinitialise the initial_apicid which will be used
- * in init_intel_cacheinfo()
- */
- detect_extended_topology(c);
-
- if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
- /*
- * let's use the legacy cpuid vector 0x1 and 0x4 for topology
- * detection.
- */
- detect_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
- }
-
init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
@@ -643,12 +549,15 @@ static void init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_PEBS);
}
- if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_CLFLUSH) &&
- (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
+ if (boot_cpu_has(X86_FEATURE_CLFLUSH) &&
+ (c->x86_vfm == INTEL_CORE2_DUNNINGTON ||
+ c->x86_vfm == INTEL_NEHALEM_EX ||
+ c->x86_vfm == INTEL_WESTMERE_EX))
set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
- if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) &&
- ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT)))
+ if (boot_cpu_has(X86_FEATURE_MWAIT) &&
+ (c->x86_vfm == INTEL_ATOM_GOLDMONT ||
+ c->x86_vfm == INTEL_LUNARLAKE_M))
set_cpu_bug(c, X86_BUG_MONITOR);
#ifdef CONFIG_X86_64
@@ -690,11 +599,6 @@ static void init_intel(struct cpuinfo_x86 *c)
if (p)
strcpy(c->x86_model_id, p);
}
-
- if (c->x86 == 15)
- set_cpu_cap(c, X86_FEATURE_P4);
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_P3);
#endif
/* Work around errata */
@@ -702,13 +606,9 @@ static void init_intel(struct cpuinfo_x86 *c)
init_ia32_feat_ctl(c);
- if (cpu_has(c, X86_FEATURE_TME))
- detect_tme(c);
-
init_intel_misc_features(c);
split_lock_init();
- bus_lock_init();
intel_init_thermal(c);
}
@@ -735,26 +635,37 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
}
#endif
-#define TLB_INST_4K 0x01
-#define TLB_INST_4M 0x02
-#define TLB_INST_2M_4M 0x03
+#define TLB_INST_4K 0x01
+#define TLB_INST_4M 0x02
+#define TLB_INST_2M_4M 0x03
-#define TLB_INST_ALL 0x05
-#define TLB_INST_1G 0x06
+#define TLB_INST_ALL 0x05
+#define TLB_INST_1G 0x06
-#define TLB_DATA_4K 0x11
-#define TLB_DATA_4M 0x12
-#define TLB_DATA_2M_4M 0x13
-#define TLB_DATA_4K_4M 0x14
+#define TLB_DATA_4K 0x11
+#define TLB_DATA_4M 0x12
+#define TLB_DATA_2M_4M 0x13
+#define TLB_DATA_4K_4M 0x14
-#define TLB_DATA_1G 0x16
+#define TLB_DATA_1G 0x16
+#define TLB_DATA_1G_2M_4M 0x17
-#define TLB_DATA0_4K 0x21
-#define TLB_DATA0_4M 0x22
-#define TLB_DATA0_2M_4M 0x23
+#define TLB_DATA0_4K 0x21
+#define TLB_DATA0_4M 0x22
+#define TLB_DATA0_2M_4M 0x23
-#define STLB_4K 0x41
-#define STLB_4K_2M 0x42
+#define STLB_4K 0x41
+#define STLB_4K_2M 0x42
+
+/*
+ * Each of leaf 0x2's one-byte TLB descriptors implies the same number of
+ * entries for its respective TLB type. The 0x63 descriptor is an
+ * exception: it implies 4 dTLB entries for 1GB pages and 32 dTLB entries
+ * for 2MB or 4MB pages. Encode descriptor 0x63's dTLB entry count for
+ * 2MB/4MB pages here, as its count for dTLB 1GB pages is already in the
+ * intel_tlb_table[] mapping.
+ */
+#define TLB_0x63_2M_4M_ENTRIES 32
static const struct _tlb_table intel_tlb_table[] = {
{ 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
@@ -776,7 +687,8 @@ static const struct _tlb_table intel_tlb_table[] = {
{ 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
- { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
+ { 0x63, TLB_DATA_1G_2M_4M, 4, " TLB_DATA 1 GByte pages, 4-way set associative"
+ " (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here)" },
{ 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" },
{ 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" },
{ 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" },
@@ -876,6 +788,12 @@ static void intel_tlb_lookup(const unsigned char desc)
if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
+ case TLB_DATA_1G_2M_4M:
+ if (tlb_lld_2m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES)
+ tlb_lld_2m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES;
+ if (tlb_lld_4m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES)
+ tlb_lld_4m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES;
+ fallthrough;
case TLB_DATA_1G:
if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
@@ -899,7 +817,7 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
/* If bit 31 is set, this is an unknown format */
- for (j = 0 ; j < 3 ; j++)
+ for (j = 0 ; j < 4 ; j++)
if (regs[j] & (1 << 31))
regs[j] = 0;
@@ -974,381 +892,6 @@ static const struct cpu_dev intel_cpu_dev = {
cpu_dev_register(intel_cpu_dev);
-#undef pr_fmt
-#define pr_fmt(fmt) "x86/split lock detection: " fmt
-
-static const struct {
- const char *option;
- enum split_lock_detect_state state;
-} sld_options[] __initconst = {
- { "off", sld_off },
- { "warn", sld_warn },
- { "fatal", sld_fatal },
- { "ratelimit:", sld_ratelimit },
-};
-
-static struct ratelimit_state bld_ratelimit;
-
-static unsigned int sysctl_sld_mitigate = 1;
-static DEFINE_SEMAPHORE(buslock_sem, 1);
-
-#ifdef CONFIG_PROC_SYSCTL
-static struct ctl_table sld_sysctls[] = {
- {
- .procname = "split_lock_mitigate",
- .data = &sysctl_sld_mitigate,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_douintvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-};
-
-static int __init sld_mitigate_sysctl_init(void)
-{
- register_sysctl_init("kernel", sld_sysctls);
- return 0;
-}
-
-late_initcall(sld_mitigate_sysctl_init);
-#endif
-
-static inline bool match_option(const char *arg, int arglen, const char *opt)
-{
- int len = strlen(opt), ratelimit;
-
- if (strncmp(arg, opt, len))
- return false;
-
- /*
- * Min ratelimit is 1 bus lock/sec.
- * Max ratelimit is 1000 bus locks/sec.
- */
- if (sscanf(arg, "ratelimit:%d", &ratelimit) == 1 &&
- ratelimit > 0 && ratelimit <= 1000) {
- ratelimit_state_init(&bld_ratelimit, HZ, ratelimit);
- ratelimit_set_flags(&bld_ratelimit, RATELIMIT_MSG_ON_RELEASE);
- return true;
- }
-
- return len == arglen;
-}
-
-static bool split_lock_verify_msr(bool on)
-{
- u64 ctrl, tmp;
-
- if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl))
- return false;
- if (on)
- ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
- else
- ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
- if (wrmsrl_safe(MSR_TEST_CTRL, ctrl))
- return false;
- rdmsrl(MSR_TEST_CTRL, tmp);
- return ctrl == tmp;
-}
-
-static void __init sld_state_setup(void)
-{
- enum split_lock_detect_state state = sld_warn;
- char arg[20];
- int i, ret;
-
- if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
- !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
- return;
-
- ret = cmdline_find_option(boot_command_line, "split_lock_detect",
- arg, sizeof(arg));
- if (ret >= 0) {
- for (i = 0; i < ARRAY_SIZE(sld_options); i++) {
- if (match_option(arg, ret, sld_options[i].option)) {
- state = sld_options[i].state;
- break;
- }
- }
- }
- sld_state = state;
-}
-
-static void __init __split_lock_setup(void)
-{
- if (!split_lock_verify_msr(false)) {
- pr_info("MSR access failed: Disabled\n");
- return;
- }
-
- rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
-
- if (!split_lock_verify_msr(true)) {
- pr_info("MSR access failed: Disabled\n");
- return;
- }
-
- /* Restore the MSR to its cached value. */
- wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
-
- setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
-}
-
-/*
- * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
- * is not implemented as one thread could undo the setting of the other
- * thread immediately after dropping the lock anyway.
- */
-static void sld_update_msr(bool on)
-{
- u64 test_ctrl_val = msr_test_ctrl_cache;
-
- if (on)
- test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
-
- wrmsrl(MSR_TEST_CTRL, test_ctrl_val);
-}
-
-static void split_lock_init(void)
-{
- /*
- * #DB for bus lock handles ratelimit and #AC for split lock is
- * disabled.
- */
- if (sld_state == sld_ratelimit) {
- split_lock_verify_msr(false);
- return;
- }
-
- if (cpu_model_supports_sld)
- split_lock_verify_msr(sld_state != sld_off);
-}
-
-static void __split_lock_reenable_unlock(struct work_struct *work)
-{
- sld_update_msr(true);
- up(&buslock_sem);
-}
-
-static DECLARE_DELAYED_WORK(sl_reenable_unlock, __split_lock_reenable_unlock);
-
-static void __split_lock_reenable(struct work_struct *work)
-{
- sld_update_msr(true);
-}
-static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable);
-
-/*
- * If a CPU goes offline with pending delayed work to re-enable split lock
- * detection then the delayed work will be executed on some other CPU. That
- * handles releasing the buslock_sem, but because it executes on a
- * different CPU probably won't re-enable split lock detection. This is a
- * problem on HT systems since the sibling CPU on the same core may then be
- * left running with split lock detection disabled.
- *
- * Unconditionally re-enable detection here.
- */
-static int splitlock_cpu_offline(unsigned int cpu)
-{
- sld_update_msr(true);
-
- return 0;
-}
-
-static void split_lock_warn(unsigned long ip)
-{
- struct delayed_work *work;
- int cpu;
-
- if (!current->reported_split_lock)
- pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
- current->comm, current->pid, ip);
- current->reported_split_lock = 1;
-
- if (sysctl_sld_mitigate) {
- /*
- * misery factor #1:
- * sleep 10ms before trying to execute split lock.
- */
- if (msleep_interruptible(10) > 0)
- return;
- /*
- * Misery factor #2:
- * only allow one buslocked disabled core at a time.
- */
- if (down_interruptible(&buslock_sem) == -EINTR)
- return;
- work = &sl_reenable_unlock;
- } else {
- work = &sl_reenable;
- }
-
- cpu = get_cpu();
- schedule_delayed_work_on(cpu, work, 2);
-
- /* Disable split lock detection on this CPU to make progress */
- sld_update_msr(false);
- put_cpu();
-}
-
-bool handle_guest_split_lock(unsigned long ip)
-{
- if (sld_state == sld_warn) {
- split_lock_warn(ip);
- return true;
- }
-
- pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
- current->comm, current->pid,
- sld_state == sld_fatal ? "fatal" : "bogus", ip);
-
- current->thread.error_code = 0;
- current->thread.trap_nr = X86_TRAP_AC;
- force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
- return false;
-}
-EXPORT_SYMBOL_GPL(handle_guest_split_lock);
-
-static void bus_lock_init(void)
-{
- u64 val;
-
- if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
- return;
-
- rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
-
- if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
- (sld_state == sld_warn || sld_state == sld_fatal)) ||
- sld_state == sld_off) {
- /*
- * Warn and fatal are handled by #AC for split lock if #AC for
- * split lock is supported.
- */
- val &= ~DEBUGCTLMSR_BUS_LOCK_DETECT;
- } else {
- val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
- }
-
- wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
-}
-
-bool handle_user_split_lock(struct pt_regs *regs, long error_code)
-{
- if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
- return false;
- split_lock_warn(regs->ip);
- return true;
-}
-
-void handle_bus_lock(struct pt_regs *regs)
-{
- switch (sld_state) {
- case sld_off:
- break;
- case sld_ratelimit:
- /* Enforce no more than bld_ratelimit bus locks/sec. */
- while (!__ratelimit(&bld_ratelimit))
- msleep(20);
- /* Warn on the bus lock. */
- fallthrough;
- case sld_warn:
- pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
- current->comm, current->pid, regs->ip);
- break;
- case sld_fatal:
- force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
- break;
- }
-}
-
-/*
- * CPU models that are known to have the per-core split-lock detection
- * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
- */
-static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0),
- X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, 0),
- {}
-};
-
-static void __init split_lock_setup(struct cpuinfo_x86 *c)
-{
- const struct x86_cpu_id *m;
- u64 ia32_core_caps;
-
- if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
- return;
-
- /* Check for CPUs that have support but do not enumerate it: */
- m = x86_match_cpu(split_lock_cpu_ids);
- if (m)
- goto supported;
-
- if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
- return;
-
- /*
- * Not all bits in MSR_IA32_CORE_CAPS are architectural, but
- * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set
- * it have split lock detection.
- */
- rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
- if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
- goto supported;
-
- /* CPU is not in the model list and does not have the MSR bit: */
- return;
-
-supported:
- cpu_model_supports_sld = true;
- __split_lock_setup();
-}
-
-static void sld_state_show(void)
-{
- if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
- !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
- return;
-
- switch (sld_state) {
- case sld_off:
- pr_info("disabled\n");
- break;
- case sld_warn:
- if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
- pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
- if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
- "x86/splitlock", NULL, splitlock_cpu_offline) < 0)
- pr_warn("No splitlock CPU offline handler\n");
- } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
- pr_info("#DB: warning on user-space bus_locks\n");
- }
- break;
- case sld_fatal:
- if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
- pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
- } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
- pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
- boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
- " from non-WB" : "");
- }
- break;
- case sld_ratelimit:
- if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
- pr_info("#DB: setting system wide bus lock rate limit to %u/sec\n", bld_ratelimit.burst);
- break;
- }
-}
-
-void __init sld_setup(struct cpuinfo_x86 *c)
-{
- split_lock_setup(c);
- sld_state_setup();
- sld_state_show();
-}
-
#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24
/**
@@ -1364,3 +907,18 @@ u8 get_this_hybrid_cpu_type(void)
return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT;
}
+
+/**
+ * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU
+ *
+ * Returns the uarch native ID [23:0] of a CPU in a hybrid processor.
+ * If the processor is not hybrid, returns 0.
+ */
+u32 get_this_hybrid_cpu_native_id(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return 0;
+
+ return cpuid_eax(0x0000001a) &
+ (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1);
+}
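
For readers unfamiliar with CPUID leaf 0x1A, the two hybrid helpers above split a single EAX value: bits 31:24 carry the core type and bits 23:0 the uarch native ID. Below is a minimal standalone sketch of that split using GCC's <cpuid.h> rather than kernel infrastructure; it is an illustration only and not part of the patch.

/* Illustration only -- a userspace probe of CPUID leaf 0x1A, not kernel code. */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

	/*
	 * A production check would first test the Hybrid flag,
	 * CPUID.07H:EDX[15], as the kernel does via X86_FEATURE_HYBRID_CPU.
	 */
	if (!__get_cpuid_count(0x1a, 0, &eax, &ebx, &ecx, &edx) || !eax) {
		printf("leaf 0x1A not populated (not hybrid, or unsupported)\n");
		return 0;
	}

	/* Same shift and mask as the two helpers in the hunk above. */
	printf("hybrid core type: %#x\n", eax >> 24);
	printf("uarch native ID:  %#x\n", eax & ((1u << 24) - 1));
	return 0;
}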
diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c
index e4c3ba91321c..30b1d63b97f3 100644
--- a/arch/x86/kernel/cpu/intel_epb.c
+++ b/arch/x86/kernel/cpu/intel_epb.c
@@ -204,12 +204,12 @@ static int intel_epb_offline(unsigned int cpu)
}
static const struct x86_cpu_id intel_epb_normal[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
- X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,
- ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ALDERLAKE_L,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
+ X86_MATCH_VFM(INTEL_RAPTORLAKE_P,
+ ENERGY_PERF_BIAS_NORMAL_POWERSAVE),
{}
};
@@ -237,4 +237,4 @@ err_out_online:
cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE);
return ret;
}
-subsys_initcall(intel_epb_init);
+late_initcall(intel_epb_init);
diff --git a/arch/x86/kernel/cpu/intel_pconfig.c b/arch/x86/kernel/cpu/intel_pconfig.c
deleted file mode 100644
index 0771a905b286..000000000000
--- a/arch/x86/kernel/cpu/intel_pconfig.c
+++ /dev/null
@@ -1,82 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Intel PCONFIG instruction support.
- *
- * Copyright (C) 2017 Intel Corporation
- *
- * Author:
- * Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
- */
-
-#include <asm/cpufeature.h>
-#include <asm/intel_pconfig.h>
-
-#define PCONFIG_CPUID 0x1b
-
-#define PCONFIG_CPUID_SUBLEAF_MASK ((1 << 12) - 1)
-
-/* Subleaf type (EAX) for PCONFIG CPUID leaf (0x1B) */
-enum {
- PCONFIG_CPUID_SUBLEAF_INVALID = 0,
- PCONFIG_CPUID_SUBLEAF_TARGETID = 1,
-};
-
-/* Bitmask of supported targets */
-static u64 targets_supported __read_mostly;
-
-int pconfig_target_supported(enum pconfig_target target)
-{
- /*
- * We would need to re-think the implementation once we get > 64
- * PCONFIG targets. Spec allows up to 2^32 targets.
- */
- BUILD_BUG_ON(PCONFIG_TARGET_NR >= 64);
-
- if (WARN_ON_ONCE(target >= 64))
- return 0;
- return targets_supported & (1ULL << target);
-}
-
-static int __init intel_pconfig_init(void)
-{
- int subleaf;
-
- if (!boot_cpu_has(X86_FEATURE_PCONFIG))
- return 0;
-
- /*
- * Scan subleafs of PCONFIG CPUID leaf.
- *
- * Subleafs of the same type need not to be consecutive.
- *
- * Stop on the first invalid subleaf type. All subleafs after the first
- * invalid are invalid too.
- */
- for (subleaf = 0; subleaf < INT_MAX; subleaf++) {
- struct cpuid_regs regs;
-
- cpuid_count(PCONFIG_CPUID, subleaf,
- &regs.eax, &regs.ebx, &regs.ecx, &regs.edx);
-
- switch (regs.eax & PCONFIG_CPUID_SUBLEAF_MASK) {
- case PCONFIG_CPUID_SUBLEAF_INVALID:
- /* Stop on the first invalid subleaf */
- goto out;
- case PCONFIG_CPUID_SUBLEAF_TARGETID:
- /* Mark supported PCONFIG targets */
- if (regs.ebx < 64)
- targets_supported |= (1ULL << regs.ebx);
- if (regs.ecx < 64)
- targets_supported |= (1ULL << regs.ecx);
- if (regs.edx < 64)
- targets_supported |= (1ULL << regs.edx);
- break;
- default:
- /* Unknown CPUID.PCONFIG subleaf: ignore */
- break;
- }
- }
-out:
- return 0;
-}
-arch_initcall(intel_pconfig_init);
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index ad6776081e60..4f3c65429f82 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -6,7 +6,7 @@
#include <linux/slab.h>
/**
- * x86_match_cpu - match current CPU again an array of x86_cpu_ids
+ * x86_match_cpu - match current CPU against an array of x86_cpu_ids
* @match: Pointer to array of x86_cpu_ids. Last entry terminated with
* {}.
*
@@ -17,8 +17,7 @@
*
* A typical table entry would be to match a specific CPU
*
- * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL,
- * X86_FEATURE_ANY, NULL);
+ * X86_MATCH_VFM_FEATURE(INTEL_BROADWELL, X86_FEATURE_ANY, NULL);
*
* Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
* %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
@@ -26,7 +25,7 @@
* asm/cpu_device_id.h contains a set of useful macros which are shortcuts
* for various common selections. The above can be shortened to:
*
- * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL);
+ * X86_MATCH_VFM(INTEL_BROADWELL, NULL);
*
* Arrays used to match for this should also be declared using
* MODULE_DEVICE_TABLE(x86cpu, ...)
@@ -39,9 +38,7 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
const struct x86_cpu_id *m;
struct cpuinfo_x86 *c = &boot_cpu_data;
- for (m = match;
- m->vendor | m->family | m->model | m->steppings | m->feature;
- m++) {
+ for (m = match; m->flags & X86_CPU_ID_FLAG_ENTRY_VALID; m++) {
if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
continue;
if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
@@ -59,33 +56,13 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
}
EXPORT_SYMBOL(x86_match_cpu);
-static const struct x86_cpu_desc *
-x86_match_cpu_with_stepping(const struct x86_cpu_desc *match)
+bool x86_match_min_microcode_rev(const struct x86_cpu_id *table)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
- const struct x86_cpu_desc *m;
-
- for (m = match; m->x86_family | m->x86_model; m++) {
- if (c->x86_vendor != m->x86_vendor)
- continue;
- if (c->x86 != m->x86_family)
- continue;
- if (c->x86_model != m->x86_model)
- continue;
- if (c->x86_stepping != m->x86_stepping)
- continue;
- return m;
- }
- return NULL;
-}
-
-bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table)
-{
- const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table);
+ const struct x86_cpu_id *res = x86_match_cpu(table);
- if (!res || res->x86_microcode_rev > boot_cpu_data.microcode)
+ if (!res || res->driver_data > boot_cpu_data.microcode)
return false;
return true;
}
-EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev);
+EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev);
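
A hypothetical caller-side sketch of the renamed helper (not taken from this patch): the minimum microcode revision now rides in x86_cpu_id::driver_data and the lookup reuses x86_match_cpu(). The table entry, revision value, and function names below are placeholders; real tables may need stepping-specific entries where the required revision differs per stepping.

/* Placeholder VFM and revision values, for illustration only. */
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

static const struct x86_cpu_id quirk_min_ucode[] = {
	X86_MATCH_VFM(INTEL_SANDYBRIDGE_X, 0x718),	/* driver_data = min microcode rev */
	{}
};

static bool quirk_microcode_ok(void)
{
	/* True only if the boot CPU matches and its microcode is at least driver_data. */
	return x86_match_min_microcode_rev(quirk_min_ucode);
}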
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index f3517b8a8e91..1075a90141da 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -4,8 +4,6 @@
*
* Written by Jacob Shin - AMD, Inc.
* Maintained by: Borislav Petkov <bp@alien8.de>
- *
- * All MC4_MISCi registers are shared between cores on a node.
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
@@ -20,7 +18,6 @@
#include <linux/smp.h>
#include <linux/string.h>
-#include <asm/amd_nb.h>
#include <asm/traps.h>
#include <asm/apic.h>
#include <asm/mce.h>
@@ -87,42 +84,40 @@ struct smca_bank {
static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
-struct smca_bank_name {
- const char *name; /* Short name for sysfs */
- const char *long_name; /* Long name for pretty-printing */
-};
-
-static struct smca_bank_name smca_names[] = {
- [SMCA_LS ... SMCA_LS_V2] = { "load_store", "Load Store Unit" },
- [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
- [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
- [SMCA_DE] = { "decode_unit", "Decode Unit" },
- [SMCA_RESERVED] = { "reserved", "Reserved" },
- [SMCA_EX] = { "execution_unit", "Execution Unit" },
- [SMCA_FP] = { "floating_point", "Floating Point Unit" },
- [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
- [SMCA_CS ... SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
- [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+static const char * const smca_names[] = {
+ [SMCA_LS ... SMCA_LS_V2] = "load_store",
+ [SMCA_IF] = "insn_fetch",
+ [SMCA_L2_CACHE] = "l2_cache",
+ [SMCA_DE] = "decode_unit",
+ [SMCA_RESERVED] = "reserved",
+ [SMCA_EX] = "execution_unit",
+ [SMCA_FP] = "floating_point",
+ [SMCA_L3_CACHE] = "l3_cache",
+ [SMCA_CS ... SMCA_CS_V2] = "coherent_slave",
+ [SMCA_PIE] = "pie",
/* UMC v2 is separate because both of them can exist in a single system. */
- [SMCA_UMC] = { "umc", "Unified Memory Controller" },
- [SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" },
- [SMCA_PB] = { "param_block", "Parameter Block" },
- [SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
- [SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
- [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
- [SMCA_MPDMA] = { "mpdma", "MPDMA Unit" },
- [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
- [SMCA_PCIE ... SMCA_PCIE_V2] = { "pcie", "PCI Express Unit" },
- [SMCA_XGMI_PCS] = { "xgmi_pcs", "Ext Global Memory Interconnect PCS Unit" },
- [SMCA_NBIF] = { "nbif", "NBIF Unit" },
- [SMCA_SHUB] = { "shub", "System Hub Unit" },
- [SMCA_SATA] = { "sata", "SATA Unit" },
- [SMCA_USB] = { "usb", "USB Unit" },
- [SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" },
- [SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
- [SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
- [SMCA_GMI_PHY] = { "gmi_phy", "Global Memory Interconnect PHY Unit" },
+ [SMCA_UMC] = "umc",
+ [SMCA_UMC_V2] = "umc_v2",
+ [SMCA_MA_LLC] = "ma_llc",
+ [SMCA_PB] = "param_block",
+ [SMCA_PSP ... SMCA_PSP_V2] = "psp",
+ [SMCA_SMU ... SMCA_SMU_V2] = "smu",
+ [SMCA_MP5] = "mp5",
+ [SMCA_MPDMA] = "mpdma",
+ [SMCA_NBIO] = "nbio",
+ [SMCA_PCIE ... SMCA_PCIE_V2] = "pcie",
+ [SMCA_XGMI_PCS] = "xgmi_pcs",
+ [SMCA_NBIF] = "nbif",
+ [SMCA_SHUB] = "shub",
+ [SMCA_SATA] = "sata",
+ [SMCA_USB] = "usb",
+ [SMCA_USR_DP] = "usr_dp",
+ [SMCA_USR_CP] = "usr_cp",
+ [SMCA_GMI_PCS] = "gmi_pcs",
+ [SMCA_XGMI_PHY] = "xgmi_phy",
+ [SMCA_WAFL_PHY] = "wafl_phy",
+ [SMCA_GMI_PHY] = "gmi_phy",
};
static const char *smca_get_name(enum smca_bank_types t)
@@ -130,17 +125,8 @@ static const char *smca_get_name(enum smca_bank_types t)
if (t >= N_SMCA_BANK_TYPES)
return NULL;
- return smca_names[t].name;
-}
-
-const char *smca_get_long_name(enum smca_bank_types t)
-{
- if (t >= N_SMCA_BANK_TYPES)
- return NULL;
-
- return smca_names[t].long_name;
+ return smca_names[t];
}
-EXPORT_SYMBOL_GPL(smca_get_long_name);
enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
{
@@ -178,6 +164,7 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
{ SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
{ SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
/* Unified Memory Controller MCA type */
{ SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
@@ -212,6 +199,8 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
{ SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
{ SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
{ SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
@@ -229,6 +218,32 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
#define MAX_MCATYPE_NAME_LEN 30
static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
+struct threshold_block {
+ /* This block's number within its bank. */
+ unsigned int block;
+ /* MCA bank number that contains this block. */
+ unsigned int bank;
+ /* CPU which controls this block's MCA bank. */
+ unsigned int cpu;
+ /* MCA_MISC MSR address for this block. */
+ u32 address;
+ /* Enable/Disable APIC interrupt. */
+ bool interrupt_enable;
+ /* Bank can generate an interrupt. */
+ bool interrupt_capable;
+ /* Value upon which threshold interrupt is generated. */
+ u16 threshold_limit;
+ /* sysfs object */
+ struct kobject kobj;
+ /* List of threshold blocks within this block's MCA bank. */
+ struct list_head miscj;
+};
+
+struct threshold_bank {
+ struct kobject *kobj;
+ struct threshold_block *blocks;
+};
+
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
/*
@@ -341,19 +356,6 @@ struct thresh_restart {
u16 old_limit;
};
-static inline bool is_shared_bank(int bank)
-{
- /*
- * Scalable MCA provides for only one core to have access to the MSRs of
- * a shared bank.
- */
- if (mce_flags.smca)
- return false;
-
- /* Bank 4 is for northbridge reporting and is thus shared */
- return (bank == 4);
-}
-
static const char *bank4_names(const struct threshold_block *b)
{
switch (b->address) {
@@ -389,7 +391,7 @@ static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
return msr_high_bits & BIT(28);
}
-static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+static bool lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
{
int msr = (hi & MASK_LVTOFF_HI) >> 20;
@@ -397,7 +399,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
b->bank, b->block, b->address, hi, lo);
- return 0;
+ return false;
}
if (apic != msr) {
@@ -407,15 +409,15 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
* was set is reserved. Return early here:
*/
if (mce_flags.smca)
- return 0;
+ return false;
pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
"for bank %d, block %d (MSR%08X=0x%x%08x)\n",
b->cpu, apic, b->bank, b->block, b->address, hi, lo);
- return 0;
+ return false;
}
- return 1;
+ return true;
};
/* Reprogram MCx_MISC MSR behind this threshold bank. */
@@ -786,29 +788,33 @@ bool amd_mce_usable_address(struct mce *m)
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
{
- struct mce m;
+ struct mce_hw_err err;
+ struct mce *m = &err.m;
- mce_setup(&m);
+ mce_prep_record(&err);
- m.status = status;
- m.misc = misc;
- m.bank = bank;
- m.tsc = rdtsc();
+ m->status = status;
+ m->misc = misc;
+ m->bank = bank;
+ m->tsc = rdtsc();
- if (m.status & MCI_STATUS_ADDRV) {
- m.addr = addr;
+ if (m->status & MCI_STATUS_ADDRV) {
+ m->addr = addr;
- smca_extract_err_addr(&m);
+ smca_extract_err_addr(m);
}
if (mce_flags.smca) {
- rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
+ rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid);
- if (m.status & MCI_STATUS_SYNDV)
- rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
+ if (m->status & MCI_STATUS_SYNDV) {
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd);
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1);
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2);
+ }
}
- mce_log(&m);
+ mce_log(&err);
}
DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
@@ -1202,35 +1208,10 @@ out_free:
return err;
}
-static int __threshold_add_blocks(struct threshold_bank *b)
-{
- struct list_head *head = &b->blocks->miscj;
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
- int err = 0;
-
- err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
- if (err)
- return err;
-
- list_for_each_entry_safe(pos, tmp, head, miscj) {
-
- err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
- if (err) {
- list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
- kobject_del(&pos->kobj);
-
- return err;
- }
- }
- return err;
-}
-
static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
unsigned int bank)
{
struct device *dev = this_cpu_read(mce_device);
- struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
const char *name = get_name(cpu, bank, NULL);
int err = 0;
@@ -1238,26 +1219,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
if (!dev)
return -ENODEV;
- if (is_shared_bank(bank)) {
- nb = node_to_amd_nb(topology_die_id(cpu));
-
- /* threshold descriptor already initialized on this node? */
- if (nb && nb->bank4) {
- /* yes, use it */
- b = nb->bank4;
- err = kobject_add(b->kobj, &dev->kobj, name);
- if (err)
- goto out;
-
- bp[bank] = b;
- refcount_inc(&b->cpus);
-
- err = __threshold_add_blocks(b);
-
- goto out;
- }
- }
-
b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
if (!b) {
err = -ENOMEM;
@@ -1271,17 +1232,6 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
goto out_free;
}
- if (is_shared_bank(bank)) {
- b->shared = 1;
- refcount_set(&b->cpus, 1);
-
- /* nb is already initialized, see above */
- if (nb) {
- WARN_ON(nb->bank4);
- nb->bank4 = b;
- }
- }
-
err = allocate_threshold_blocks(cpu, b, bank, 0, mca_msr_reg(bank, MCA_MISC));
if (err)
goto out_kobj;
@@ -1314,40 +1264,11 @@ static void deallocate_threshold_blocks(struct threshold_bank *bank)
kobject_put(&bank->blocks->kobj);
}
-static void __threshold_remove_blocks(struct threshold_bank *b)
-{
- struct threshold_block *pos = NULL;
- struct threshold_block *tmp = NULL;
-
- kobject_put(b->kobj);
-
- list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
- kobject_put(b->kobj);
-}
-
static void threshold_remove_bank(struct threshold_bank *bank)
{
- struct amd_northbridge *nb;
-
if (!bank->blocks)
goto out_free;
- if (!bank->shared)
- goto out_dealloc;
-
- if (!refcount_dec_and_test(&bank->cpus)) {
- __threshold_remove_blocks(bank);
- return;
- } else {
- /*
- * The last CPU on this node using the shared bank is going
- * away, remove that bank now.
- */
- nb = node_to_amd_nb(topology_die_id(smp_processor_id()));
- nb->bank4 = NULL;
- }
-
-out_dealloc:
deallocate_threshold_blocks(bank);
out_free:
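
The amd.c hunk above and the apei.c/core.c hunks below log through struct mce_hw_err instead of a bare struct mce. Its definition lives in arch/x86/include/asm/mce.h and is not part of this diff; the sketch below is the assumed shape, inferred from the accesses in this series (err.m, err.vendor.amd.synd1/synd2, to_mce_hw_err()).

/* Assumed shape, inferred from usage in this diff -- see asm/mce.h for the real one. */
struct mce_hw_err {
	struct mce m;				/* architectural record, as before */

	union vendor_info {
		struct {
			u64 synd1;		/* MCA_SYND1 MSR contents */
			u64 synd2;		/* MCA_SYND2 MSR contents */
		} amd;
	} vendor;				/* vendor-specific supplemental data */
};

#define to_mce_hw_err(mc) container_of(mc, struct mce_hw_err, m)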
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 7f7309ff67d0..0a89947e47bc 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -28,7 +28,8 @@
void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
{
- struct mce m;
+ struct mce_hw_err err;
+ struct mce *m;
int lsb;
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
@@ -44,30 +45,33 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
else
lsb = PAGE_SHIFT;
- mce_setup(&m);
- m.bank = -1;
+ mce_prep_record(&err);
+ m = &err.m;
+ m->bank = -1;
/* Fake a memory read error with unknown channel */
- m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
- m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb;
+ m->status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
+ m->misc = (MCI_MISC_ADDR_PHYS << 6) | lsb;
if (severity >= GHES_SEV_RECOVERABLE)
- m.status |= MCI_STATUS_UC;
+ m->status |= MCI_STATUS_UC;
if (severity >= GHES_SEV_PANIC) {
- m.status |= MCI_STATUS_PCC;
- m.tsc = rdtsc();
+ m->status |= MCI_STATUS_PCC;
+ m->tsc = rdtsc();
}
- m.addr = mem_err->physical_addr;
- mce_log(&m);
+ m->addr = mem_err->physical_addr;
+ mce_log(&err);
}
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
{
const u64 *i_mce = ((const u64 *) (ctx_info + 1));
- unsigned int cpu;
- struct mce m;
+ unsigned int cpu, num_regs;
+ bool apicid_found = false;
+ struct mce_hw_err err;
+ struct mce *m;
if (!boot_cpu_has(X86_FEATURE_SMCA))
return -EINVAL;
@@ -85,41 +89,86 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
return -EINVAL;
/*
- * The register array size must be large enough to include all the
- * SMCA registers which need to be extracted.
- *
* The number of registers in the register array is determined by
* Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2.
- * The register layout is fixed and currently the raw data in the
- * register array includes 6 SMCA registers which the kernel can
- * extract.
+ * Sanity-check the register array size.
*/
- if (ctx_info->reg_arr_size < 48)
+ num_regs = ctx_info->reg_arr_size >> 3;
+ if (!num_regs)
return -EINVAL;
- mce_setup(&m);
-
- m.extcpu = -1;
- m.socketid = -1;
-
for_each_possible_cpu(cpu) {
if (cpu_data(cpu).topo.initial_apicid == lapic_id) {
- m.extcpu = cpu;
- m.socketid = cpu_data(m.extcpu).topo.pkg_id;
+ apicid_found = true;
break;
}
}
- m.apicid = lapic_id;
- m.bank = (ctx_info->msr_addr >> 4) & 0xFF;
- m.status = *i_mce;
- m.addr = *(i_mce + 1);
- m.misc = *(i_mce + 2);
- /* Skipping MCA_CONFIG */
- m.ipid = *(i_mce + 4);
- m.synd = *(i_mce + 5);
+ if (!apicid_found)
+ return -EINVAL;
+
+ m = &err.m;
+ memset(&err, 0, sizeof(struct mce_hw_err));
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(cpu, m);
+
+ m->bank = (ctx_info->msr_addr >> 4) & 0xFF;
+
+ /*
+ * The SMCA register layout is fixed and includes 16 registers.
+ * The end of the array may be variable, but the beginning is known.
+ * Cap the number of registers to expected max (15).
+ */
+ if (num_regs > 15)
+ num_regs = 15;
+
+ switch (num_regs) {
+ /* MCA_SYND2 */
+ case 15:
+ err.vendor.amd.synd2 = *(i_mce + 14);
+ fallthrough;
+ /* MCA_SYND1 */
+ case 14:
+ err.vendor.amd.synd1 = *(i_mce + 13);
+ fallthrough;
+ /* MCA_MISC4 */
+ case 13:
+ /* MCA_MISC3 */
+ case 12:
+ /* MCA_MISC2 */
+ case 11:
+ /* MCA_MISC1 */
+ case 10:
+ /* MCA_DEADDR */
+ case 9:
+ /* MCA_DESTAT */
+ case 8:
+ /* reserved */
+ case 7:
+ /* MCA_SYND */
+ case 6:
+ m->synd = *(i_mce + 5);
+ fallthrough;
+ /* MCA_IPID */
+ case 5:
+ m->ipid = *(i_mce + 4);
+ fallthrough;
+ /* MCA_CONFIG */
+ case 4:
+ /* MCA_MISC0 */
+ case 3:
+ m->misc = *(i_mce + 2);
+ fallthrough;
+ /* MCA_ADDR */
+ case 2:
+ m->addr = *(i_mce + 1);
+ fallthrough;
+ /* MCA_STATUS */
+ case 1:
+ m->status = *i_mce;
+ }
- mce_log(&m);
+ mce_log(&err);
return 0;
}
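
The descending switch in apei_smca_report_x86_error() is a general pattern: enter at the number of registers actually present and fall through, so a variable-length prefix of a fixed layout is copied without per-field bounds checks. A self-contained illustration of the same idea follows (generic C, not kernel code; the field names are made up for the example).

/* Generic illustration of the descending-switch extraction pattern. */
#include <stdio.h>
#include <stdint.h>

struct regs { uint64_t status, addr, misc, synd; };

static void extract(struct regs *out, const uint64_t *raw, unsigned int nr)
{
	if (nr > 4)
		nr = 4;		/* cap to the known layout, like the 15-register cap above */

	switch (nr) {
	case 4:
		out->synd = raw[3];
		/* fall through */
	case 3:
		out->misc = raw[2];
		/* fall through */
	case 2:
		out->addr = raw[1];
		/* fall through */
	case 1:
		out->status = raw[0];
	}
}

int main(void)
{
	const uint64_t raw[3] = { 0x9400000000000000ULL, 0x1234000ULL, 0x86ULL };
	struct regs r = { 0 };

	extract(&r, raw, 3);	/* only status, addr and misc are present */
	printf("status=%#llx addr=%#llx misc=%#llx synd=%#llx\n",
	       (unsigned long long)r.status, (unsigned long long)r.addr,
	       (unsigned long long)r.misc, (unsigned long long)r.synd);
	return 0;
}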
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 7b397370b4d6..0dc00c9894c7 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -44,14 +44,17 @@
#include <linux/sync_core.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>
+#include <linux/kexec.h>
-#include <asm/intel-family.h>
+#include <asm/fred.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/reboot.h>
+#include <asm/tdx.h>
#include "internal.h"
@@ -85,7 +88,7 @@ struct mca_config mca_cfg __read_mostly = {
.monarch_timeout = -1
};
-static DEFINE_PER_CPU(struct mce, mces_seen);
+static DEFINE_PER_CPU(struct mce_hw_err, hw_errs_seen);
static unsigned long mce_need_notify;
/*
@@ -114,28 +117,41 @@ static struct irq_work mce_irq_work;
*/
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
-/* Do initial initialization of a struct mce */
-void mce_setup(struct mce *m)
+void mce_prep_record_common(struct mce *m)
{
- memset(m, 0, sizeof(struct mce));
- m->cpu = m->extcpu = smp_processor_id();
+ m->cpuid = cpuid_eax(1);
+ m->cpuvendor = boot_cpu_data.x86_vendor;
+ m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
/* need the internal __ version to avoid deadlocks */
- m->time = __ktime_get_real_seconds();
- m->cpuvendor = boot_cpu_data.x86_vendor;
- m->cpuid = cpuid_eax(1);
- m->socketid = cpu_data(m->extcpu).topo.pkg_id;
- m->apicid = cpu_data(m->extcpu).topo.initial_apicid;
- m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
- m->ppin = cpu_data(m->extcpu).ppin;
- m->microcode = boot_cpu_data.microcode;
+ m->time = __ktime_get_real_seconds();
+}
+
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m)
+{
+ m->cpu = cpu;
+ m->extcpu = cpu;
+ m->apicid = cpu_data(cpu).topo.initial_apicid;
+ m->microcode = cpu_data(cpu).microcode;
+ m->ppin = topology_ppin(cpu);
+ m->socketid = topology_physical_package_id(cpu);
+}
+
+/* Do the initial setup of a struct mce_hw_err */
+void mce_prep_record(struct mce_hw_err *err)
+{
+ struct mce *m = &err->m;
+
+ memset(err, 0, sizeof(struct mce_hw_err));
+ mce_prep_record_common(m);
+ mce_prep_record_per_cpu(smp_processor_id(), m);
}
DEFINE_PER_CPU(struct mce, injectm);
EXPORT_PER_CPU_SYMBOL_GPL(injectm);
-void mce_log(struct mce *m)
+void mce_log(struct mce_hw_err *err)
{
- if (!mce_gen_pool_add(m))
+ if (mce_gen_pool_add(err))
irq_work_queue(&mce_irq_work);
}
EXPORT_SYMBOL_GPL(mce_log);
@@ -156,8 +172,10 @@ void mce_unregister_decode_chain(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
-static void __print_mce(struct mce *m)
+static void __print_mce(struct mce_hw_err *err)
{
+ struct mce *m = &err->m;
+
pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
m->extcpu,
(m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
@@ -184,6 +202,10 @@ static void __print_mce(struct mce *m)
if (mce_flags.smca) {
if (m->synd)
pr_cont("SYND %llx ", m->synd);
+ if (err->vendor.amd.synd1)
+ pr_cont("SYND1 %llx ", err->vendor.amd.synd1);
+ if (err->vendor.amd.synd2)
+ pr_cont("SYND2 %llx ", err->vendor.amd.synd2);
if (m->ipid)
pr_cont("IPID %llx ", m->ipid);
}
@@ -199,9 +221,11 @@ static void __print_mce(struct mce *m)
m->microcode);
}
-static void print_mce(struct mce *m)
+static void print_mce(struct mce_hw_err *err)
{
- __print_mce(m);
+ struct mce *m = &err->m;
+
+ __print_mce(err);
if (m->cpuvendor != X86_VENDOR_AMD && m->cpuvendor != X86_VENDOR_HYGON)
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
@@ -228,11 +252,20 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}
-static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
+static const char *mce_dump_aux_info(struct mce *m)
+{
+ if (boot_cpu_has_bug(X86_BUG_TDX_PW_MCE))
+ return tdx_dump_mce_info(m);
+
+ return NULL;
+}
+
+static noinstr void mce_panic(const char *msg, struct mce_hw_err *final, char *exp)
{
struct llist_node *pending;
struct mce_evt_llist *l;
int apei_err = 0;
+ const char *memmsg;
/*
* Allow instrumentation around external facilities usage. Not that it
@@ -258,20 +291,22 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
pending = mce_gen_pool_prepare_records();
/* First print corrected ones that are still unlogged */
llist_for_each_entry(l, pending, llnode) {
- struct mce *m = &l->mce;
+ struct mce_hw_err *err = &l->err;
+ struct mce *m = &err->m;
if (!(m->status & MCI_STATUS_UC)) {
- print_mce(m);
+ print_mce(err);
if (!apei_err)
apei_err = apei_write_mce(m);
}
}
/* Now print uncorrected but with the final one last */
llist_for_each_entry(l, pending, llnode) {
- struct mce *m = &l->mce;
+ struct mce_hw_err *err = &l->err;
+ struct mce *m = &err->m;
if (!(m->status & MCI_STATUS_UC))
continue;
- if (!final || mce_cmp(m, final)) {
- print_mce(m);
+ if (!final || mce_cmp(m, &final->m)) {
+ print_mce(err);
if (!apei_err)
apei_err = apei_write_mce(m);
}
@@ -279,13 +314,33 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
if (final) {
print_mce(final);
if (!apei_err)
- apei_err = apei_write_mce(final);
+ apei_err = apei_write_mce(&final->m);
}
if (exp)
pr_emerg(HW_ERR "Machine check: %s\n", exp);
+
+ memmsg = mce_dump_aux_info(&final->m);
+ if (memmsg)
+ pr_emerg(HW_ERR "Machine check: %s\n", memmsg);
+
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mca_cfg.panic_timeout;
+
+ /*
+ * Kdump skips the poisoned page in order to avoid
+ * touching the error bits again. Poison the page even
+ * if the error is fatal and the machine is about to
+ * panic.
+ */
+ if (kexec_crash_loaded()) {
+ if (final && (final->m.status & MCI_STATUS_ADDRV)) {
+ struct page *p;
+ p = pfn_to_online_page(final->m.addr >> PAGE_SHIFT);
+ if (p)
+ SetPageHWPoison(p);
+ }
+ }
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -401,16 +456,18 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v)
* check into our "mce" struct so that we can use it later to assess
* the severity of the problem as we read per-bank specific details.
*/
-static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
+static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs)
{
+ struct mce *m;
/*
- * Enable instrumentation around mce_setup() which calls external
+ * Enable instrumentation around mce_prep_record() which calls external
* facilities.
*/
instrumentation_begin();
- mce_setup(m);
+ mce_prep_record(err);
instrumentation_end();
+ m = &err->m;
m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
if (regs) {
/*
@@ -435,10 +492,10 @@ static noinstr void mce_gather_info(struct mce *m, struct pt_regs *regs)
}
}
-int mce_available(struct cpuinfo_x86 *c)
+bool mce_available(struct cpuinfo_x86 *c)
{
if (mca_cfg.disabled)
- return 0;
+ return false;
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
@@ -530,13 +587,13 @@ EXPORT_SYMBOL_GPL(mce_is_correctable);
static int mce_early_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
- struct mce *m = (struct mce *)data;
+ struct mce_hw_err *err = to_mce_hw_err(data);
- if (!m)
+ if (!err)
return NOTIFY_DONE;
/* Emit the trace record: */
- trace_mce_record(m);
+ trace_mce_record(err);
set_bit(0, &mce_need_notify);
@@ -580,13 +637,13 @@ static struct notifier_block mce_uc_nb = {
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
- struct mce *m = (struct mce *)data;
+ struct mce_hw_err *err = to_mce_hw_err(data);
- if (!m)
+ if (!err)
return NOTIFY_DONE;
- if (mca_cfg.print_all || !m->kflags)
- __print_mce(m);
+ if (mca_cfg.print_all || !(err->m.kflags))
+ __print_mce(err);
return NOTIFY_DONE;
}
@@ -600,8 +657,10 @@ static struct notifier_block mce_default_nb = {
/*
* Read ADDR and MISC registers.
*/
-static noinstr void mce_read_aux(struct mce *m, int i)
+static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
{
+ struct mce *m = &err->m;
+
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC));
@@ -623,8 +682,11 @@ static noinstr void mce_read_aux(struct mce *m, int i)
if (mce_flags.smca) {
m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
- if (m->status & MCI_STATUS_SYNDV)
+ if (m->status & MCI_STATUS_SYNDV) {
m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
+ err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i));
+ err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i));
+ }
}
}
@@ -645,40 +707,51 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
* is already totally * confused. In this case it's likely it will
* not fully execute the machine check handler either.
*/
-bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
- bool error_seen = false;
- struct mce m;
+ struct mce_hw_err err;
+ struct mce *m;
int i;
this_cpu_inc(mce_poll_count);
- mce_gather_info(&m, NULL);
+ mce_gather_info(&err, NULL);
+ m = &err.m;
if (flags & MCP_TIMESTAMP)
- m.tsc = rdtsc();
+ m->tsc = rdtsc();
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
- m.misc = 0;
- m.addr = 0;
- m.bank = i;
+ m->misc = 0;
+ m->addr = 0;
+ m->bank = i;
barrier();
- m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+ m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
+
+ /*
+ * Update storm tracking here, before checking for the
+ * MCI_STATUS_VAL bit. Valid corrected errors count
+ * towards declaring, or maintaining, storm status. No
+ * error in a bank counts towards avoiding, or ending,
+ * storm status.
+ */
+ if (!mca_cfg.cmci_disabled)
+ mce_track_storm(m);
/* If this entry is not valid, ignore it */
- if (!(m.status & MCI_STATUS_VAL))
+ if (!(m->status & MCI_STATUS_VAL))
continue;
/*
* If we are logging everything (at CPU online) or this
* is a corrected error, then we must log it.
*/
- if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC))
+ if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
goto log_it;
/*
@@ -688,20 +761,20 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* everything else.
*/
if (!mca_cfg.ser) {
- if (m.status & MCI_STATUS_UC)
+ if (m->status & MCI_STATUS_UC)
continue;
goto log_it;
}
/* Log "not enabled" (speculative) errors */
- if (!(m.status & MCI_STATUS_EN))
+ if (!(m->status & MCI_STATUS_EN))
goto log_it;
/*
* Log UCNA (SDM: 15.6.3 "UCR Error Classification")
* UC == 1 && PCC == 0 && S == 0
*/
- if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S))
+ if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S))
goto log_it;
/*
@@ -712,25 +785,23 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
continue;
log_it:
- error_seen = true;
-
if (flags & MCP_DONTLOG)
goto clear_it;
- mce_read_aux(&m, i);
- m.severity = mce_severity(&m, NULL, NULL, false);
+ mce_read_aux(&err, i);
+ m->severity = mce_severity(m, NULL, NULL, false);
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
+ if (mca_cfg.dont_log_ce && !mce_usable_address(m))
goto clear_it;
if (flags & MCP_QUEUE_LOG)
- mce_gen_pool_add(&m);
+ mce_gen_pool_add(&err);
else
- mce_log(&m);
+ mce_log(&err);
clear_it:
/*
@@ -745,8 +816,6 @@ clear_it:
*/
sync_core();
-
- return error_seen;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
@@ -856,9 +925,10 @@ static __always_inline void quirk_zen_ifu(int bank, struct mce *m, struct pt_reg
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
-static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, unsigned long *validp,
struct pt_regs *regs)
{
+ struct mce *m = &err->m;
char *tmp = *msg;
int i;
@@ -876,7 +946,7 @@ static __always_inline int mce_no_way_out(struct mce *m, char **msg, unsigned lo
m->bank = i;
if (mce_severity(m, regs, &tmp, true) >= MCE_PANIC_SEVERITY) {
- mce_read_aux(m, i);
+ mce_read_aux(err, i);
*msg = tmp;
return 1;
}
@@ -967,10 +1037,11 @@ out:
*/
static void mce_reign(void)
{
- int cpu;
+ struct mce_hw_err *err = NULL;
struct mce *m = NULL;
int global_worst = 0;
char *msg = NULL;
+ int cpu;
/*
* This CPU is the Monarch and the other CPUs have run
@@ -978,11 +1049,13 @@ static void mce_reign(void)
* Grade the severity of the errors of all the CPUs.
*/
for_each_possible_cpu(cpu) {
- struct mce *mtmp = &per_cpu(mces_seen, cpu);
+ struct mce_hw_err *etmp = &per_cpu(hw_errs_seen, cpu);
+ struct mce *mtmp = &etmp->m;
if (mtmp->severity > global_worst) {
global_worst = mtmp->severity;
- m = &per_cpu(mces_seen, cpu);
+ err = &per_cpu(hw_errs_seen, cpu);
+ m = &err->m;
}
}
@@ -994,7 +1067,7 @@ static void mce_reign(void)
if (m && global_worst >= MCE_PANIC_SEVERITY) {
/* call mce_severity() to get "msg" for panic */
mce_severity(m, NULL, &msg, true);
- mce_panic("Fatal machine check", m, msg);
+ mce_panic("Fatal machine check", err, msg);
}
/*
@@ -1011,11 +1084,11 @@ static void mce_reign(void)
mce_panic("Fatal machine check from unknown source", NULL, NULL);
/*
- * Now clear all the mces_seen so that they don't reappear on
+ * Now clear all the hw_errs_seen so that they don't reappear on
* the next mce.
*/
for_each_possible_cpu(cpu)
- memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+ memset(&per_cpu(hw_errs_seen, cpu), 0, sizeof(struct mce_hw_err));
}
static atomic_t global_nwo;
@@ -1219,13 +1292,14 @@ static noinstr bool mce_check_crashing_cpu(void)
}
static __always_inline int
-__mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
- unsigned long *toclear, unsigned long *valid_banks, int no_way_out,
- int *worst)
+__mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs,
+ struct mce_hw_err *final, unsigned long *toclear,
+ unsigned long *valid_banks, int no_way_out, int *worst)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
struct mca_config *cfg = &mca_cfg;
int severity, i, taint = 0;
+ struct mce *m = &err->m;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
arch___clear_bit(i, toclear);
@@ -1270,7 +1344,7 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
if (severity == MCE_NO_SEVERITY)
continue;
- mce_read_aux(m, i);
+ mce_read_aux(err, i);
/* assuming valid severity level != 0 */
m->severity = severity;
@@ -1280,17 +1354,17 @@ __mc_scan_banks(struct mce *m, struct pt_regs *regs, struct mce *final,
* done in #MC context, where instrumentation is disabled.
*/
instrumentation_begin();
- mce_log(m);
+ mce_log(err);
instrumentation_end();
if (severity > *worst) {
- *final = *m;
+ *final = *err;
*worst = severity;
}
}
/* mce_clear_state will clear *final, save locally for use later */
- *m = *final;
+ *err = *final;
return taint;
}
@@ -1350,9 +1424,10 @@ static void kill_me_never(struct callback_head *cb)
set_mce_nospec(pfn);
}
-static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callback_head *))
+static void queue_task_work(struct mce_hw_err *err, char *msg, void (*func)(struct callback_head *))
{
int count = ++current->mce_count;
+ struct mce *m = &err->m;
/* First call, save all the details */
if (count == 1) {
@@ -1365,11 +1440,12 @@ static void queue_task_work(struct mce *m, char *msg, void (*func)(struct callba
/* Ten is likely overkill. Don't expect more than two faults before task_work() */
if (count > 10)
- mce_panic("Too many consecutive machine checks while accessing user data", m, msg);
+ mce_panic("Too many consecutive machine checks while accessing user data",
+ err, msg);
/* Second or later call, make sure page address matches the one from first call */
if (count > 1 && (current->mce_addr >> PAGE_SHIFT) != (m->addr >> PAGE_SHIFT))
- mce_panic("Consecutive machine checks to different user pages", m, msg);
+ mce_panic("Consecutive machine checks to different user pages", err, msg);
/* Do not call task_work_add() more than once */
if (count > 1)
@@ -1418,8 +1494,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
int worst = 0, order, no_way_out, kill_current_task, lmce, taint = 0;
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS) = { 0 };
DECLARE_BITMAP(toclear, MAX_NR_BANKS) = { 0 };
- struct mce m, *final;
+ struct mce_hw_err *final;
+ struct mce_hw_err err;
char *msg = NULL;
+ struct mce *m;
if (unlikely(mce_flags.p5))
return pentium_machine_check(regs);
@@ -1457,13 +1535,14 @@ noinstr void do_machine_check(struct pt_regs *regs)
this_cpu_inc(mce_exception_count);
- mce_gather_info(&m, regs);
- m.tsc = rdtsc();
+ mce_gather_info(&err, regs);
+ m = &err.m;
+ m->tsc = rdtsc();
- final = this_cpu_ptr(&mces_seen);
- *final = m;
+ final = this_cpu_ptr(&hw_errs_seen);
+ *final = err;
- no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
+ no_way_out = mce_no_way_out(&err, &msg, valid_banks, regs);
barrier();
@@ -1472,15 +1551,15 @@ noinstr void do_machine_check(struct pt_regs *regs)
* Assume the worst for now, but if we find the
* severity is MCE_AR_SEVERITY we have other options.
*/
- if (!(m.mcgstatus & MCG_STATUS_RIPV))
+ if (!(m->mcgstatus & MCG_STATUS_RIPV))
kill_current_task = 1;
/*
* Check if this MCE is signaled to only this logical processor,
* on Intel, Zhaoxin only.
*/
- if (m.cpuvendor == X86_VENDOR_INTEL ||
- m.cpuvendor == X86_VENDOR_ZHAOXIN)
- lmce = m.mcgstatus & MCG_STATUS_LMCES;
+ if (m->cpuvendor == X86_VENDOR_INTEL ||
+ m->cpuvendor == X86_VENDOR_ZHAOXIN)
+ lmce = m->mcgstatus & MCG_STATUS_LMCES;
/*
* Local machine check may already know that we have to panic.
@@ -1491,12 +1570,12 @@ noinstr void do_machine_check(struct pt_regs *regs)
*/
if (lmce) {
if (no_way_out)
- mce_panic("Fatal local machine check", &m, msg);
+ mce_panic("Fatal local machine check", &err, msg);
} else {
order = mce_start(&no_way_out);
}
- taint = __mc_scan_banks(&m, regs, final, toclear, valid_banks, no_way_out, &worst);
+ taint = __mc_scan_banks(&err, regs, final, toclear, valid_banks, no_way_out, &worst);
if (!no_way_out)
mce_clear_state(toclear);
@@ -1511,7 +1590,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
no_way_out = worst >= MCE_PANIC_SEVERITY;
if (no_way_out)
- mce_panic("Fatal machine check on current CPU", &m, msg);
+ mce_panic("Fatal machine check on current CPU", &err, msg);
}
} else {
/*
@@ -1523,8 +1602,8 @@ noinstr void do_machine_check(struct pt_regs *regs)
* make sure we have the right "msg".
*/
if (worst >= MCE_PANIC_SEVERITY) {
- mce_severity(&m, regs, &msg, true);
- mce_panic("Local fatal machine check!", &m, msg);
+ mce_severity(m, regs, &msg, true);
+ mce_panic("Local fatal machine check!", &err, msg);
}
}
@@ -1542,15 +1621,33 @@ noinstr void do_machine_check(struct pt_regs *regs)
goto out;
/* Fault was in user mode and we need to take some action */
- if ((m.cs & 3) == 3) {
+ if ((m->cs & 3) == 3) {
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- if (!mce_usable_address(&m))
- queue_task_work(&m, msg, kill_me_now);
+ if (!mce_usable_address(m))
+ queue_task_work(&err, msg, kill_me_now);
else
- queue_task_work(&m, msg, kill_me_maybe);
+ queue_task_work(&err, msg, kill_me_maybe);
+ } else if (m->mcgstatus & MCG_STATUS_SEAM_NR) {
+ /*
+ * Saved RIP on stack makes it look like the machine check
+ * was taken in the kernel on the instruction following
+ * the entry to SEAM mode. But MCG_STATUS_SEAM_NR indicates
+ * that the machine check was taken inside SEAM non-root
+ * mode. CPU core has already marked that guest as dead.
+ * It is OK for the kernel to resume execution at the
+ * apparent point of the machine check as the fault did
+ * not occur there. Mark the page as poisoned so it won't
+ * be added to free list when the guest is terminated.
+ */
+ if (mce_usable_address(m)) {
+ struct page *p = pfn_to_online_page(m->addr >> PAGE_SHIFT);
+
+ if (p)
+ SetPageHWPoison(p);
+ }
} else {
/*
* Handle an MCE which has happened in kernel space but from
@@ -1561,13 +1658,13 @@ noinstr void do_machine_check(struct pt_regs *regs)
* corresponding exception handler which would do that is the
* proper one.
*/
- if (m.kflags & MCE_IN_KERNEL_RECOV) {
+ if (m->kflags & MCE_IN_KERNEL_RECOV) {
if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
- mce_panic("Failed kernel mode recovery", &m, msg);
+ mce_panic("Failed kernel mode recovery", &err, msg);
}
- if (m.kflags & MCE_IN_KERNEL_COPYIN)
- queue_task_work(&m, msg, kill_me_never);
+ if (m->kflags & MCE_IN_KERNEL_COPYIN)
+ queue_task_work(&err, msg, kill_me_never);
}
out:
@@ -1601,13 +1698,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
-static unsigned long mce_adjust_timer_default(unsigned long interval)
-{
- return interval;
-}
-
-static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-
static void __start_timer(struct timer_list *t, unsigned long interval)
{
unsigned long when = jiffies + interval;
@@ -1637,15 +1727,9 @@ static void mce_timer_fn(struct timer_list *t)
iv = __this_cpu_read(mce_next_interval);
- if (mce_available(this_cpu_ptr(&cpu_info))) {
+ if (mce_available(this_cpu_ptr(&cpu_info)))
mc_poll_banks();
- if (mce_intel_cmci_poll()) {
- iv = mce_adjust_timer(iv);
- goto done;
- }
- }
-
/*
* Alert userspace if needed. If we logged an MCE, reduce the polling
* interval, otherwise increase the polling interval.
@@ -1655,23 +1739,29 @@ static void mce_timer_fn(struct timer_list *t)
else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
-done:
- __this_cpu_write(mce_next_interval, iv);
- __start_timer(t, iv);
+ if (mce_get_storm_mode()) {
+ __start_timer(t, HZ);
+ } else {
+ __this_cpu_write(mce_next_interval, iv);
+ __start_timer(t, iv);
+ }
}
/*
- * Ensure that the timer is firing in @interval from now.
+ * When a storm starts on any bank on this CPU, switch to polling
+ * once per second. When the storm ends, revert to the default
+ * polling interval.
*/
-void mce_timer_kick(unsigned long interval)
+void mce_timer_kick(bool storm)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- unsigned long iv = __this_cpu_read(mce_next_interval);
- __start_timer(t, interval);
+ mce_set_storm_mode(storm);
- if (interval < iv)
- __this_cpu_write(mce_next_interval, interval);
+ if (storm)
+ __start_timer(t, HZ);
+ else
+ __this_cpu_write(mce_next_interval, check_interval * HZ);
}
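With the CMCI-specific backoff gone, the poll cadence depends only on whether any bank on this CPU is in storm mode. A minimal illustrative sketch of the resulting interval selection, reusing the names from the hunks above (check_interval, mce_get_storm_mode()); this helper is not part of the patch:

static unsigned long example_next_poll_interval(unsigned long iv)
{
	if (mce_get_storm_mode())
		return HZ;	/* storm: poll each bank once per second */

	/* quiet: keep backing off up to the configured check_interval */
	return min(iv * 2, round_jiffies_relative(check_interval * HZ));
}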
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -1688,7 +1778,7 @@ static void mce_timer_delete_all(void)
* Can be called from interrupt context, but not from machine check/NMI
* context.
*/
-int mce_notify_irq(void)
+bool mce_notify_irq(void)
{
/* Not more than two messages every minute */
static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
@@ -1699,9 +1789,9 @@ int mce_notify_irq(void)
if (__ratelimit(&ratelimit))
pr_info(HW_ERR "Machine check events logged\n");
- return 1;
+ return true;
}
- return 0;
+ return false;
}
EXPORT_SYMBOL_GPL(mce_notify_irq);
@@ -1820,101 +1910,120 @@ static void __mcheck_cpu_check_banks(void)
}
}
-/* Add per CPU specific workarounds here */
-static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+static void apply_quirks_amd(struct cpuinfo_x86 *c)
{
struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
- struct mca_config *cfg = &mca_cfg;
-
- if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
- pr_info("unknown CPU type - not enabling MCE support\n");
- return -EOPNOTSUPP;
- }
/* This should be disabled by the BIOS, but isn't always */
- if (c->x86_vendor == X86_VENDOR_AMD) {
- if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
- /*
- * disable GART TBL walk error reporting, which
- * trips off incorrectly with the IOMMU & 3ware
- * & Cerberus:
- */
- clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
- }
- if (c->x86 < 0x11 && cfg->bootlog < 0) {
- /*
- * Lots of broken BIOS around that don't clear them
- * by default and leave crap in there. Don't log:
- */
- cfg->bootlog = 0;
- }
+ if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) {
/*
- * Various K7s with broken bank 0 around. Always disable
- * by default.
+ * disable GART TBL walk error reporting, which
+ * trips off incorrectly with the IOMMU & 3ware
+ * & Cerberus:
*/
- if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0)
- mce_banks[0].ctl = 0;
+ clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
+ }
+ if (c->x86 < 0x11 && mca_cfg.bootlog < 0) {
/*
- * overflow_recov is supported for F15h Models 00h-0fh
- * even though we don't have a CPUID bit for it.
+ * Lots of broken BIOS around that don't clear them
+ * by default and leave crap in there. Don't log:
*/
- if (c->x86 == 0x15 && c->x86_model <= 0xf)
- mce_flags.overflow_recov = 1;
+ mca_cfg.bootlog = 0;
+ }
- if (c->x86 >= 0x17 && c->x86 <= 0x1A)
- mce_flags.zen_ifu_quirk = 1;
+ /*
+ * Various K7s with broken bank 0 around. Always disable
+ * by default.
+ */
+ if (c->x86 == 6 && this_cpu_read(mce_num_banks))
+ mce_banks[0].ctl = 0;
- }
+ /*
+ * overflow_recov is supported for F15h Models 00h-0fh
+ * even though we don't have a CPUID bit for it.
+ */
+ if (c->x86 == 0x15 && c->x86_model <= 0xf)
+ mce_flags.overflow_recov = 1;
- if (c->x86_vendor == X86_VENDOR_INTEL) {
- /*
- * SDM documents that on family 6 bank 0 should not be written
- * because it aliases to another special BIOS controlled
- * register.
- * But it's not aliased anymore on model 0x1a+
- * Don't ignore bank 0 completely because there could be a
- * valid event later, merely don't write CTL0.
- */
+ if (c->x86 >= 0x17 && c->x86 <= 0x1A)
+ mce_flags.zen_ifu_quirk = 1;
+}
- if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
- mce_banks[0].init = false;
+static void apply_quirks_intel(struct cpuinfo_x86 *c)
+{
+ struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array);
- /*
- * All newer Intel systems support MCE broadcasting. Enable
- * synchronization with a one second timeout.
- */
- if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
- cfg->monarch_timeout < 0)
- cfg->monarch_timeout = USEC_PER_SEC;
+ /* Older CPUs (prior to family 6) don't need quirks. */
+ if (c->x86_vfm < INTEL_PENTIUM_PRO)
+ return;
- /*
- * There are also broken BIOSes on some Pentium M and
- * earlier systems:
- */
- if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
- cfg->bootlog = 0;
+ /*
+ * SDM documents that on family 6 bank 0 should not be written
+ * because it aliases to another special BIOS controlled
+ * register.
+ * But it's not aliased anymore on model 0x1a+
+ * Don't ignore bank 0 completely because there could be a
+ * valid event later, merely don't write CTL0.
+ */
+ if (c->x86_vfm < INTEL_NEHALEM_EP && this_cpu_read(mce_num_banks))
+ mce_banks[0].init = false;
- if (c->x86 == 6 && c->x86_model == 45)
- mce_flags.snb_ifu_quirk = 1;
+ /*
+ * All newer Intel systems support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86_vfm >= INTEL_CORE_YONAH && mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = USEC_PER_SEC;
- /*
- * Skylake, Cascacde Lake and Cooper Lake require a quirk on
- * rep movs.
- */
- if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
- mce_flags.skx_repmov_quirk = 1;
+ /*
+ * There are also broken BIOSes on some Pentium M and
+ * earlier systems:
+ */
+ if (c->x86_vfm < INTEL_CORE_YONAH && mca_cfg.bootlog < 0)
+ mca_cfg.bootlog = 0;
+
+ if (c->x86_vfm == INTEL_SANDYBRIDGE_X)
+ mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascade Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86_vfm == INTEL_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
+}
+
+static void apply_quirks_zhaoxin(struct cpuinfo_x86 *c)
+{
+ /*
+ * All newer Zhaoxin CPUs support MCE broadcasting. Enable
+ * synchronization with a one second timeout.
+ */
+ if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
+ if (mca_cfg.monarch_timeout < 0)
+ mca_cfg.monarch_timeout = USEC_PER_SEC;
}
+}
- if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
- /*
- * All newer Zhaoxin CPUs support MCE broadcasting. Enable
- * synchronization with a one second timeout.
- */
- if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) {
- if (cfg->monarch_timeout < 0)
- cfg->monarch_timeout = USEC_PER_SEC;
- }
+/* Add per CPU specific workarounds here */
+static bool __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
+{
+ struct mca_config *cfg = &mca_cfg;
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_UNKNOWN:
+ pr_info("unknown CPU type - not enabling MCE support\n");
+ return false;
+ case X86_VENDOR_AMD:
+ apply_quirks_amd(c);
+ break;
+ case X86_VENDOR_INTEL:
+ apply_quirks_intel(c);
+ break;
+ case X86_VENDOR_ZHAOXIN:
+ apply_quirks_zhaoxin(c);
+ break;
}
if (cfg->monarch_timeout < 0)
@@ -1922,28 +2031,28 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (cfg->bootlog != 0)
cfg->panic_timeout = 30;
- return 0;
+ return true;
}
-static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+static bool __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
{
if (c->x86 != 5)
- return 0;
+ return false;
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
mce_flags.p5 = 1;
- return 1;
+ return true;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
mce_flags.winchip = 1;
- return 1;
+ return true;
default:
- return 0;
+ return false;
}
- return 0;
+ return false;
}
/*
@@ -1995,7 +2104,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
intel_init_cmci();
intel_init_lmce();
- mce_adjust_timer = cmci_intel_adjust_timer;
}
static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
@@ -2008,16 +2116,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
- mce_adjust_timer = cmci_intel_adjust_timer;
- break;
-
- case X86_VENDOR_AMD: {
- mce_amd_feature_init(c);
break;
- }
+ case X86_VENDOR_AMD:
case X86_VENDOR_HYGON:
- mce_hygon_feature_init(c);
+ mce_amd_feature_init(c);
break;
case X86_VENDOR_CENTAUR:
@@ -2134,6 +2237,31 @@ DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
exc_machine_check_user(regs);
local_db_restore(dr7);
}
+
+#ifdef CONFIG_X86_FRED
+/*
+ * Depending on the ring level at which it occurred, i.e., in user or
+ * kernel context, a #MCE needs to be handled on a different stack: a
+ * user #MCE on the current task stack, a kernel #MCE on a dedicated stack.
+ *
+ * This is exactly how FRED event delivery invokes an exception
+ * handler: ring 3 event on level 0 stack, i.e., current task stack;
+ * ring 0 event on the #MCE dedicated stack specified in the
+ * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED machine check entry
+ * stub doesn't do stack switch.
+ */
+DEFINE_FREDENTRY_MCE(exc_machine_check)
+{
+ unsigned long dr7;
+
+ dr7 = local_db_save();
+ if (user_mode(regs))
+ exc_machine_check_user(regs);
+ else
+ exc_machine_check_kernel(regs);
+ local_db_restore(dr7);
+}
+#endif
#else
/* 32bit unified entry point */
DEFINE_IDTENTRY_RAW(exc_machine_check)
@@ -2166,12 +2294,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_cap_init();
- if (__mcheck_cpu_apply_quirks(c) < 0) {
+ if (!__mcheck_cpu_apply_quirks(c)) {
mca_cfg.disabled = 1;
return;
}
- if (mce_gen_pool_init()) {
+ if (!mce_gen_pool_init()) {
mca_cfg.disabled = 1;
pr_emerg("Couldn't allocate MCE records pool!\n");
return;
@@ -2399,7 +2527,7 @@ static void mce_enable_ce(void *all)
__mcheck_cpu_init_timer();
}
-static struct bus_type mce_subsys = {
+static const struct bus_type mce_subsys = {
.name = "machinecheck",
.dev_name = "machinecheck",
};
@@ -2442,12 +2570,14 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
return -EINVAL;
b = &per_cpu(mce_banks_array, s->id)[bank];
-
if (!b->init)
return -ENODEV;
b->ctl = new;
+
+ mutex_lock(&mce_sysfs_mutex);
mce_restart();
+ mutex_unlock(&mce_sysfs_mutex);
return size;
}
@@ -2568,9 +2698,6 @@ static int mce_device_create(unsigned int cpu)
int err;
int i, j;
- if (!mce_available(&boot_cpu_data))
- return -EIO;
-
dev = per_cpu(mce_device, cpu);
if (dev)
return 0;
@@ -2665,8 +2792,6 @@ static void mce_reenable_cpu(void)
static int mce_cpu_dead(unsigned int cpu)
{
- mce_intel_hcpu_update(cpu);
-
/* intentionally ignoring frozen here */
if (!cpuhp_tasks_frozen)
cmci_rediscover();
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index a05ac0716ecf..8d023239ce18 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -264,15 +264,8 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
return put_user(sizeof(struct mce), p);
case MCE_GET_LOG_LEN:
return put_user(mcelog->len, p);
- case MCE_GETCLEAR_FLAGS: {
- unsigned flags;
-
- do {
- flags = mcelog->flags;
- } while (cmpxchg(&mcelog->flags, flags, 0) != flags);
-
- return put_user(flags, p);
- }
+ case MCE_GETCLEAR_FLAGS:
+ return put_user(xchg(&mcelog->flags, 0), p);
default:
return -ENOTTY;
}
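The ioctl now clears the flags with a single atomic exchange rather than a compare-and-swap retry loop; both forms return the pre-clear value. A side-by-side sketch, illustrative only, using the same mcelog->flags field as the hunk:

	unsigned int flags;

	/* old: loop until the CAS observes an unchanged value */
	do {
		flags = mcelog->flags;
	} while (cmpxchg(&mcelog->flags, flags, 0) != flags);

	/* new: one unconditional swap performs the same read-and-clear */
	flags = xchg(&mcelog->flags, 0);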
@@ -314,7 +307,7 @@ static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
/*
* Need to give user space some time to set everything up,
- * so do it a jiffie or two later everywhere.
+ * so do it a jiffy or two later everywhere.
*/
schedule_timeout(2);
@@ -331,7 +324,6 @@ static const struct file_operations mce_chrdev_ops = {
.poll = mce_chrdev_poll,
.unlocked_ioctl = mce_chrdev_ioctl,
.compat_ioctl = compat_ptr_ioctl,
- .llseek = no_llseek,
};
static struct miscdevice mce_chrdev_device = {
diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c
index fbe8b61c3413..3ca9c007a666 100644
--- a/arch/x86/kernel/cpu/mce/genpool.c
+++ b/arch/x86/kernel/cpu/mce/genpool.c
@@ -16,14 +16,14 @@
* used to save error information organized in a lock-less list.
*
* This memory pool is only to be used to save MCE records in MCE context.
- * MCE events are rare, so a fixed size memory pool should be enough. Use
- * 2 pages to save MCE events for now (~80 MCE records at most).
+ * MCE events are rare, so a fixed size memory pool should be enough.
+ * Allocate on a sliding scale based on the number of CPUs.

*/
-#define MCE_POOLSZ (2 * PAGE_SIZE)
+#define MCE_MIN_ENTRIES 80
+#define MCE_PER_CPU 2
static struct gen_pool *mce_evt_pool;
static LLIST_HEAD(mce_event_llist);
-static char gen_pool_buf[MCE_POOLSZ];
/*
* Compare the record "t" with each of the records on list "l" to see if
@@ -31,15 +31,15 @@ static char gen_pool_buf[MCE_POOLSZ];
*/
static bool is_duplicate_mce_record(struct mce_evt_llist *t, struct mce_evt_llist *l)
{
+ struct mce_hw_err *err1, *err2;
struct mce_evt_llist *node;
- struct mce *m1, *m2;
- m1 = &t->mce;
+ err1 = &t->err;
llist_for_each_entry(node, &l->llnode, llnode) {
- m2 = &node->mce;
+ err2 = &node->err;
- if (!mce_cmp(m1, m2))
+ if (!mce_cmp(&err1->m, &err2->m))
return true;
}
return false;
@@ -73,8 +73,8 @@ struct llist_node *mce_gen_pool_prepare_records(void)
void mce_gen_pool_process(struct work_struct *__unused)
{
- struct llist_node *head;
struct mce_evt_llist *node, *tmp;
+ struct llist_node *head;
struct mce *mce;
head = llist_del_all(&mce_event_llist);
@@ -83,7 +83,7 @@ void mce_gen_pool_process(struct work_struct *__unused)
head = llist_reverse_order(head);
llist_for_each_entry_safe(node, tmp, head, llnode) {
- mce = &node->mce;
+ mce = &node->err.m;
blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
}
@@ -94,54 +94,63 @@ bool mce_gen_pool_empty(void)
return llist_empty(&mce_event_llist);
}
-int mce_gen_pool_add(struct mce *mce)
+bool mce_gen_pool_add(struct mce_hw_err *err)
{
struct mce_evt_llist *node;
- if (filter_mce(mce))
- return -EINVAL;
+ if (filter_mce(&err->m))
+ return false;
if (!mce_evt_pool)
- return -EINVAL;
+ return false;
node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
if (!node) {
pr_warn_ratelimited("MCE records pool full!\n");
- return -ENOMEM;
+ return false;
}
- memcpy(&node->mce, mce, sizeof(*mce));
+ memcpy(&node->err, err, sizeof(*err));
llist_add(&node->llnode, &mce_event_llist);
- return 0;
+ return true;
}
-static int mce_gen_pool_create(void)
+static bool mce_gen_pool_create(void)
{
- struct gen_pool *tmpp;
- int ret = -ENOMEM;
-
- tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
- if (!tmpp)
- goto out;
+ int mce_numrecords, mce_poolsz, order;
+ struct gen_pool *gpool;
+ void *mce_pool;
+
+ order = order_base_2(sizeof(struct mce_evt_llist));
+ gpool = gen_pool_create(order, -1);
+ if (!gpool)
+ return false;
+
+ mce_numrecords = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU);
+ mce_poolsz = mce_numrecords * (1 << order);
+ mce_pool = kmalloc(mce_poolsz, GFP_KERNEL);
+ if (!mce_pool) {
+ gen_pool_destroy(gpool);
+ return false;
+ }
- ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
- if (ret) {
- gen_pool_destroy(tmpp);
- goto out;
+ if (gen_pool_add(gpool, (unsigned long)mce_pool, mce_poolsz, -1)) {
+ gen_pool_destroy(gpool);
+ kfree(mce_pool);
+ return false;
}
- mce_evt_pool = tmpp;
+ mce_evt_pool = gpool;
-out:
- return ret;
+ return true;
}
-int mce_gen_pool_init(void)
+bool mce_gen_pool_init(void)
{
/* Just init mce_gen_pool once. */
if (mce_evt_pool)
- return 0;
+ return true;
return mce_gen_pool_create();
}
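The pool now scales with CPU count instead of being a fixed two pages. A worked sizing example under assumed numbers (only the formula comes from the hunk; the node size is illustrative):

static size_t example_pool_bytes(void)
{
	int order = order_base_2(sizeof(struct mce_evt_llist));	/* e.g. 8 -> 256-byte slots */
	int nrec  = max(MCE_MIN_ENTRIES, num_possible_cpus() * MCE_PER_CPU);	/* 64 CPUs -> 128 */

	return nrec * (1 << order);	/* e.g. 128 * 256 = 32 KiB handed to kmalloc() */
}

Small systems never drop below the 80-record floor, so the behaviour there is close to the old two-page pool.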
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 4d8d4bcf915d..313fe682db33 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -430,11 +430,9 @@ static void trigger_thr_int(void *info)
static u32 get_nbc_for_node(int node_id)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
u32 cores_per_node;
- cores_per_node = (c->x86_max_cores * smp_num_siblings) / amd_get_nodes_per_socket();
-
+ cores_per_node = topology_num_threads_per_package() / topology_amd_nodes_per_pkg();
return cores_per_node * node_id;
}
@@ -489,19 +487,24 @@ static void prepare_msrs(void *info)
wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
}
- wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
+
+ if (m.misc)
+ wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
} else {
wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
- wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
+
+ if (m.misc)
+ wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
}
}
static void do_inject(void)
{
- u64 mcg_status = 0;
unsigned int cpu = i_mce.extcpu;
+ struct mce_hw_err err;
+ u64 mcg_status = 0;
u8 b = i_mce.bank;
i_mce.tsc = rdtsc_ordered();
@@ -515,7 +518,8 @@ static void do_inject(void)
i_mce.status |= MCI_STATUS_SYNDV;
if (inj_type == SW_INJ) {
- mce_log(&i_mce);
+ err.m = i_mce;
+ mce_log(&err);
return;
}
@@ -543,8 +547,8 @@ static void do_inject(void)
if (boot_cpu_has(X86_FEATURE_AMD_DCM) &&
b == 4 &&
boot_cpu_data.x86 < 0x17) {
- toggle_nb_mca_mst_cpu(topology_die_id(cpu));
- cpu = get_nbc_for_node(topology_die_id(cpu));
+ toggle_nb_mca_mst_cpu(topology_amd_node_id(cpu));
+ cpu = get_nbc_for_node(topology_amd_node_id(cpu));
}
cpus_read_lock();
@@ -746,6 +750,7 @@ static void check_hw_inj_possible(void)
wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status);
rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status);
+ wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0);
if (!status) {
hw_injection_possible = false;
@@ -796,4 +801,5 @@ static void __exit inject_exit(void)
module_init(inject_init);
module_exit(inject_exit);
+MODULE_DESCRIPTION("Machine check injection support");
MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 52bce533ddcc..f863df0ff42c 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -13,7 +13,7 @@
#include <linux/cpumask.h>
#include <asm/apic.h>
#include <asm/cpufeature.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -42,15 +42,6 @@
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
- * CMCI storm detection backoff counter
- *
- * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
- * encountered an error. If not, we decrement it by one. We signal the end of
- * the CMCI storm when it reaches 0.
- */
-static DEFINE_PER_CPU(int, cmci_backoff_cnt);
-
-/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
@@ -63,29 +54,33 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
*/
static DEFINE_SPINLOCK(cmci_poll_lock);
+/* Linux non-storm CMCI threshold (may be overridden by BIOS) */
#define CMCI_THRESHOLD 1
-#define CMCI_POLL_INTERVAL (30 * HZ)
-#define CMCI_STORM_INTERVAL (HZ)
-#define CMCI_STORM_THRESHOLD 15
-static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
-
-enum {
- CMCI_STORM_NONE,
- CMCI_STORM_ACTIVE,
- CMCI_STORM_SUBSIDED,
-};
+/*
+ * MCi_CTL2 threshold for each bank when there is no storm.
+ * Default value for each bank may have been set by BIOS.
+ */
+static u16 cmci_threshold[MAX_NR_BANKS];
-static atomic_t cmci_storm_on_cpus;
+/*
+ * High threshold to limit CMCI rate during storms. Max supported is
+ * 0x7FFF. Use this slightly smaller value so it has a distinctive
+ * signature when someone asks "Why am I not seeing all corrected errors?"
+ * A high threshold is used instead of just disabling CMCI for a
+ * bank because both corrected and uncorrected errors may be logged
+ * in the same bank and signalled with CMCI. The threshold only applies
+ * to corrected errors, so keeping CMCI enabled means that uncorrected
+ * errors will still be processed in a timely fashion.
+ */
+#define CMCI_STORM_THRESHOLD 32749
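Beyond rate limiting, the unusual value doubles as a marker: the threshold field survives in MCi_CTL2 when a bank changes owner, so a CPU that claims a bank can tell the previous owner was in storm mode just by reading it back, which is what cmci_claim_bank() below does. A condensed sketch of that check (an illustrative rearrangement of the hunk's own code, not a new helper):

static bool example_bank_was_storming(int bank)
{
	u64 val;

	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
	return (val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD;
}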
-static int cmci_supported(int *banks)
+static bool cmci_supported(int *banks)
{
u64 cap;
if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
- return 0;
+ return false;
/*
* Vendor check is not strictly needed, but the initial
@@ -94,12 +89,13 @@ static int cmci_supported(int *banks)
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
- return 0;
+ return false;
if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6)
- return 0;
+ return false;
+
rdmsrl(MSR_IA32_MCG_CAP, cap);
- *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+ *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK);
return !!(cap & MCG_CMCI_P);
}
@@ -134,204 +130,166 @@ static bool lmce_supported(void)
return tmp & FEAT_CTL_LMCE_ENABLED;
}
-bool mce_intel_cmci_poll(void)
+/*
+ * Set a new CMCI threshold value. Preserve the state of the
+ * MCI_CTL2_CMCI_EN bit in case this happens during a
+ * cmci_rediscover() operation.
+ */
+static void cmci_set_threshold(int bank, int thresh)
{
- if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
- return false;
-
- /*
- * Reset the counter if we've logged an error in the last poll
- * during the storm.
- */
- if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
- this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
- else
- this_cpu_dec(cmci_backoff_cnt);
+ unsigned long flags;
+ u64 val;
- return true;
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
-void mce_intel_hcpu_update(unsigned long cpu)
+void mce_intel_handle_storm(int bank, bool on)
{
- if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
- atomic_dec(&cmci_storm_on_cpus);
+ if (on)
+ cmci_set_threshold(bank, CMCI_STORM_THRESHOLD);
+ else
+ cmci_set_threshold(bank, cmci_threshold[bank]);
+}
- per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
}
-static void cmci_toggle_interrupt_mode(bool on)
+/*
+ * Check all the reasons why current CPU cannot claim
+ * ownership of a bank.
+ * 1: CPU already owns this bank
+ * 2: BIOS owns this bank
+ * 3: Some other CPU owns this bank
+ */
+static bool cmci_skip_bank(int bank, u64 *val)
{
- unsigned long flags, *owned;
- int bank;
- u64 val;
+ unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- owned = this_cpu_ptr(mce_banks_owned);
- for_each_set_bit(bank, owned, MAX_NR_BANKS) {
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ if (test_bit(bank, owned))
+ return true;
- if (on)
- val |= MCI_CTL2_CMCI_EN;
- else
- val &= ~MCI_CTL2_CMCI_EN;
+ /* Skip banks in firmware first mode */
+ if (test_bit(bank, mce_banks_ce_disabled))
+ return true;
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
- }
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), *val);
-unsigned long cmci_intel_adjust_timer(unsigned long interval)
-{
- if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
- (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
- mce_notify_irq();
- return CMCI_STORM_INTERVAL;
+ /* Already owned by someone else? */
+ if (*val & MCI_CTL2_CMCI_EN) {
+ clear_bit(bank, owned);
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ return true;
}
- switch (__this_cpu_read(cmci_storm_state)) {
- case CMCI_STORM_ACTIVE:
-
- /*
- * We switch back to interrupt mode once the poll timer has
- * silenced itself. That means no events recorded and the timer
- * interval is back to our poll interval.
- */
- __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
- if (!atomic_sub_return(1, &cmci_storm_on_cpus))
- pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+ return false;
+}
- fallthrough;
+/*
+ * Decide which CMCI interrupt threshold to use:
+ * 1: If this bank is in storm mode from whichever CPU was
+ * the previous owner, stay in storm mode.
+ * 2: If ignoring any threshold set by BIOS, set Linux default
+ * 3: Try to honor BIOS threshold (unless buggy BIOS set it at zero).
+ */
+static u64 cmci_pick_threshold(u64 val, int *bios_zero_thresh)
+{
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ return val;
- case CMCI_STORM_SUBSIDED:
+ if (!mca_cfg.bios_cmci_threshold) {
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= CMCI_THRESHOLD;
+ } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
/*
- * We wait for all CPUs to go back to SUBSIDED state. When that
- * happens we switch back to interrupt mode.
+ * If bios_cmci_threshold boot option was specified
+ * but the threshold is zero, we'll try to initialize
+ * it to 1.
*/
- if (!atomic_read(&cmci_storm_on_cpus)) {
- __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
- cmci_toggle_interrupt_mode(true);
- cmci_recheck();
- }
- return CMCI_POLL_INTERVAL;
- default:
-
- /* We have shiny weather. Let the poll do whatever it thinks. */
- return interval;
+ *bios_zero_thresh = 1;
+ val |= CMCI_THRESHOLD;
}
+
+ return val;
}
-static bool cmci_storm_detect(void)
+/*
+ * Try to claim ownership of a bank.
+ */
+static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_wrong_thresh)
{
- unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
- unsigned long ts = __this_cpu_read(cmci_time_stamp);
- unsigned long now = jiffies;
- int r;
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
- if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
- return true;
+ val |= MCI_CTL2_CMCI_EN;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
- if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
- cnt++;
- } else {
- cnt = 1;
- __this_cpu_write(cmci_time_stamp, now);
+ /* If the enable bit did not stick, this bank should be polled. */
+ if (!(val & MCI_CTL2_CMCI_EN)) {
+ WARN_ON(!test_bit(bank, this_cpu_ptr(mce_poll_banks)));
+ storm->banks[bank].poll_only = true;
+ return;
}
- __this_cpu_write(cmci_storm_cnt, cnt);
-
- if (cnt <= CMCI_STORM_THRESHOLD)
- return false;
- cmci_toggle_interrupt_mode(false);
- __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
- r = atomic_add_return(1, &cmci_storm_on_cpus);
- mce_timer_kick(CMCI_STORM_INTERVAL);
- this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+ /* This CPU successfully set the enable bit. */
+ set_bit(bank, (void *)this_cpu_ptr(&mce_banks_owned));
- if (r == 1)
- pr_notice("CMCI storm detected: switching to poll mode\n");
- return true;
-}
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) {
+ pr_notice("CPU%d BANK%d CMCI inherited storm\n", smp_processor_id(), bank);
+ mce_inherit_storm(bank);
+ cmci_storm_begin(bank);
+ } else {
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ }
-/*
- * The interrupt handler. This is called on every event.
- * Just call the poller directly to log any events.
- * This could in theory increase the threshold under high load,
- * but doesn't for now.
- */
-static void intel_threshold_interrupt(void)
-{
- if (cmci_storm_detect())
- return;
+ /*
+ * We are able to set thresholds for some banks that
+ * had a threshold of 0. This means the BIOS has not
+ * set the thresholds properly or does not work with
+ * this boot option. Note down now and report later.
+ */
+ if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
+ (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+ *bios_wrong_thresh = 1;
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+ /* Save default threshold for each bank */
+ if (cmci_threshold[bank] == 0)
+ cmci_threshold[bank] = val & MCI_CTL2_CMCI_THRESHOLD_MASK;
}
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
- * banks.
+ * banks. Called during initial bootstrap, and also for hotplug CPU operations
+ * to rediscover/reassign machine check banks.
*/
static void cmci_discover(int banks)
{
- unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
+ int bios_wrong_thresh = 0;
unsigned long flags;
int i;
- int bios_wrong_thresh = 0;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;
- if (test_bit(i, owned))
+ if (cmci_skip_bank(i, &val))
continue;
- /* Skip banks in firmware first mode */
- if (test_bit(i, mce_banks_ce_disabled))
- continue;
-
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Already owned by someone else? */
- if (val & MCI_CTL2_CMCI_EN) {
- clear_bit(i, owned);
- __clear_bit(i, this_cpu_ptr(mce_poll_banks));
- continue;
- }
-
- if (!mca_cfg.bios_cmci_threshold) {
- val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
- val |= CMCI_THRESHOLD;
- } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
- /*
- * If bios_cmci_threshold boot option was specified
- * but the threshold is zero, we'll try to initialize
- * it to 1.
- */
- bios_zero_thresh = 1;
- val |= CMCI_THRESHOLD;
- }
-
- val |= MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(i), val);
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-
- /* Did the enable bit stick? -- the bank supports CMCI */
- if (val & MCI_CTL2_CMCI_EN) {
- set_bit(i, owned);
- __clear_bit(i, this_cpu_ptr(mce_poll_banks));
- /*
- * We are able to set thresholds for some banks that
- * had a threshold of 0. This means the BIOS has not
- * set the thresholds properly or does not work with
- * this boot option. Note down now and report later.
- */
- if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
- (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
- bios_wrong_thresh = 1;
- } else {
- WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
- }
+ val = cmci_pick_threshold(val, &bios_zero_thresh);
+ cmci_claim_bank(i, val, bios_zero_thresh, &bios_wrong_thresh);
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
@@ -370,6 +328,9 @@ static void __cmci_disable_bank(int bank)
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
__clear_bit(bank, this_cpu_ptr(mce_banks_owned));
+
+ if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD)
+ cmci_storm_end(bank);
}
/*
@@ -495,10 +456,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
{
u64 error_control;
- switch (c->x86_model) {
- case INTEL_FAM6_SANDYBRIDGE_X:
- case INTEL_FAM6_IVYBRIDGE_X:
- case INTEL_FAM6_HASWELL_X:
+ switch (c->x86_vfm) {
+ case INTEL_SANDYBRIDGE_X:
+ case INTEL_IVYBRIDGE_X:
+ case INTEL_HASWELL_X:
if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control))
return;
error_control |= 2;
@@ -524,12 +485,11 @@ bool intel_filter_mce(struct mce *m)
struct cpuinfo_x86 *c = &boot_cpu_data;
/* MCE errata HSD131, HSM142, HSW131, BDM48, HSM142 and SKX37 */
- if ((c->x86 == 6) &&
- ((c->x86_model == INTEL_FAM6_HASWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_L) ||
- (c->x86_model == INTEL_FAM6_BROADWELL) ||
- (c->x86_model == INTEL_FAM6_HASWELL_G) ||
- (c->x86_model == INTEL_FAM6_SKYLAKE_X)) &&
+ if ((c->x86_vfm == INTEL_HASWELL ||
+ c->x86_vfm == INTEL_HASWELL_L ||
+ c->x86_vfm == INTEL_BROADWELL ||
+ c->x86_vfm == INTEL_HASWELL_G ||
+ c->x86_vfm == INTEL_SKYLAKE_X) &&
(m->bank == 0) &&
((m->status & 0xa0000000ffffffff) == 0x80000000000f0005))
return true;
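Throughout this file, and in the quirk rework in core.c above, the two-step family/model comparison is replaced with a single match on x86_vfm, which packs vendor, family and model into one identifier (the exact encoding lives in <asm/cpu_device_id.h> and is assumed here, not shown by the diff). The shape of the change, taken from the hunks themselves:

	/* old: check family and model separately */
	if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
		mce_flags.skx_repmov_quirk = 1;

	/* new: one packed vendor/family/model compare */
	if (c->x86_vfm == INTEL_SKYLAKE_X)
		mce_flags.skx_repmov_quirk = 1;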
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index e13a26c9c0ac..95a504ece43e 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -26,13 +26,13 @@ extern struct blocking_notifier_head x86_mce_decoder_chain;
struct mce_evt_llist {
struct llist_node llnode;
- struct mce mce;
+ struct mce_hw_err err;
};
void mce_gen_pool_process(struct work_struct *__unused);
bool mce_gen_pool_empty(void);
-int mce_gen_pool_add(struct mce *mce);
-int mce_gen_pool_init(void);
+bool mce_gen_pool_add(struct mce_hw_err *err);
+bool mce_gen_pool_init(void);
struct llist_node *mce_gen_pool_prepare_records(void);
int mce_severity(struct mce *a, struct pt_regs *regs, char **msg, bool is_excp);
@@ -41,9 +41,7 @@ struct dentry *mce_get_debugfs_dir(void);
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
-unsigned long cmci_intel_adjust_timer(unsigned long interval);
-bool mce_intel_cmci_poll(void);
-void mce_intel_hcpu_update(unsigned long cpu);
+void mce_intel_handle_storm(int bank, bool on);
void cmci_disable_bank(int bank);
void intel_init_cmci(void);
void intel_init_lmce(void);
@@ -51,9 +49,7 @@ void intel_clear_lmce(void);
bool intel_filter_mce(struct mce *m);
bool intel_mce_usable_address(struct mce *m);
#else
-# define cmci_intel_adjust_timer mce_adjust_timer_default
-static inline bool mce_intel_cmci_poll(void) { return false; }
-static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void mce_intel_handle_storm(int bank, bool on) { }
static inline void cmci_disable_bank(int bank) { }
static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
@@ -62,7 +58,63 @@ static inline bool intel_filter_mce(struct mce *m) { return false; }
static inline bool intel_mce_usable_address(struct mce *m) { return false; }
#endif
-void mce_timer_kick(unsigned long interval);
+void mce_timer_kick(bool storm);
+
+#ifdef CONFIG_X86_MCE_THRESHOLD
+void cmci_storm_begin(unsigned int bank);
+void cmci_storm_end(unsigned int bank);
+void mce_track_storm(struct mce *mce);
+void mce_inherit_storm(unsigned int bank);
+bool mce_get_storm_mode(void);
+void mce_set_storm_mode(bool storm);
+#else
+static inline void cmci_storm_begin(unsigned int bank) {}
+static inline void cmci_storm_end(unsigned int bank) {}
+static inline void mce_track_storm(struct mce *mce) {}
+static inline void mce_inherit_storm(unsigned int bank) {}
+static inline bool mce_get_storm_mode(void) { return false; }
+static inline void mce_set_storm_mode(bool storm) {}
+#endif
+
+/*
+ * history: Bitmask tracking error occurrences. Each set bit
+ * represents an error seen.
+ *
+ * timestamp: Last time (in jiffies) that the bank was polled.
+ * in_storm_mode: Is this bank in storm mode?
+ * poll_only: Bank does not support CMCI, skip storm tracking.
+ */
+struct storm_bank {
+ u64 history;
+ u64 timestamp;
+ bool in_storm_mode;
+ bool poll_only;
+};
+
+#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE)
+
+/* How many errors within the history buffer mark the start of a storm. */
+#define STORM_BEGIN_THRESHOLD 5
+
+/*
+ * How many polls of machine check bank without an error before declaring
+ * the storm is over. Since it is tracked by the bitmasks in the history
+ * field of struct storm_bank, the mask is 30 bits [0 ... 29].
+ */
+#define STORM_END_POLL_THRESHOLD 29
+
+/*
+ * banks: per-cpu, per-bank details
+ * stormy_bank_count: count of MC banks in storm state
+ * poll_mode: CPU is in poll mode
+ */
+struct mca_storm_desc {
+ struct storm_bank banks[MAX_NR_BANKS];
+ u8 stormy_bank_count;
+ bool poll_mode;
+};
+
+DECLARE_PER_CPU(struct mca_storm_desc, storm_desc);
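The thresholds interact with the 64-bit history bitmap as follows: a storm is declared once five set bits accumulate anywhere in the window, and it is declared over once the low 30 bits are all clear. The predicates, condensed from mce_track_storm() in threshold.c below (illustrative helpers, not a new API):

static bool example_storm_starts(u64 history)
{
	return hweight64(history) >= STORM_BEGIN_THRESHOLD;
}

static bool example_storm_over(u64 history)
{
	return !(history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0));
}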
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
@@ -209,6 +261,8 @@ enum mca_msr {
/* Decide whether to add MCE record to MCE event pool or filter it out. */
extern bool filter_mce(struct mce *m);
+void mce_prep_record_common(struct mce *m);
+void mce_prep_record_per_cpu(unsigned int cpu, struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index c4477162c07d..dac4d64dfb2a 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -12,7 +12,7 @@
#include <linux/uaccess.h>
#include <asm/mce.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
@@ -39,20 +39,20 @@ static struct severity {
u64 mask;
u64 result;
unsigned char sev;
- unsigned char mcgmask;
- unsigned char mcgres;
+ unsigned short mcgmask;
+ unsigned short mcgres;
unsigned char ser;
unsigned char context;
unsigned char excp;
unsigned char covered;
- unsigned char cpu_model;
+ unsigned int cpu_vfm;
unsigned char cpu_minstepping;
unsigned char bank_lo, bank_hi;
char *msg;
} severities[] = {
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h
-#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s
+#define VFM_STEPPING(m, s) .cpu_vfm = m, .cpu_minstepping = s
#define KERNEL .context = IN_KERNEL
#define USER .context = IN_USER
#define KERNEL_RECOV .context = IN_KERNEL_RECOV
@@ -128,7 +128,7 @@ static struct severity {
MCESEV(
AO, "Uncorrected Patrol Scrub Error",
SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0),
- MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18)
+ VFM_STEPPING(INTEL_SKYLAKE_X, 4), BANK_RANGE(13, 18)
),
/* ignore OVER for UCNA */
@@ -174,6 +174,18 @@ static struct severity {
USER
),
MCESEV(
+ AR, "Data load error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
+ AR, "Instruction fetch error in SEAM non-root mode",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ MCGMASK(MCG_STATUS_SEAM_NR, MCG_STATUS_SEAM_NR),
+ KERNEL
+ ),
+ MCESEV(
PANIC, "Data load in unrecoverable area of kernel",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
KERNEL
@@ -290,7 +302,6 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
switch (fixup_type) {
case EX_TYPE_UACCESS:
- case EX_TYPE_COPY:
if (!copy_user)
return IN_KERNEL;
m->kflags |= MCE_IN_KERNEL_COPYIN;
@@ -386,7 +397,7 @@ static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char
continue;
if (s->excp && excp != s->excp)
continue;
- if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model)
+ if (s->cpu_vfm && boot_cpu_data.x86_vfm != s->cpu_vfm)
continue;
if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping)
continue;
diff --git a/arch/x86/kernel/cpu/mce/threshold.c b/arch/x86/kernel/cpu/mce/threshold.c
index ef4e7bb5fd88..f4a007616468 100644
--- a/arch/x86/kernel/cpu/mce/threshold.c
+++ b/arch/x86/kernel/cpu/mce/threshold.c
@@ -29,3 +29,118 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
apic_eoi();
}
+
+DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);
+
+void mce_inherit_storm(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ /*
+ * Previous CPU owning this bank had put it into storm mode,
+ * but the precise history of that storm is unknown. Assume
+ * the worst (all recent polls of the bank found a valid error
+ * logged). This will avoid the new owner prematurely declaring
+ * the storm has ended.
+ */
+ storm->banks[bank].history = ~0ull;
+ storm->banks[bank].timestamp = jiffies;
+}
+
+bool mce_get_storm_mode(void)
+{
+ return __this_cpu_read(storm_desc.poll_mode);
+}
+
+void mce_set_storm_mode(bool storm)
+{
+ __this_cpu_write(storm_desc.poll_mode, storm);
+}
+
+static void mce_handle_storm(unsigned int bank, bool on)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_handle_storm(bank, on);
+ break;
+ }
+}
+
+void cmci_storm_begin(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __set_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].in_storm_mode = true;
+
+ /*
+ * If this is the first bank on this CPU to enter storm mode
+ * start polling.
+ */
+ if (++storm->stormy_bank_count == 1)
+ mce_timer_kick(true);
+}
+
+void cmci_storm_end(unsigned int bank)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
+ storm->banks[bank].history = 0;
+ storm->banks[bank].in_storm_mode = false;
+
+ /* If no banks left in storm mode, stop polling. */
+ if (!--storm->stormy_bank_count)
+ mce_timer_kick(false);
+}
+
+void mce_track_storm(struct mce *mce)
+{
+ struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
+ unsigned long now = jiffies, delta;
+ unsigned int shift = 1;
+ u64 history = 0;
+
+ /* No tracking needed for banks that do not support CMCI */
+ if (storm->banks[mce->bank].poll_only)
+ return;
+
+ /*
+ * When a bank is in storm mode it is polled once per second and
+ * the history mask will record about the last minute of poll results.
+ * If it is not in storm mode, then the bank is only checked when
+ * there is a CMCI interrupt. Check how long it has been since
+ * this bank was last checked, and adjust the amount of "shift"
+ * to apply to history.
+ */
+ if (!storm->banks[mce->bank].in_storm_mode) {
+ delta = now - storm->banks[mce->bank].timestamp;
+ shift = (delta + HZ) / HZ;
+ }
+
+ /* If it has been a long time since the last poll, clear history. */
+ if (shift < NUM_HISTORY_BITS)
+ history = storm->banks[mce->bank].history << shift;
+
+ storm->banks[mce->bank].timestamp = now;
+
+ /* History keeps track of corrected errors. VAL=1 && UC=0 */
+ if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
+ history |= 1;
+
+ storm->banks[mce->bank].history = history;
+
+ if (storm->banks[mce->bank].in_storm_mode) {
+ if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, false);
+ cmci_storm_end(mce->bank);
+ } else {
+ if (hweight64(history) < STORM_BEGIN_THRESHOLD)
+ return;
+ printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
+ mce_handle_storm(mce->bank, true);
+ cmci_storm_begin(mce->bank);
+ }
+}
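A worked example of the shift bookkeeping above, under assumed timing (the numbers are illustrative): a bank that is not in storm mode last logged an event 3 seconds ago, so delta = 3 * HZ and shift = (delta + HZ) / HZ = 4; the history slides left by four error-free poll slots before bit 0 is set for the new corrected error.

	/* illustrative: 3 seconds since the last CMCI on this bank, not storming */
	delta   = 3 * HZ;
	shift   = (delta + HZ) / HZ;				/* == 4 */
	history = storm->banks[mce->bank].history << shift;	/* four empty poll slots */
	history |= 1;						/* this event: VAL=1 && UC=0 */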
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 13b45b9c806d..138689b8e1d8 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -23,17 +23,22 @@
#include <linux/earlycpio.h>
#include <linux/firmware.h>
+#include <linux/bsearch.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/initrd.h>
#include <linux/kernel.h>
#include <linux/pci.h>
+#include <crypto/sha2.h>
+
#include <asm/microcode.h>
#include <asm/processor.h>
+#include <asm/cmdline.h>
#include <asm/setup.h>
#include <asm/cpu.h>
#include <asm/msr.h>
+#include <asm/tlb.h>
#include "internal.h"
@@ -84,13 +89,36 @@ struct microcode_amd {
unsigned int mpb[];
};
-#define PATCH_MAX_SIZE (3 * PAGE_SIZE)
-
static struct equiv_cpu_table {
unsigned int num_entries;
struct equiv_cpu_entry *entry;
} equiv_table;
+union zen_patch_rev {
+ struct {
+ __u32 rev : 8,
+ stepping : 4,
+ model : 4,
+ __reserved : 4,
+ ext_model : 4,
+ ext_fam : 8;
+ };
+ __u32 ucode_rev;
+};
+
+union cpuid_1_eax {
+ struct {
+ __u32 stepping : 4,
+ model : 4,
+ family : 4,
+ __reserved0 : 4,
+ ext_model : 4,
+ ext_fam : 8,
+ __reserved1 : 4;
+ };
+ __u32 full;
+};
+
/*
* This points to the current valid container of microcode patches which we will
* save from the initrd/builtin before jettisoning its contents. @mc is the
@@ -98,7 +126,6 @@ static struct equiv_cpu_table {
*/
struct cont_desc {
struct microcode_amd *mc;
- u32 cpuid_1_eax;
u32 psize;
u8 *data;
size_t size;
@@ -111,10 +138,149 @@ struct cont_desc {
static const char
ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
+/*
+ * This is CPUID(1).EAX on the BSP. It is used in two ways:
+ *
+ * 1. To ignore the equivalence table on Zen1 and newer.
+ *
+ * 2. To match which patches to load because the patch revision ID
+ * already contains the f/m/s for which the microcode is destined
+ * for.
+ */
+static u32 bsp_cpuid_1_eax __ro_after_init;
+
+static bool sha_check = true;
+
+struct patch_digest {
+ u32 patch_id;
+ u8 sha256[SHA256_DIGEST_SIZE];
+};
+
+#include "amd_shas.c"
+
+static int cmp_id(const void *key, const void *elem)
+{
+ struct patch_digest *pd = (struct patch_digest *)elem;
+ u32 patch_id = *(u32 *)key;
+
+ if (patch_id == pd->patch_id)
+ return 0;
+ else if (patch_id < pd->patch_id)
+ return -1;
+ else
+ return 1;
+}
+
+static bool need_sha_check(u32 cur_rev)
+{
+ switch (cur_rev >> 8) {
+ case 0x80012: return cur_rev <= 0x800126f; break;
+ case 0x80082: return cur_rev <= 0x800820f; break;
+ case 0x83010: return cur_rev <= 0x830107c; break;
+ case 0x86001: return cur_rev <= 0x860010e; break;
+ case 0x86081: return cur_rev <= 0x8608108; break;
+ case 0x87010: return cur_rev <= 0x8701034; break;
+ case 0x8a000: return cur_rev <= 0x8a0000a; break;
+ case 0xa0010: return cur_rev <= 0xa00107a; break;
+ case 0xa0011: return cur_rev <= 0xa0011da; break;
+ case 0xa0012: return cur_rev <= 0xa001243; break;
+ case 0xa0082: return cur_rev <= 0xa00820e; break;
+ case 0xa1011: return cur_rev <= 0xa101153; break;
+ case 0xa1012: return cur_rev <= 0xa10124e; break;
+ case 0xa1081: return cur_rev <= 0xa108109; break;
+ case 0xa2010: return cur_rev <= 0xa20102f; break;
+ case 0xa2012: return cur_rev <= 0xa201212; break;
+ case 0xa4041: return cur_rev <= 0xa404109; break;
+ case 0xa5000: return cur_rev <= 0xa500013; break;
+ case 0xa6012: return cur_rev <= 0xa60120a; break;
+ case 0xa7041: return cur_rev <= 0xa704109; break;
+ case 0xa7052: return cur_rev <= 0xa705208; break;
+ case 0xa7080: return cur_rev <= 0xa708009; break;
+ case 0xa70c0: return cur_rev <= 0xa70C009; break;
+ case 0xaa001: return cur_rev <= 0xaa00116; break;
+ case 0xaa002: return cur_rev <= 0xaa00218; break;
+ default: break;
+ }
+
+ pr_info("You should not be seeing this. Please send the following couple of lines to x86-<at>-kernel.org\n");
+ pr_info("CPUID(1).EAX: 0x%x, current revision: 0x%x\n", bsp_cpuid_1_eax, cur_rev);
+ return true;
+}
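An illustrative lookup using a value from the table itself: a part running microcode revision 0x08701030 has cur_rev >> 8 == 0x87010, and 0x8701030 <= 0x8701034, so the function returns true and the SHA-256 digest of any incoming patch will be checked.

	if (need_sha_check(0x08701030))	/* hits "case 0x87010", 0x8701030 <= 0x8701034 */
		/* digest verification required before applying a new patch */;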
+
+static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsigned int len)
+{
+ struct patch_digest *pd = NULL;
+ u8 digest[SHA256_DIGEST_SIZE];
+ struct sha256_state s;
+ int i;
+
+ if (x86_family(bsp_cpuid_1_eax) < 0x17 ||
+ x86_family(bsp_cpuid_1_eax) > 0x19)
+ return true;
+
+ if (!need_sha_check(cur_rev))
+ return true;
+
+ if (!sha_check)
+ return true;
+
+ pd = bsearch(&patch_id, phashes, ARRAY_SIZE(phashes), sizeof(struct patch_digest), cmp_id);
+ if (!pd) {
+ pr_err("No sha256 digest for patch ID: 0x%x found\n", patch_id);
+ return false;
+ }
+
+ sha256_init(&s);
+ sha256_update(&s, data, len);
+ sha256_final(&s, digest);
+
+ if (memcmp(digest, pd->sha256, sizeof(digest))) {
+ pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id);
+
+ for (i = 0; i < SHA256_DIGEST_SIZE; i++)
+ pr_cont("0x%x ", digest[i]);
+ pr_info("\n");
+
+ return false;
+ }
+
+ return true;
+}
+
+static u32 get_patch_level(void)
+{
+ u32 rev, dummy __always_unused;
+
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ return rev;
+}
+
+static union cpuid_1_eax ucode_rev_to_cpuid(unsigned int val)
+{
+ union zen_patch_rev p;
+ union cpuid_1_eax c;
+
+ p.ucode_rev = val;
+ c.full = 0;
+
+ c.stepping = p.stepping;
+ c.model = p.model;
+ c.ext_model = p.ext_model;
+ c.family = 0xf;
+ c.ext_fam = p.ext_fam;
+
+ return c;
+}
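A worked decode under the bitfield layouts above (the patch ID is taken from the SHA table earlier in this file; the arithmetic is illustrative): revision 0x0a201212 carries ext_fam=0x0a, ext_model=0x2, model=0x1, stepping=0x2, so the reconstructed CPUID(1).EAX is 0x00a20f12.

	union cpuid_1_eax c = ucode_rev_to_cpuid(0x0a201212);

	/* c.full == 0x00a20f12: family 0xf + 0xa = 0x19, model (0x2 << 4) | 0x1 = 0x21, stepping 2 */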
+
static u16 find_equiv_id(struct equiv_cpu_table *et, u32 sig)
{
unsigned int i;
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return 0;
+
if (!et || !et->num_entries)
return 0;
@@ -161,6 +327,10 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
if (!verify_container(buf, buf_size))
return false;
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return true;
+
cont_type = hdr[1];
if (cont_type != UCODE_EQUIV_CPU_TABLE_TYPE) {
pr_debug("Wrong microcode container equivalence table type: %u.\n",
@@ -187,8 +357,7 @@ static bool verify_equivalence_table(const u8 *buf, size_t buf_size)
* On success, @sh_psize returns the patch size according to the section header,
* to the caller.
*/
-static bool
-__verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
+static bool __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
{
u32 p_type, p_size;
const u32 *hdr;
@@ -224,12 +393,13 @@ __verify_patch_section(const u8 *buf, size_t buf_size, u32 *sh_psize)
* exceed the per-family maximum). @sh_psize is the size read from the section
* header.
*/
-static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size)
+static bool __verify_patch_size(u32 sh_psize, size_t buf_size)
{
+ u8 family = x86_family(bsp_cpuid_1_eax);
u32 max_size;
if (family >= 0x15)
- return min_t(u32, sh_psize, buf_size);
+ goto ret;
#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
@@ -243,13 +413,15 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size
break;
default:
WARN(1, "%s: WTF family: 0x%x\n", __func__, family);
- return 0;
+ return false;
}
- if (sh_psize > min_t(u32, buf_size, max_size))
- return 0;
+ if (sh_psize > max_size)
+ return false;
- return sh_psize;
+ret:
+ /* Working with the whole buffer so < is ok. */
+ return sh_psize <= buf_size;
}
/*
@@ -260,11 +432,10 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size
* positive: patch is not for this family, skip it
* 0: success
*/
-static int
-verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
+static int verify_patch(const u8 *buf, size_t buf_size, u32 *patch_size)
{
+ u8 family = x86_family(bsp_cpuid_1_eax);
struct microcode_header_amd *mc_hdr;
- unsigned int ret;
u32 sh_psize;
u16 proc_id;
u8 patch_fam;
@@ -288,8 +459,7 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
return -1;
}
- ret = __verify_patch_size(family, sh_psize, buf_size);
- if (!ret) {
+ if (!__verify_patch_size(sh_psize, buf_size)) {
pr_debug("Per-family patch size mismatch.\n");
return -1;
}
@@ -310,10 +480,19 @@ verify_patch(u8 family, const u8 *buf, size_t buf_size, u32 *patch_size)
return 0;
}
+static bool mc_patch_matches(struct microcode_amd *mc, u16 eq_id)
+{
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return ucode_rev_to_cpuid(mc->hdr.patch_id).full == bsp_cpuid_1_eax;
+ else
+ return eq_id == mc->hdr.processor_rev_id;
+}
+
/*
* This scans the ucode blob for the proper container as we can have multiple
- * containers glued together. Returns the equivalence ID from the equivalence
- * table or 0 if none found.
+ * containers glued together.
+ *
* Returns the amount of bytes consumed while scanning. @desc contains all the
* data we're going to use in later stages of the application.
*/
@@ -338,7 +517,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
* doesn't contain a patch for the CPU, scan through the whole container
* so that it can be skipped in case there are other containers appended.
*/
- eq_id = find_equiv_id(&table, desc->cpuid_1_eax);
+ eq_id = find_equiv_id(&table, bsp_cpuid_1_eax);
buf += hdr[2] + CONTAINER_HDR_SZ;
size -= hdr[2] + CONTAINER_HDR_SZ;
@@ -352,7 +531,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
u32 patch_size;
int ret;
- ret = verify_patch(x86_family(desc->cpuid_1_eax), buf, size, &patch_size);
+ ret = verify_patch(buf, size, &patch_size);
if (ret < 0) {
/*
* Patch verification failed, skip to the next container, if
@@ -365,7 +544,7 @@ static size_t parse_container(u8 *ucode, size_t size, struct cont_desc *desc)
}
mc = (struct microcode_amd *)(buf + SECTION_HDR_SIZE);
- if (eq_id == mc->hdr.processor_rev_id) {
+ if (mc_patch_matches(mc, eq_id)) {
desc->psize = patch_size;
desc->mc = mc;
}
@@ -415,59 +594,41 @@ static void scan_containers(u8 *ucode, size_t size, struct cont_desc *desc)
}
}
-static int __apply_microcode_amd(struct microcode_amd *mc)
+static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev,
+ unsigned int psize)
{
- u32 rev, dummy;
-
- native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc->hdr.data_code);
+ unsigned long p_addr = (unsigned long)&mc->hdr.data_code;
- /* verify patch application was successful */
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- if (rev != mc->hdr.patch_id)
+ if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize))
return -1;
- return 0;
-}
-
-/*
- * Early load occurs before we can vmalloc(). So we look for the microcode
- * patch container file in initrd, traverse equivalent cpu table, look for a
- * matching microcode patch, and update, all in initrd memory in place.
- * When vmalloc() is available for use later -- on 64-bit during first AP load,
- * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
- * load_microcode_amd() to save equivalent cpu table and microcode patches in
- * kernel heap memory.
- *
- * Returns true if container found (sets @desc), false otherwise.
- */
-static bool early_apply_microcode(u32 cpuid_1_eax, u32 old_rev, void *ucode, size_t size)
-{
- struct cont_desc desc = { 0 };
- struct microcode_amd *mc;
- bool ret = false;
+ native_wrmsrl(MSR_AMD64_PATCH_LOADER, p_addr);
- desc.cpuid_1_eax = cpuid_1_eax;
+ if (x86_family(bsp_cpuid_1_eax) == 0x17) {
+ unsigned long p_addr_end = p_addr + psize - 1;
- scan_containers(ucode, size, &desc);
+ invlpg(p_addr);
- mc = desc.mc;
- if (!mc)
- return ret;
+ /*
+ * Flush next page too if patch image is crossing a page
+ * boundary.
+ */
+ if (p_addr >> PAGE_SHIFT != p_addr_end >> PAGE_SHIFT)
+ invlpg(p_addr_end);
+ }
- /*
- * Allow application of the same revision to pick up SMT-specific
- * changes even if the revision of the other SMT thread is already
- * up-to-date.
- */
- if (old_rev > mc->hdr.patch_id)
- return ret;
+ /* verify patch application was successful */
+ *cur_rev = get_patch_level();
+ if (*cur_rev != mc->hdr.patch_id)
+ return false;
- return !__apply_microcode_amd(mc);
+ return true;
}
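
Illustration of the family 0x17 flush above: with 4K pages (PAGE_SHIFT == 12), a patch image at p_addr = 0x1000ff0 with psize = 0x40 ends at p_addr_end = 0x100102f; 0x1000ff0 >> 12 is 0x1000 but 0x100102f >> 12 is 0x1001, so the image crosses a page boundary and both pages are flushed with invlpg(). An image that fits within a single page triggers only the first invlpg().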
-static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
+static bool get_builtin_microcode(struct cpio_data *cp)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
+ u8 family = x86_family(bsp_cpuid_1_eax);
struct firmware fw;
if (IS_ENABLED(CONFIG_X86_32))
@@ -486,85 +647,144 @@ static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
return false;
}
-static void __init find_blobs_in_containers(unsigned int cpuid_1_eax, struct cpio_data *ret)
+static bool __init find_blobs_in_containers(struct cpio_data *ret)
{
struct cpio_data cp;
+ bool found;
- if (!get_builtin_microcode(&cp, x86_family(cpuid_1_eax)))
+ if (!get_builtin_microcode(&cp))
cp = find_microcode_in_initrd(ucode_path);
- *ret = cp;
+ found = cp.data && cp.size;
+ if (found)
+ *ret = cp;
+
+ return found;
}
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ */
void __init load_ucode_amd_bsp(struct early_load_data *ed, unsigned int cpuid_1_eax)
{
+ struct cont_desc desc = { };
+ struct microcode_amd *mc;
struct cpio_data cp = { };
- u32 dummy;
+ char buf[4];
+ u32 rev;
+
+ if (cmdline_find_option(boot_command_line, "microcode.amd_sha_check", buf, 4)) {
+ if (!strncmp(buf, "off", 3)) {
+ sha_check = false;
+ pr_warn_once("It is a very very bad idea to disable the blobs SHA check!\n");
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+ }
+ }
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->old_rev, dummy);
+ bsp_cpuid_1_eax = cpuid_1_eax;
+
+ rev = get_patch_level();
+ ed->old_rev = rev;
/* Needed in load_microcode_amd() */
ucode_cpu_info[0].cpu_sig.sig = cpuid_1_eax;
- find_blobs_in_containers(cpuid_1_eax, &cp);
- if (!(cp.data && cp.size))
+ if (!find_blobs_in_containers(&cp))
return;
- if (early_apply_microcode(cpuid_1_eax, ed->old_rev, cp.data, cp.size))
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, ed->new_rev, dummy);
-}
-
-static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size);
-
-static int __init save_microcode_in_initrd(void)
-{
- unsigned int cpuid_1_eax = native_cpuid_eax(1);
- struct cpuinfo_x86 *c = &boot_cpu_data;
- struct cont_desc desc = { 0 };
- enum ucode_state ret;
- struct cpio_data cp;
-
- if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
- return 0;
+ scan_containers(cp.data, cp.size, &desc);
- find_blobs_in_containers(cpuid_1_eax, &cp);
- if (!(cp.data && cp.size))
- return -EINVAL;
+ mc = desc.mc;
+ if (!mc)
+ return;
- desc.cpuid_1_eax = cpuid_1_eax;
+ /*
+ * Allow application of the same revision to pick up SMT-specific
+ * changes even if the revision of the other SMT thread is already
+ * up-to-date.
+ */
+ if (ed->old_rev > mc->hdr.patch_id)
+ return;
- scan_containers(cp.data, cp.size, &desc);
- if (!desc.mc)
- return -EINVAL;
+ if (__apply_microcode_amd(mc, &rev, desc.psize))
+ ed->new_rev = rev;
+}
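
Usage note for the command-line handling at the top of load_ucode_amd_bsp(): the SHA check can only be relaxed explicitly, e.g. by booting with microcode.amd_sha_check=off, which prints the warning and taints the kernel with TAINT_CPU_OUT_OF_SPEC; no other value is recognized, and sha_check is presumed to default to true elsewhere in this series.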
- ret = load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
- if (ret > UCODE_UPDATED)
- return -EINVAL;
+static inline bool patch_cpus_equivalent(struct ucode_patch *p,
+ struct ucode_patch *n,
+ bool ignore_stepping)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union cpuid_1_eax p_cid = ucode_rev_to_cpuid(p->patch_id);
+ union cpuid_1_eax n_cid = ucode_rev_to_cpuid(n->patch_id);
+
+ if (ignore_stepping) {
+ p_cid.stepping = 0;
+ n_cid.stepping = 0;
+ }
- return 0;
+ return p_cid.full == n_cid.full;
+ } else {
+ return p->equiv_cpu == n->equiv_cpu;
+ }
}
-early_initcall(save_microcode_in_initrd);
/*
* a small, trivial cache of per-family ucode patches
*/
-static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
+static struct ucode_patch *cache_find_patch(struct ucode_cpu_info *uci, u16 equiv_cpu)
{
struct ucode_patch *p;
+ struct ucode_patch n;
+
+ n.equiv_cpu = equiv_cpu;
+ n.patch_id = uci->cpu_sig.rev;
+
+ WARN_ON_ONCE(!n.patch_id);
list_for_each_entry(p, &microcode_cache, plist)
- if (p->equiv_cpu == equiv_cpu)
+ if (patch_cpus_equivalent(p, &n, false))
return p;
+
return NULL;
}
+static inline int patch_newer(struct ucode_patch *p, struct ucode_patch *n)
+{
+ /* Zen and newer hardcode the f/m/s in the patch ID */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17) {
+ union zen_patch_rev zp, zn;
+
+ zp.ucode_rev = p->patch_id;
+ zn.ucode_rev = n->patch_id;
+
+ if (zn.stepping != zp.stepping)
+ return -1;
+
+ return zn.rev > zp.rev;
+ } else {
+ return n->patch_id > p->patch_id;
+ }
+}
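
Concretely, again assuming the zen_patch_rev layout: patch IDs 0xa001234 and 0xa001236 from the hash table share the stepping nibble (bits 11:8 are 0x2 in both), so patch_newer() compares the low revision bytes 0x34 and 0x36 and reports the second as newer; had the stepping nibbles differed, it would return -1, which update_cache() treats as "not comparable" and keeps scanning instead of replacing the cached patch.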
+
static void update_cache(struct ucode_patch *new_patch)
{
struct ucode_patch *p;
+ int ret;
list_for_each_entry(p, &microcode_cache, plist) {
- if (p->equiv_cpu == new_patch->equiv_cpu) {
- if (p->patch_id >= new_patch->patch_id) {
+ if (patch_cpus_equivalent(p, new_patch, true)) {
+ ret = patch_newer(p, new_patch);
+ if (ret < 0)
+ continue;
+ else if (!ret) {
/* we already have the latest patch */
kfree(new_patch->data);
kfree(new_patch);
@@ -595,13 +815,17 @@ static void free_cache(void)
static struct ucode_patch *find_patch(unsigned int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- u16 equiv_id;
+ u16 equiv_id = 0;
- equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
- if (!equiv_id)
- return NULL;
+ uci->cpu_sig.rev = get_patch_level();
+
+ if (x86_family(bsp_cpuid_1_eax) < 0x17) {
+ equiv_id = find_equiv_id(&equiv_table, uci->cpu_sig.sig);
+ if (!equiv_id)
+ return NULL;
+ }
- return cache_find_patch(equiv_id);
+ return cache_find_patch(uci, equiv_id);
}
void reload_ucode_amd(unsigned int cpu)
@@ -616,22 +840,20 @@ void reload_ucode_amd(unsigned int cpu)
mc = p->data;
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
+ rev = get_patch_level();
if (rev < mc->hdr.patch_id) {
- if (!__apply_microcode_amd(mc))
- pr_info_once("reload revision: 0x%08x\n", mc->hdr.patch_id);
+ if (__apply_microcode_amd(mc, &rev, p->size))
+ pr_info_once("reload revision: 0x%08x\n", rev);
}
}
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct ucode_patch *p;
csig->sig = cpuid_eax(0x00000001);
- csig->rev = c->microcode;
+ csig->rev = get_patch_level();
/*
* a patch could have been loaded early, set uci->mc so that
@@ -651,7 +873,7 @@ static enum ucode_state apply_microcode_amd(int cpu)
struct ucode_cpu_info *uci;
struct ucode_patch *p;
enum ucode_state ret;
- u32 rev, dummy __always_unused;
+ u32 rev;
BUG_ON(raw_smp_processor_id() != cpu);
@@ -661,18 +883,18 @@ static enum ucode_state apply_microcode_amd(int cpu)
if (!p)
return UCODE_NFOUND;
+ rev = uci->cpu_sig.rev;
+
mc_amd = p->data;
uci->mc = p->data;
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-
/* need to apply patch? */
if (rev > mc_amd->hdr.patch_id) {
ret = UCODE_OK;
goto out;
}
- if (__apply_microcode_amd(mc_amd)) {
+ if (!__apply_microcode_amd(mc_amd, &rev, p->size)) {
pr_err("CPU%d: update failed for patch_level=0x%08x\n",
cpu, mc_amd->hdr.patch_id);
return UCODE_ERROR;
@@ -711,6 +933,10 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
hdr = (const u32 *)buf;
equiv_tbl_len = hdr[2];
+ /* Zen and newer do not need an equivalence table. */
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ goto out;
+
equiv_table.entry = vmalloc(equiv_tbl_len);
if (!equiv_table.entry) {
pr_err("failed to allocate equivalent CPU table\n");
@@ -720,12 +946,16 @@ static size_t install_equiv_cpu_table(const u8 *buf, size_t buf_size)
memcpy(equiv_table.entry, buf + CONTAINER_HDR_SZ, equiv_tbl_len);
equiv_table.num_entries = equiv_tbl_len / sizeof(struct equiv_cpu_entry);
+out:
/* add header length */
return equiv_tbl_len + CONTAINER_HDR_SZ;
}
static void free_equiv_cpu_table(void)
{
+ if (x86_family(bsp_cpuid_1_eax) >= 0x17)
+ return;
+
vfree(equiv_table.entry);
memset(&equiv_table, 0, sizeof(equiv_table));
}
@@ -751,7 +981,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
u16 proc_id;
int ret;
- ret = verify_patch(family, fw, leftover, patch_size);
+ ret = verify_patch(fw, leftover, patch_size);
if (ret)
return ret;
@@ -776,7 +1006,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
patch->patch_id = mc_hdr->patch_id;
patch->equiv_cpu = proc_id;
- pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+ pr_debug("%s: Adding patch_id: 0x%08x, proc_id: 0x%04x\n",
__func__, patch->patch_id, proc_id);
/* ... and add to cache. */
@@ -786,8 +1016,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
}
/* Scan the blob in @data and add microcode patches to the cache. */
-static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
- size_t size)
+static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, size_t size)
{
u8 *fw = (u8 *)data;
size_t offset;
@@ -820,23 +1049,32 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
return UCODE_OK;
}
-static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
+static enum ucode_state _load_microcode_amd(u8 family, const u8 *data, size_t size)
{
- struct cpuinfo_x86 *c;
- unsigned int nid, cpu;
- struct ucode_patch *p;
enum ucode_state ret;
/* free old equiv table */
free_equiv_cpu_table();
ret = __load_microcode_amd(family, data, size);
- if (ret != UCODE_OK) {
+ if (ret != UCODE_OK)
cleanup();
+
+ return ret;
+}
+
+static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
+{
+ struct cpuinfo_x86 *c;
+ unsigned int nid, cpu;
+ struct ucode_patch *p;
+ enum ucode_state ret;
+
+ ret = _load_microcode_amd(family, data, size);
+ if (ret != UCODE_OK)
return ret;
- }
- for_each_node(nid) {
+ for_each_node_with_cpus(nid) {
cpu = cpumask_first(cpumask_of_node(nid));
c = &cpu_data(cpu);
@@ -853,6 +1091,32 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz
return ret;
}
+static int __init save_microcode_in_initrd(void)
+{
+ unsigned int cpuid_1_eax = native_cpuid_eax(1);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ struct cont_desc desc = { 0 };
+ enum ucode_state ret;
+ struct cpio_data cp;
+
+ if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10)
+ return 0;
+
+ if (!find_blobs_in_containers(&cp))
+ return -EINVAL;
+
+ scan_containers(cp.data, cp.size, &desc);
+ if (!desc.mc)
+ return -EINVAL;
+
+ ret = _load_microcode_amd(x86_family(cpuid_1_eax), desc.data, desc.size);
+ if (ret > UCODE_UPDATED)
+ return -EINVAL;
+
+ return 0;
+}
+early_initcall(save_microcode_in_initrd);
+
/*
* AMD microcode firmware naming convention, up to family 15h they are in
* the legacy file:
diff --git a/arch/x86/kernel/cpu/microcode/amd_shas.c b/arch/x86/kernel/cpu/microcode/amd_shas.c
new file mode 100644
index 000000000000..2a1655b1fdd8
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/amd_shas.c
@@ -0,0 +1,444 @@
+/* Keep 'em sorted. */
+static const struct patch_digest phashes[] = {
+ { 0x8001227, {
+ 0x99,0xc0,0x9b,0x2b,0xcc,0x9f,0x52,0x1b,
+ 0x1a,0x5f,0x1d,0x83,0xa1,0x6c,0xc4,0x46,
+ 0xe2,0x6c,0xda,0x73,0xfb,0x2d,0x23,0xa8,
+ 0x77,0xdc,0x15,0x31,0x33,0x4a,0x46,0x18,
+ }
+ },
+ { 0x8001250, {
+ 0xc0,0x0b,0x6b,0x19,0xfd,0x5c,0x39,0x60,
+ 0xd5,0xc3,0x57,0x46,0x54,0xe4,0xd1,0xaa,
+ 0xa8,0xf7,0x1f,0xa8,0x6a,0x60,0x3e,0xe3,
+ 0x27,0x39,0x8e,0x53,0x30,0xf8,0x49,0x19,
+ }
+ },
+ { 0x800126e, {
+ 0xf3,0x8b,0x2b,0xb6,0x34,0xe3,0xc8,0x2c,
+ 0xef,0xec,0x63,0x6d,0xc8,0x76,0x77,0xb3,
+ 0x25,0x5a,0xb7,0x52,0x8c,0x83,0x26,0xe6,
+ 0x4c,0xbe,0xbf,0xe9,0x7d,0x22,0x6a,0x43,
+ }
+ },
+ { 0x800126f, {
+ 0x2b,0x5a,0xf2,0x9c,0xdd,0xd2,0x7f,0xec,
+ 0xec,0x96,0x09,0x57,0xb0,0x96,0x29,0x8b,
+ 0x2e,0x26,0x91,0xf0,0x49,0x33,0x42,0x18,
+ 0xdd,0x4b,0x65,0x5a,0xd4,0x15,0x3d,0x33,
+ }
+ },
+ { 0x800820d, {
+ 0x68,0x98,0x83,0xcd,0x22,0x0d,0xdd,0x59,
+ 0x73,0x2c,0x5b,0x37,0x1f,0x84,0x0e,0x67,
+ 0x96,0x43,0x83,0x0c,0x46,0x44,0xab,0x7c,
+ 0x7b,0x65,0x9e,0x57,0xb5,0x90,0x4b,0x0e,
+ }
+ },
+ { 0x8301025, {
+ 0xe4,0x7d,0xdb,0x1e,0x14,0xb4,0x5e,0x36,
+ 0x8f,0x3e,0x48,0x88,0x3c,0x6d,0x76,0xa1,
+ 0x59,0xc6,0xc0,0x72,0x42,0xdf,0x6c,0x30,
+ 0x6f,0x0b,0x28,0x16,0x61,0xfc,0x79,0x77,
+ }
+ },
+ { 0x8301055, {
+ 0x81,0x7b,0x99,0x1b,0xae,0x2d,0x4f,0x9a,
+ 0xef,0x13,0xce,0xb5,0x10,0xaf,0x6a,0xea,
+ 0xe5,0xb0,0x64,0x98,0x10,0x68,0x34,0x3b,
+ 0x9d,0x7a,0xd6,0x22,0x77,0x5f,0xb3,0x5b,
+ }
+ },
+ { 0x8301072, {
+ 0xcf,0x76,0xa7,0x1a,0x49,0xdf,0x2a,0x5e,
+ 0x9e,0x40,0x70,0xe5,0xdd,0x8a,0xa8,0x28,
+ 0x20,0xdc,0x91,0xd8,0x2c,0xa6,0xa0,0xb1,
+ 0x2d,0x22,0x26,0x94,0x4b,0x40,0x85,0x30,
+ }
+ },
+ { 0x830107a, {
+ 0x2a,0x65,0x8c,0x1a,0x5e,0x07,0x21,0x72,
+ 0xdf,0x90,0xa6,0x51,0x37,0xd3,0x4b,0x34,
+ 0xc4,0xda,0x03,0xe1,0x8a,0x6c,0xfb,0x20,
+ 0x04,0xb2,0x81,0x05,0xd4,0x87,0xf4,0x0a,
+ }
+ },
+ { 0x830107b, {
+ 0xb3,0x43,0x13,0x63,0x56,0xc1,0x39,0xad,
+ 0x10,0xa6,0x2b,0xcc,0x02,0xe6,0x76,0x2a,
+ 0x1e,0x39,0x58,0x3e,0x23,0x6e,0xa4,0x04,
+ 0x95,0xea,0xf9,0x6d,0xc2,0x8a,0x13,0x19,
+ }
+ },
+ { 0x830107c, {
+ 0x21,0x64,0xde,0xfb,0x9f,0x68,0x96,0x47,
+ 0x70,0x5c,0xe2,0x8f,0x18,0x52,0x6a,0xac,
+ 0xa4,0xd2,0x2e,0xe0,0xde,0x68,0x66,0xc3,
+ 0xeb,0x1e,0xd3,0x3f,0xbc,0x51,0x1d,0x38,
+ }
+ },
+ { 0x860010d, {
+ 0x86,0xb6,0x15,0x83,0xbc,0x3b,0x9c,0xe0,
+ 0xb3,0xef,0x1d,0x99,0x84,0x35,0x15,0xf7,
+ 0x7c,0x2a,0xc6,0x42,0xdb,0x73,0x07,0x5c,
+ 0x7d,0xc3,0x02,0xb5,0x43,0x06,0x5e,0xf8,
+ }
+ },
+ { 0x8608108, {
+ 0x14,0xfe,0x57,0x86,0x49,0xc8,0x68,0xe2,
+ 0x11,0xa3,0xcb,0x6e,0xff,0x6e,0xd5,0x38,
+ 0xfe,0x89,0x1a,0xe0,0x67,0xbf,0xc4,0xcc,
+ 0x1b,0x9f,0x84,0x77,0x2b,0x9f,0xaa,0xbd,
+ }
+ },
+ { 0x8701034, {
+ 0xc3,0x14,0x09,0xa8,0x9c,0x3f,0x8d,0x83,
+ 0x9b,0x4c,0xa5,0xb7,0x64,0x8b,0x91,0x5d,
+ 0x85,0x6a,0x39,0x26,0x1e,0x14,0x41,0xa8,
+ 0x75,0xea,0xa6,0xf9,0xc9,0xd1,0xea,0x2b,
+ }
+ },
+ { 0x8a00008, {
+ 0xd7,0x2a,0x93,0xdc,0x05,0x2f,0xa5,0x6e,
+ 0x0c,0x61,0x2c,0x07,0x9f,0x38,0xe9,0x8e,
+ 0xef,0x7d,0x2a,0x05,0x4d,0x56,0xaf,0x72,
+ 0xe7,0x56,0x47,0x6e,0x60,0x27,0xd5,0x8c,
+ }
+ },
+ { 0x8a0000a, {
+ 0x73,0x31,0x26,0x22,0xd4,0xf9,0xee,0x3c,
+ 0x07,0x06,0xe7,0xb9,0xad,0xd8,0x72,0x44,
+ 0x33,0x31,0xaa,0x7d,0xc3,0x67,0x0e,0xdb,
+ 0x47,0xb5,0xaa,0xbc,0xf5,0xbb,0xd9,0x20,
+ }
+ },
+ { 0xa00104c, {
+ 0x3c,0x8a,0xfe,0x04,0x62,0xd8,0x6d,0xbe,
+ 0xa7,0x14,0x28,0x64,0x75,0xc0,0xa3,0x76,
+ 0xb7,0x92,0x0b,0x97,0x0a,0x8e,0x9c,0x5b,
+ 0x1b,0xc8,0x9d,0x3a,0x1e,0x81,0x3d,0x3b,
+ }
+ },
+ { 0xa00104e, {
+ 0xc4,0x35,0x82,0x67,0xd2,0x86,0xe5,0xb2,
+ 0xfd,0x69,0x12,0x38,0xc8,0x77,0xba,0xe0,
+ 0x70,0xf9,0x77,0x89,0x10,0xa6,0x74,0x4e,
+ 0x56,0x58,0x13,0xf5,0x84,0x70,0x28,0x0b,
+ }
+ },
+ { 0xa001053, {
+ 0x92,0x0e,0xf4,0x69,0x10,0x3b,0xf9,0x9d,
+ 0x31,0x1b,0xa6,0x99,0x08,0x7d,0xd7,0x25,
+ 0x7e,0x1e,0x89,0xba,0x35,0x8d,0xac,0xcb,
+ 0x3a,0xb4,0xdf,0x58,0x12,0xcf,0xc0,0xc3,
+ }
+ },
+ { 0xa001058, {
+ 0x33,0x7d,0xa9,0xb5,0x4e,0x62,0x13,0x36,
+ 0xef,0x66,0xc9,0xbd,0x0a,0xa6,0x3b,0x19,
+ 0xcb,0xf5,0xc2,0xc3,0x55,0x47,0x20,0xec,
+ 0x1f,0x7b,0xa1,0x44,0x0e,0x8e,0xa4,0xb2,
+ }
+ },
+ { 0xa001075, {
+ 0x39,0x02,0x82,0xd0,0x7c,0x26,0x43,0xe9,
+ 0x26,0xa3,0xd9,0x96,0xf7,0x30,0x13,0x0a,
+ 0x8a,0x0e,0xac,0xe7,0x1d,0xdc,0xe2,0x0f,
+ 0xcb,0x9e,0x8d,0xbc,0xd2,0xa2,0x44,0xe0,
+ }
+ },
+ { 0xa001078, {
+ 0x2d,0x67,0xc7,0x35,0xca,0xef,0x2f,0x25,
+ 0x4c,0x45,0x93,0x3f,0x36,0x01,0x8c,0xce,
+ 0xa8,0x5b,0x07,0xd3,0xc1,0x35,0x3c,0x04,
+ 0x20,0xa2,0xfc,0xdc,0xe6,0xce,0x26,0x3e,
+ }
+ },
+ { 0xa001079, {
+ 0x43,0xe2,0x05,0x9c,0xfd,0xb7,0x5b,0xeb,
+ 0x5b,0xe9,0xeb,0x3b,0x96,0xf4,0xe4,0x93,
+ 0x73,0x45,0x3e,0xac,0x8d,0x3b,0xe4,0xdb,
+ 0x10,0x31,0xc1,0xe4,0xa2,0xd0,0x5a,0x8a,
+ }
+ },
+ { 0xa00107a, {
+ 0x5f,0x92,0xca,0xff,0xc3,0x59,0x22,0x5f,
+ 0x02,0xa0,0x91,0x3b,0x4a,0x45,0x10,0xfd,
+ 0x19,0xe1,0x8a,0x6d,0x9a,0x92,0xc1,0x3f,
+ 0x75,0x78,0xac,0x78,0x03,0x1d,0xdb,0x18,
+ }
+ },
+ { 0xa001143, {
+ 0x56,0xca,0xf7,0x43,0x8a,0x4c,0x46,0x80,
+ 0xec,0xde,0xe5,0x9c,0x50,0x84,0x9a,0x42,
+ 0x27,0xe5,0x51,0x84,0x8f,0x19,0xc0,0x8d,
+ 0x0c,0x25,0xb4,0xb0,0x8f,0x10,0xf3,0xf8,
+ }
+ },
+ { 0xa001144, {
+ 0x42,0xd5,0x9b,0xa7,0xd6,0x15,0x29,0x41,
+ 0x61,0xc4,0x72,0x3f,0xf3,0x06,0x78,0x4b,
+ 0x65,0xf3,0x0e,0xfa,0x9c,0x87,0xde,0x25,
+ 0xbd,0xb3,0x9a,0xf4,0x75,0x13,0x53,0xdc,
+ }
+ },
+ { 0xa00115d, {
+ 0xd4,0xc4,0x49,0x36,0x89,0x0b,0x47,0xdd,
+ 0xfb,0x2f,0x88,0x3b,0x5f,0xf2,0x8e,0x75,
+ 0xc6,0x6c,0x37,0x5a,0x90,0x25,0x94,0x3e,
+ 0x36,0x9c,0xae,0x02,0x38,0x6c,0xf5,0x05,
+ }
+ },
+ { 0xa001173, {
+ 0x28,0xbb,0x9b,0xd1,0xa0,0xa0,0x7e,0x3a,
+ 0x59,0x20,0xc0,0xa9,0xb2,0x5c,0xc3,0x35,
+ 0x53,0x89,0xe1,0x4c,0x93,0x2f,0x1d,0xc3,
+ 0xe5,0xf7,0xf3,0xc8,0x9b,0x61,0xaa,0x9e,
+ }
+ },
+ { 0xa0011a8, {
+ 0x97,0xc6,0x16,0x65,0x99,0xa4,0x85,0x3b,
+ 0xf6,0xce,0xaa,0x49,0x4a,0x3a,0xc5,0xb6,
+ 0x78,0x25,0xbc,0x53,0xaf,0x5d,0xcf,0xf4,
+ 0x23,0x12,0xbb,0xb1,0xbc,0x8a,0x02,0x2e,
+ }
+ },
+ { 0xa0011ce, {
+ 0xcf,0x1c,0x90,0xa3,0x85,0x0a,0xbf,0x71,
+ 0x94,0x0e,0x80,0x86,0x85,0x4f,0xd7,0x86,
+ 0xae,0x38,0x23,0x28,0x2b,0x35,0x9b,0x4e,
+ 0xfe,0xb8,0xcd,0x3d,0x3d,0x39,0xc9,0x6a,
+ }
+ },
+ { 0xa0011d1, {
+ 0xdf,0x0e,0xca,0xde,0xf6,0xce,0x5c,0x1e,
+ 0x4c,0xec,0xd7,0x71,0x83,0xcc,0xa8,0x09,
+ 0xc7,0xc5,0xfe,0xb2,0xf7,0x05,0xd2,0xc5,
+ 0x12,0xdd,0xe4,0xf3,0x92,0x1c,0x3d,0xb8,
+ }
+ },
+ { 0xa0011d3, {
+ 0x91,0xe6,0x10,0xd7,0x57,0xb0,0x95,0x0b,
+ 0x9a,0x24,0xee,0xf7,0xcf,0x56,0xc1,0xa6,
+ 0x4a,0x52,0x7d,0x5f,0x9f,0xdf,0xf6,0x00,
+ 0x65,0xf7,0xea,0xe8,0x2a,0x88,0xe2,0x26,
+ }
+ },
+ { 0xa0011d5, {
+ 0xed,0x69,0x89,0xf4,0xeb,0x64,0xc2,0x13,
+ 0xe0,0x51,0x1f,0x03,0x26,0x52,0x7d,0xb7,
+ 0x93,0x5d,0x65,0xca,0xb8,0x12,0x1d,0x62,
+ 0x0d,0x5b,0x65,0x34,0x69,0xb2,0x62,0x21,
+ }
+ },
+ { 0xa001223, {
+ 0xfb,0x32,0x5f,0xc6,0x83,0x4f,0x8c,0xb8,
+ 0xa4,0x05,0xf9,0x71,0x53,0x01,0x16,0xc4,
+ 0x83,0x75,0x94,0xdd,0xeb,0x7e,0xb7,0x15,
+ 0x8e,0x3b,0x50,0x29,0x8a,0x9c,0xcc,0x45,
+ }
+ },
+ { 0xa001224, {
+ 0x0e,0x0c,0xdf,0xb4,0x89,0xee,0x35,0x25,
+ 0xdd,0x9e,0xdb,0xc0,0x69,0x83,0x0a,0xad,
+ 0x26,0xa9,0xaa,0x9d,0xfc,0x3c,0xea,0xf9,
+ 0x6c,0xdc,0xd5,0x6d,0x8b,0x6e,0x85,0x4a,
+ }
+ },
+ { 0xa001227, {
+ 0xab,0xc6,0x00,0x69,0x4b,0x50,0x87,0xad,
+ 0x5f,0x0e,0x8b,0xea,0x57,0x38,0xce,0x1d,
+ 0x0f,0x75,0x26,0x02,0xf6,0xd6,0x96,0xe9,
+ 0x87,0xb9,0xd6,0x20,0x27,0x7c,0xd2,0xe0,
+ }
+ },
+ { 0xa001229, {
+ 0x7f,0x49,0x49,0x48,0x46,0xa5,0x50,0xa6,
+ 0x28,0x89,0x98,0xe2,0x9e,0xb4,0x7f,0x75,
+ 0x33,0xa7,0x04,0x02,0xe4,0x82,0xbf,0xb4,
+ 0xa5,0x3a,0xba,0x24,0x8d,0x31,0x10,0x1d,
+ }
+ },
+ { 0xa00122e, {
+ 0x56,0x94,0xa9,0x5d,0x06,0x68,0xfe,0xaf,
+ 0xdf,0x7a,0xff,0x2d,0xdf,0x74,0x0f,0x15,
+ 0x66,0xfb,0x00,0xb5,0x51,0x97,0x9b,0xfa,
+ 0xcb,0x79,0x85,0x46,0x25,0xb4,0xd2,0x10,
+ }
+ },
+ { 0xa001231, {
+ 0x0b,0x46,0xa5,0xfc,0x18,0x15,0xa0,0x9e,
+ 0xa6,0xdc,0xb7,0xff,0x17,0xf7,0x30,0x64,
+ 0xd4,0xda,0x9e,0x1b,0xc3,0xfc,0x02,0x3b,
+ 0xe2,0xc6,0x0e,0x41,0x54,0xb5,0x18,0xdd,
+ }
+ },
+ { 0xa001234, {
+ 0x88,0x8d,0xed,0xab,0xb5,0xbd,0x4e,0xf7,
+ 0x7f,0xd4,0x0e,0x95,0x34,0x91,0xff,0xcc,
+ 0xfb,0x2a,0xcd,0xf7,0xd5,0xdb,0x4c,0x9b,
+ 0xd6,0x2e,0x73,0x50,0x8f,0x83,0x79,0x1a,
+ }
+ },
+ { 0xa001236, {
+ 0x3d,0x30,0x00,0xb9,0x71,0xba,0x87,0x78,
+ 0xa8,0x43,0x55,0xc4,0x26,0x59,0xcf,0x9d,
+ 0x93,0xce,0x64,0x0e,0x8b,0x72,0x11,0x8b,
+ 0xa3,0x8f,0x51,0xe9,0xca,0x98,0xaa,0x25,
+ }
+ },
+ { 0xa001238, {
+ 0x72,0xf7,0x4b,0x0c,0x7d,0x58,0x65,0xcc,
+ 0x00,0xcc,0x57,0x16,0x68,0x16,0xf8,0x2a,
+ 0x1b,0xb3,0x8b,0xe1,0xb6,0x83,0x8c,0x7e,
+ 0xc0,0xcd,0x33,0xf2,0x8d,0xf9,0xef,0x59,
+ }
+ },
+ { 0xa00820c, {
+ 0xa8,0x0c,0x81,0xc0,0xa6,0x00,0xe7,0xf3,
+ 0x5f,0x65,0xd3,0xb9,0x6f,0xea,0x93,0x63,
+ 0xf1,0x8c,0x88,0x45,0xd7,0x82,0x80,0xd1,
+ 0xe1,0x3b,0x8d,0xb2,0xf8,0x22,0x03,0xe2,
+ }
+ },
+ { 0xa10113e, {
+ 0x05,0x3c,0x66,0xd7,0xa9,0x5a,0x33,0x10,
+ 0x1b,0xf8,0x9c,0x8f,0xed,0xfc,0xa7,0xa0,
+ 0x15,0xe3,0x3f,0x4b,0x1d,0x0d,0x0a,0xd5,
+ 0xfa,0x90,0xc4,0xed,0x9d,0x90,0xaf,0x53,
+ }
+ },
+ { 0xa101144, {
+ 0xb3,0x0b,0x26,0x9a,0xf8,0x7c,0x02,0x26,
+ 0x35,0x84,0x53,0xa4,0xd3,0x2c,0x7c,0x09,
+ 0x68,0x7b,0x96,0xb6,0x93,0xef,0xde,0xbc,
+ 0xfd,0x4b,0x15,0xd2,0x81,0xd3,0x51,0x47,
+ }
+ },
+ { 0xa101148, {
+ 0x20,0xd5,0x6f,0x40,0x4a,0xf6,0x48,0x90,
+ 0xc2,0x93,0x9a,0xc2,0xfd,0xac,0xef,0x4f,
+ 0xfa,0xc0,0x3d,0x92,0x3c,0x6d,0x01,0x08,
+ 0xf1,0x5e,0xb0,0xde,0xb4,0x98,0xae,0xc4,
+ }
+ },
+ { 0xa10123e, {
+ 0x03,0xb9,0x2c,0x76,0x48,0x93,0xc9,0x18,
+ 0xfb,0x56,0xfd,0xf7,0xe2,0x1d,0xca,0x4d,
+ 0x1d,0x13,0x53,0x63,0xfe,0x42,0x6f,0xfc,
+ 0x19,0x0f,0xf1,0xfc,0xa7,0xdd,0x89,0x1b,
+ }
+ },
+ { 0xa101244, {
+ 0x71,0x56,0xb5,0x9f,0x21,0xbf,0xb3,0x3c,
+ 0x8c,0xd7,0x36,0xd0,0x34,0x52,0x1b,0xb1,
+ 0x46,0x2f,0x04,0xf0,0x37,0xd8,0x1e,0x72,
+ 0x24,0xa2,0x80,0x84,0x83,0x65,0x84,0xc0,
+ }
+ },
+ { 0xa101248, {
+ 0xed,0x3b,0x95,0xa6,0x68,0xa7,0x77,0x3e,
+ 0xfc,0x17,0x26,0xe2,0x7b,0xd5,0x56,0x22,
+ 0x2c,0x1d,0xef,0xeb,0x56,0xdd,0xba,0x6e,
+ 0x1b,0x7d,0x64,0x9d,0x4b,0x53,0x13,0x75,
+ }
+ },
+ { 0xa108108, {
+ 0xed,0xc2,0xec,0xa1,0x15,0xc6,0x65,0xe9,
+ 0xd0,0xef,0x39,0xaa,0x7f,0x55,0x06,0xc6,
+ 0xf5,0xd4,0x3f,0x7b,0x14,0xd5,0x60,0x2c,
+ 0x28,0x1e,0x9c,0x59,0x69,0x99,0x4d,0x16,
+ }
+ },
+ { 0xa20102d, {
+ 0xf9,0x6e,0xf2,0x32,0xd3,0x0f,0x5f,0x11,
+ 0x59,0xa1,0xfe,0xcc,0xcd,0x9b,0x42,0x89,
+ 0x8b,0x89,0x2f,0xb5,0xbb,0x82,0xef,0x23,
+ 0x8c,0xe9,0x19,0x3e,0xcc,0x3f,0x7b,0xb4,
+ }
+ },
+ { 0xa201210, {
+ 0xe8,0x6d,0x51,0x6a,0x8e,0x72,0xf3,0xfe,
+ 0x6e,0x16,0xbc,0x62,0x59,0x40,0x17,0xe9,
+ 0x6d,0x3d,0x0e,0x6b,0xa7,0xac,0xe3,0x68,
+ 0xf7,0x55,0xf0,0x13,0xbb,0x22,0xf6,0x41,
+ }
+ },
+ { 0xa404107, {
+ 0xbb,0x04,0x4e,0x47,0xdd,0x5e,0x26,0x45,
+ 0x1a,0xc9,0x56,0x24,0xa4,0x4c,0x82,0xb0,
+ 0x8b,0x0d,0x9f,0xf9,0x3a,0xdf,0xc6,0x81,
+ 0x13,0xbc,0xc5,0x25,0xe4,0xc5,0xc3,0x99,
+ }
+ },
+ { 0xa500011, {
+ 0x23,0x3d,0x70,0x7d,0x03,0xc3,0xc4,0xf4,
+ 0x2b,0x82,0xc6,0x05,0xda,0x80,0x0a,0xf1,
+ 0xd7,0x5b,0x65,0x3a,0x7d,0xab,0xdf,0xa2,
+ 0x11,0x5e,0x96,0x7e,0x71,0xe9,0xfc,0x74,
+ }
+ },
+ { 0xa601209, {
+ 0x66,0x48,0xd4,0x09,0x05,0xcb,0x29,0x32,
+ 0x66,0xb7,0x9a,0x76,0xcd,0x11,0xf3,0x30,
+ 0x15,0x86,0xcc,0x5d,0x97,0x0f,0xc0,0x46,
+ 0xe8,0x73,0xe2,0xd6,0xdb,0xd2,0x77,0x1d,
+ }
+ },
+ { 0xa704107, {
+ 0xf3,0xc6,0x58,0x26,0xee,0xac,0x3f,0xd6,
+ 0xce,0xa1,0x72,0x47,0x3b,0xba,0x2b,0x93,
+ 0x2a,0xad,0x8e,0x6b,0xea,0x9b,0xb7,0xc2,
+ 0x64,0x39,0x71,0x8c,0xce,0xe7,0x41,0x39,
+ }
+ },
+ { 0xa705206, {
+ 0x8d,0xc0,0x76,0xbd,0x58,0x9f,0x8f,0xa4,
+ 0x12,0x9d,0x21,0xfb,0x48,0x21,0xbc,0xe7,
+ 0x67,0x6f,0x04,0x18,0xae,0x20,0x87,0x4b,
+ 0x03,0x35,0xe9,0xbe,0xfb,0x06,0xdf,0xfc,
+ }
+ },
+ { 0xa708007, {
+ 0x6b,0x76,0xcc,0x78,0xc5,0x8a,0xa3,0xe3,
+ 0x32,0x2d,0x79,0xe4,0xc3,0x80,0xdb,0xb2,
+ 0x07,0xaa,0x3a,0xe0,0x57,0x13,0x72,0x80,
+ 0xdf,0x92,0x73,0x84,0x87,0x3c,0x73,0x93,
+ }
+ },
+ { 0xa70c005, {
+ 0x88,0x5d,0xfb,0x79,0x64,0xd8,0x46,0x3b,
+ 0x4a,0x83,0x8e,0x77,0x7e,0xcf,0xb3,0x0f,
+ 0x1f,0x1f,0xf1,0x97,0xeb,0xfe,0x56,0x55,
+ 0xee,0x49,0xac,0xe1,0x8b,0x13,0xc5,0x13,
+ }
+ },
+ { 0xaa00116, {
+ 0xe8,0x4c,0x2c,0x88,0xa1,0xac,0x24,0x63,
+ 0x65,0xe5,0xaa,0x2d,0x16,0xa9,0xc3,0xf5,
+ 0xfe,0x1d,0x5e,0x65,0xc7,0xaa,0x92,0x4d,
+ 0x91,0xee,0x76,0xbb,0x4c,0x66,0x78,0xc9,
+ }
+ },
+ { 0xaa00212, {
+ 0xbd,0x57,0x5d,0x0a,0x0a,0x30,0xc1,0x75,
+ 0x95,0x58,0x5e,0x93,0x02,0x28,0x43,0x71,
+ 0xed,0x42,0x29,0xc8,0xec,0x34,0x2b,0xb2,
+ 0x1a,0x65,0x4b,0xfe,0x07,0x0f,0x34,0xa1,
+ }
+ },
+ { 0xaa00213, {
+ 0xed,0x58,0xb7,0x76,0x81,0x7f,0xd9,0x3a,
+ 0x1a,0xff,0x8b,0x34,0xb8,0x4a,0x99,0x0f,
+ 0x28,0x49,0x6c,0x56,0x2b,0xdc,0xb7,0xed,
+ 0x96,0xd5,0x9d,0xc1,0x7a,0xd4,0x51,0x9b,
+ }
+ },
+ { 0xaa00215, {
+ 0x55,0xd3,0x28,0xcb,0x87,0xa9,0x32,0xe9,
+ 0x4e,0x85,0x4b,0x7c,0x6b,0xd5,0x7c,0xd4,
+ 0x1b,0x51,0x71,0x3a,0x0e,0x0b,0xdc,0x9b,
+ 0x68,0x2f,0x46,0xee,0xfe,0xc6,0x6d,0xef,
+ }
+ },
+};
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 232026a239a6..b3658d11e7b6 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -60,11 +60,6 @@ module_param(force_minrev, bool, S_IRUSR | S_IWUSR);
*/
struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
-struct cpu_info_ctx {
- struct cpu_signature *cpu_sig;
- int err;
-};
-
/*
* Those patch levels cannot be updated to newer ones and thus should be final.
*/
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 070426b9895f..f3d534807d91 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -21,7 +21,7 @@
#include <linux/uio.h>
#include <linux/mm.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/setup.h>
@@ -319,12 +319,6 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci,
return UCODE_OK;
}
- /*
- * Writeback and invalidate caches before updating microcode to avoid
- * internal issues depending on what the microcode is updating.
- */
- native_wbinvd();
-
/* write microcode via MSR 0x79 */
native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
@@ -370,14 +364,14 @@ static __init struct microcode_intel *get_microcode_blob(struct ucode_cpu_info *
{
struct cpio_data cp;
+ intel_collect_cpu_info(&uci->cpu_sig);
+
if (!load_builtin_intel_microcode(&cp))
cp = find_microcode_in_initrd(ucode_path);
if (!(cp.data && cp.size))
return NULL;
- intel_collect_cpu_info(&uci->cpu_sig);
-
return scan_microcode(cp.data, cp.size, uci, save);
}
@@ -410,13 +404,13 @@ void __init load_ucode_intel_bsp(struct early_load_data *ed)
{
struct ucode_cpu_info uci;
- ed->old_rev = intel_get_microcode_revision();
-
uci.mc = get_microcode_blob(&uci, false);
- if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED)
- ucode_patch_va = UCODE_BSP_LOADED;
+ ed->old_rev = uci.cpu_sig.rev;
- ed->new_rev = uci.cpu_sig.rev;
+ if (uci.mc && apply_microcode_early(&uci) == UCODE_UPDATED) {
+ ucode_patch_va = UCODE_BSP_LOADED;
+ ed->new_rev = uci.cpu_sig.rev;
+ }
}
void load_ucode_intel_ap(void)
@@ -457,12 +451,6 @@ static enum ucode_state apply_microcode_late(int cpu)
if (ret != UCODE_UPDATED && ret != UCODE_OK)
return ret;
- if (!cpu && uci->cpu_sig.rev != cur_rev) {
- pr_info("Updated to revision 0x%x, date = %04x-%02x-%02x\n",
- uci->cpu_sig.rev, mc->hdr.date & 0xffff, mc->hdr.date >> 24,
- (mc->hdr.date >> 16) & 0xff);
- }
-
cpu_data(cpu).microcode = uci->cpu_sig.rev;
if (!cpu)
boot_cpu_data.microcode = uci->cpu_sig.rev;
@@ -580,15 +568,14 @@ static bool is_blacklisted(unsigned int cpu)
/*
* Late loading on model 79 with microcode revision less than 0x0b000021
* and LLC size per core bigger than 2.5MB may result in a system hang.
- * This behavior is documented in item BDF90, #334165 (Intel Xeon
+ * This behavior is documented in item BDX90, #334165 (Intel Xeon
* Processor E7-8800/4800 v4 Product Family).
*/
- if (c->x86 == 6 &&
- c->x86_model == INTEL_FAM6_BROADWELL_X &&
+ if (c->x86_vfm == INTEL_BROADWELL_X &&
c->x86_stepping == 0x01 &&
llc_size_per_core > 2621440 &&
c->microcode < 0x0b000021) {
- pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
+ pr_err_once("Erratum BDX90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
return true;
}
@@ -647,7 +634,7 @@ static __init void calc_llc_size_per_core(struct cpuinfo_x86 *c)
{
u64 llc_size = c->x86_cache_size * 1024ULL;
- do_div(llc_size, c->x86_max_cores);
+ do_div(llc_size, topology_num_cores_per_package());
llc_size_per_core = (unsigned int)llc_size;
}
diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h
index 21776c529fa9..5df621752fef 100644
--- a/arch/x86/kernel/cpu/microcode/internal.h
+++ b/arch/x86/kernel/cpu/microcode/internal.h
@@ -100,14 +100,12 @@ extern bool force_minrev;
#ifdef CONFIG_CPU_SUP_AMD
void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family);
void load_ucode_amd_ap(unsigned int family);
-int save_microcode_in_initrd_amd(unsigned int family);
void reload_ucode_amd(unsigned int cpu);
struct microcode_ops *init_amd_microcode(void);
void exit_amd_microcode(void);
#else /* CONFIG_CPU_SUP_AMD */
static inline void load_ucode_amd_bsp(struct early_load_data *ed, unsigned int family) { }
static inline void load_ucode_amd_ap(unsigned int family) { }
-static inline int save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
static inline void reload_ucode_amd(unsigned int cpu) { }
static inline struct microcode_ops *init_amd_microcode(void) { return NULL; }
static inline void exit_amd_microcode(void) { }
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 1db560ed2ca3..68f537347466 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -30,8 +30,7 @@ dump_array()
# If the /* comment */ starts with a quote string, grab that.
VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
- [ -z "$VALUE" ] && VALUE="\"$NAME\""
- [ "$VALUE" = '""' ] && continue
+ [ ! "$VALUE" ] && continue
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 01fa06dd06b6..f285757618fc 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -16,11 +16,10 @@
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kexec.h>
-#include <linux/i8253.h>
#include <linux/random.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
#include <asm/idtentry.h>
@@ -45,70 +44,70 @@ bool hyperv_paravisor_present __ro_after_init;
EXPORT_SYMBOL_GPL(hyperv_paravisor_present);
#if IS_ENABLED(CONFIG_HYPERV)
-static inline unsigned int hv_get_nested_reg(unsigned int reg)
+static inline unsigned int hv_get_nested_msr(unsigned int reg)
{
- if (hv_is_sint_reg(reg))
- return reg - HV_REGISTER_SINT0 + HV_REGISTER_NESTED_SINT0;
+ if (hv_is_sint_msr(reg))
+ return reg - HV_X64_MSR_SINT0 + HV_X64_MSR_NESTED_SINT0;
switch (reg) {
- case HV_REGISTER_SIMP:
- return HV_REGISTER_NESTED_SIMP;
- case HV_REGISTER_SIEFP:
- return HV_REGISTER_NESTED_SIEFP;
- case HV_REGISTER_SVERSION:
- return HV_REGISTER_NESTED_SVERSION;
- case HV_REGISTER_SCONTROL:
- return HV_REGISTER_NESTED_SCONTROL;
- case HV_REGISTER_EOM:
- return HV_REGISTER_NESTED_EOM;
+ case HV_X64_MSR_SIMP:
+ return HV_X64_MSR_NESTED_SIMP;
+ case HV_X64_MSR_SIEFP:
+ return HV_X64_MSR_NESTED_SIEFP;
+ case HV_X64_MSR_SVERSION:
+ return HV_X64_MSR_NESTED_SVERSION;
+ case HV_X64_MSR_SCONTROL:
+ return HV_X64_MSR_NESTED_SCONTROL;
+ case HV_X64_MSR_EOM:
+ return HV_X64_MSR_NESTED_EOM;
default:
return reg;
}
}
-u64 hv_get_non_nested_register(unsigned int reg)
+u64 hv_get_non_nested_msr(unsigned int reg)
{
u64 value;
- if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present)
+ if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present)
hv_ivm_msr_read(reg, &value);
else
rdmsrl(reg, value);
return value;
}
-EXPORT_SYMBOL_GPL(hv_get_non_nested_register);
+EXPORT_SYMBOL_GPL(hv_get_non_nested_msr);
-void hv_set_non_nested_register(unsigned int reg, u64 value)
+void hv_set_non_nested_msr(unsigned int reg, u64 value)
{
- if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) {
+ if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
hv_ivm_msr_write(reg, value);
/* Write proxy bit via wrmsl instruction */
- if (hv_is_sint_reg(reg))
+ if (hv_is_sint_msr(reg))
wrmsrl(reg, value | 1 << 20);
} else {
wrmsrl(reg, value);
}
}
-EXPORT_SYMBOL_GPL(hv_set_non_nested_register);
+EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
-u64 hv_get_register(unsigned int reg)
+u64 hv_get_msr(unsigned int reg)
{
if (hv_nested)
- reg = hv_get_nested_reg(reg);
+ reg = hv_get_nested_msr(reg);
- return hv_get_non_nested_register(reg);
+ return hv_get_non_nested_msr(reg);
}
-EXPORT_SYMBOL_GPL(hv_get_register);
+EXPORT_SYMBOL_GPL(hv_get_msr);
-void hv_set_register(unsigned int reg, u64 value)
+void hv_set_msr(unsigned int reg, u64 value)
{
if (hv_nested)
- reg = hv_get_nested_reg(reg);
+ reg = hv_get_nested_msr(reg);
- hv_set_non_nested_register(reg, value);
+ hv_set_non_nested_msr(reg, value);
}
-EXPORT_SYMBOL_GPL(hv_set_register);
+EXPORT_SYMBOL_GPL(hv_set_msr);
static void (*vmbus_handler)(void);
static void (*hv_stimer0_handler)(void);
@@ -199,8 +198,8 @@ static void hv_machine_shutdown(void)
* Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor
* corrupts the old VP Assist Pages and can crash the kexec kernel.
*/
- if (kexec_in_progress && hyperv_init_cpuhp > 0)
- cpuhp_remove_state(hyperv_init_cpuhp);
+ if (kexec_in_progress)
+ cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE);
/* The function calls stop_other_cpus(). */
native_machine_shutdown();
@@ -209,7 +208,9 @@ static void hv_machine_shutdown(void)
if (kexec_in_progress)
hyperv_cleanup();
}
+#endif /* CONFIG_KEXEC_CORE */
+#ifdef CONFIG_CRASH_DUMP
static void hv_machine_crash_shutdown(struct pt_regs *regs)
{
if (hv_crash_handler)
@@ -221,7 +222,64 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs)
/* Disable the hypercall page when there is only 1 active CPU. */
hyperv_cleanup();
}
-#endif /* CONFIG_KEXEC_CORE */
+#endif /* CONFIG_CRASH_DUMP */
+
+static u64 hv_ref_counter_at_suspend;
+static void (*old_save_sched_clock_state)(void);
+static void (*old_restore_sched_clock_state)(void);
+
+/*
+ * Hyper-V clock counter resets during hibernation. Save and restore clock
+ * offset during suspend/resume, while also considering the time passed
+ * before suspend. This is to make sure that sched_clock using hv tsc page
+ * based clocksource, proceeds from where it left off during suspend and
+ * it shows correct time for the timestamps of kernel messages after resume.
+ */
+static void save_hv_clock_tsc_state(void)
+{
+ hv_ref_counter_at_suspend = hv_read_reference_counter();
+}
+
+static void restore_hv_clock_tsc_state(void)
+{
+ /*
+ * Adjust the offsets used by hv tsc clocksource to
+ * account for the time spent before hibernation.
+ * adjusted value = reference counter (time) at suspend
+ * - reference counter (time) now.
+ */
+ hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter());
+}
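
Worked example of the offset arithmetic above (the reference counter counts in 100 ns units): if it read 6,000,000,000 (ten minutes of uptime) when save_hv_clock_tsc_state() ran and reads 30,000,000 (three seconds) after resume because hibernation reset it, the delta passed to hv_adj_sched_clock_offset() is 5,970,000,000, so the TSC-page-based sched_clock() continues from roughly the ten-minute mark instead of jumping backwards.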
+
+/*
+ * Functions to override save_sched_clock_state and restore_sched_clock_state
+ * functions of x86_platform. The Hyper-V clock counter is reset during
+ * suspend-resume and the offset used to measure time needs to be
+ * corrected, post resume.
+ */
+static void hv_save_sched_clock_state(void)
+{
+ old_save_sched_clock_state();
+ save_hv_clock_tsc_state();
+}
+
+static void hv_restore_sched_clock_state(void)
+{
+ restore_hv_clock_tsc_state();
+ old_restore_sched_clock_state();
+}
+
+static void __init x86_setup_ops_for_tsc_pg_clock(void)
+{
+ if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
+ return;
+
+ old_save_sched_clock_state = x86_platform.save_sched_clock_state;
+ x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
+
+ old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
+ x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
+}
#endif /* CONFIG_HYPERV */
static uint32_t __init ms_hyperv_platform(void)
@@ -350,13 +408,24 @@ static void __init reduced_hw_init(void)
x86_init.irqs.pre_vector_init = x86_init_noop;
}
+int hv_get_hypervisor_version(union hv_hypervisor_version_info *info)
+{
+ unsigned int hv_max_functions;
+
+ hv_max_functions = cpuid_eax(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS);
+ if (hv_max_functions < HYPERV_CPUID_VERSION) {
+ pr_err("%s: Could not detect Hyper-V version\n", __func__);
+ return -ENODEV;
+ }
+
+ cpuid(HYPERV_CPUID_VERSION, &info->eax, &info->ebx, &info->ecx, &info->edx);
+
+ return 0;
+}
+
static void __init ms_hyperv_init_platform(void)
{
int hv_max_functions_eax;
- int hv_host_info_eax;
- int hv_host_info_ebx;
- int hv_host_info_ecx;
- int hv_host_info_edx;
#ifdef CONFIG_PARAVIRT
pv_info.name = "Hyper-V";
@@ -407,25 +476,11 @@ static void __init ms_hyperv_init_platform(void)
pr_info("Hyper-V: running on a nested hypervisor\n");
}
- /*
- * Extract host information.
- */
- if (hv_max_functions_eax >= HYPERV_CPUID_VERSION) {
- hv_host_info_eax = cpuid_eax(HYPERV_CPUID_VERSION);
- hv_host_info_ebx = cpuid_ebx(HYPERV_CPUID_VERSION);
- hv_host_info_ecx = cpuid_ecx(HYPERV_CPUID_VERSION);
- hv_host_info_edx = cpuid_edx(HYPERV_CPUID_VERSION);
-
- pr_info("Hyper-V: Host Build %d.%d.%d.%d-%d-%d\n",
- hv_host_info_ebx >> 16, hv_host_info_ebx & 0xFFFF,
- hv_host_info_eax, hv_host_info_edx & 0xFFFFFF,
- hv_host_info_ecx, hv_host_info_edx >> 24);
- }
-
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
x86_platform.calibrate_tsc = hv_get_tsc_khz;
x86_platform.calibrate_cpu = hv_get_tsc_khz;
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
}
if (ms_hyperv.priv_high & HV_ISOLATION) {
@@ -451,10 +506,24 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
if (!ms_hyperv.paravisor_present) {
- /* To be supported: more work is required. */
+ /*
+ * Mark the Hyper-V TSC page feature as disabled
+ * in a TDX VM without paravisor so that the
+ * Invariant TSC, which is a better clocksource
+ * anyway, is used instead.
+ */
ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
- /* HV_REGISTER_CRASH_CTL is unsupported. */
+ /*
+ * The Invariant TSC is expected to be available
+ * in a TDX VM without paravisor, but if not,
+ * print a warning message. The slower Hyper-V MSR-based
+ * Ref Counter should end up being the clocksource.
+ */
+ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ pr_warn("Hyper-V: Invariant TSC is unavailable\n");
+
+ /* HV_MSR_CRASH_CTL is unsupported. */
ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
/* Don't trust Hyper-V's TLB-flushing hypercalls. */
@@ -495,10 +564,14 @@ static void __init ms_hyperv_init_platform(void)
no_timer_check = 1;
#endif
-#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE)
+#if IS_ENABLED(CONFIG_HYPERV)
+#if defined(CONFIG_KEXEC_CORE)
machine_ops.shutdown = hv_machine_shutdown;
+#endif
+#if defined(CONFIG_CRASH_DUMP)
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
#endif
+#endif
if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
/*
* Writing to synthetic MSR 0x40000118 updates/changes the
@@ -520,16 +593,6 @@ static void __init ms_hyperv_init_platform(void)
if (efi_enabled(EFI_BOOT))
x86_platform.get_nmi_reason = hv_get_nmi_reason;
- /*
- * Hyper-V VMs have a PIT emulation quirk such that zeroing the
- * counter register during PIT shutdown restarts the PIT. So it
- * continues to interrupt @18.2 HZ. Setting i8253_clear_counter
- * to false tells pit_shutdown() not to zero the counter so that
- * the PIT really is shutdown. Generation 2 VMs don't have a PIT,
- * and setting this value has no effect.
- */
- i8253_clear_counter_on_shutdown = false;
-
#if IS_ENABLED(CONFIG_HYPERV)
if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
ms_hyperv.paravisor_present)
@@ -539,19 +602,18 @@ static void __init ms_hyperv_init_platform(void)
*/
x86_platform.apic_post_init = hyperv_init;
hyperv_setup_mmu_ops();
- /* Setup the IDT for hypervisor callback */
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback);
- /* Setup the IDT for reenlightenment notifications */
+ /* Install system interrupt handler for hypervisor callback */
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback);
+
+ /* Install system interrupt handler for reenlightenment notifications */
if (ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT) {
- alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
- asm_sysvec_hyperv_reenlightenment);
+ sysvec_install(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment);
}
- /* Setup the IDT for stimer0 */
+ /* Install system interrupt handler for stimer0 */
if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) {
- alloc_intr_gate(HYPERV_STIMER0_VECTOR,
- asm_sysvec_hyperv_stimer0);
+ sysvec_install(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0);
}
# ifdef CONFIG_SMP
@@ -574,6 +636,7 @@ static void __init ms_hyperv_init_platform(void)
/* Register Hyper-V specific clocksource */
hv_init_clocksource();
+ x86_setup_ops_for_tsc_pg_clock();
hv_vtl_init_platform();
#endif
/*
@@ -643,6 +706,7 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.init.x2apic_available = ms_hyperv_x2apic_available,
.init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
.init.init_platform = ms_hyperv_init_platform,
+ .init.guest_late_init = ms_hyperv_late_init,
#ifdef CONFIG_AMD_MEM_ENCRYPT
.runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare,
.runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 2d6aa5d2e3d7..2fdfda2b60e4 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -108,6 +108,9 @@ static inline void k8_check_syscfg_dram_mod_en(void)
(boot_cpu_data.x86 >= 0x0f)))
return;
+ if (cc_platform_has(CC_ATTR_HOST_SEV_SNP))
+ return;
+
rdmsr(MSR_AMD64_SYSCFG, lo, hi);
if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) {
pr_err(FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]"
@@ -420,7 +423,7 @@ void __init mtrr_copy_map(void)
}
/**
- * mtrr_overwrite_state - set static MTRR state
+ * guest_force_mtrr_state - set static MTRR state for a guest
*
* Used to set MTRR state via different means (e.g. with data obtained from
* a hypervisor).
@@ -428,9 +431,13 @@ void __init mtrr_copy_map(void)
* from the x86_init.hyper.init_platform() hook. It can be called only once.
* The MTRR state can't be changed afterwards. To ensure that, X86_FEATURE_MTRR
* is cleared.
+ *
+ * @var: MTRR variable range array to use
+ * @num_var: length of the @var array
+ * @def_type: default caching type
*/
-void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var,
- mtrr_type def_type)
+void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
+ mtrr_type def_type)
{
unsigned int i;
@@ -492,13 +499,15 @@ static u8 type_merge(u8 type, u8 new_type, u8 *uniform)
/**
* mtrr_type_lookup - look up memory type in MTRR
*
+ * @start: Begin of the physical address range
+ * @end: End of the physical address range
+ * @uniform: output argument:
+ * - 1: the returned MTRR type is valid for the whole region
+ * - 0: otherwise
+ *
* Return Values:
* MTRR_TYPE_(type) - The effective MTRR type for the region
* MTRR_TYPE_INVALID - MTRR is disabled
- *
- * Output Argument:
- * uniform - Set to 1 when the returned MTRR type is valid for the whole
- * region, set to 0 else.
*/
u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
{
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 767bf1c71aad..ecbda0341a8a 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -55,6 +55,12 @@
#include "mtrr.h"
+static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE);
+static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB);
+static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH);
+static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT);
+static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK);
+
/* arch_phys_wc_add returns an MTRR register index plus this offset. */
#define MTRR_TO_PHYS_WC_OFFSET 1000
@@ -609,7 +615,7 @@ void mtrr_save_state(void)
{
int first_cpu;
- if (!mtrr_enabled())
+ if (!mtrr_enabled() || !mtrr_state.have_fixed)
return;
first_cpu = cpumask_first(cpu_online_mask);
@@ -619,7 +625,7 @@ void mtrr_save_state(void)
static int __init mtrr_init_finalize(void)
{
/*
- * Map might exist if mtrr_overwrite_state() has been called or if
+ * Map might exist if guest_force_mtrr_state() has been called or if
* mtrr_enabled() returns true.
*/
mtrr_copy_map();
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index e65fae63660e..41ed01f46bd9 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -41,11 +41,11 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
"fpu_exception\t: %s\n"
"cpuid level\t: %d\n"
"wp\t\t: yes\n",
- boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no",
- boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no",
- boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no",
- boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
- boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no",
+ str_yes_no(boot_cpu_has_bug(X86_BUG_FDIV)),
+ str_yes_no(boot_cpu_has_bug(X86_BUG_F00F)),
+ str_yes_no(boot_cpu_has_bug(X86_BUG_COMA)),
+ str_yes_no(boot_cpu_has(X86_FEATURE_FPU)),
+ str_yes_no(boot_cpu_has(X86_FEATURE_FPU)),
c->cpuid_level);
}
#else
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 26a427fa84ea..eeac00d20926 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -6,6 +6,7 @@
* Authors: Fenghua Yu <fenghua.yu@intel.com>,
* H. Peter Anvin <hpa@linux.intel.com>
*/
+#include <linux/printk.h>
#include <asm/processor.h>
#include <asm/archrandom.h>
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 19e0681f0435..3d1735ed8d1f 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -16,17 +16,24 @@
#define pr_fmt(fmt) "resctrl: " fmt
+#include <linux/cpu.h>
#include <linux/slab.h>
#include <linux/err.h>
-#include <linux/cacheinfo.h>
#include <linux/cpuhotplug.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>
#include "internal.h"
-/* Mutex to protect rdtgroup access. */
-DEFINE_MUTEX(rdtgroup_mutex);
+/*
+ * rdt_domain structures are kfree()d when their last CPU goes offline,
+ * and allocated when the first CPU in a new domain comes online.
+ * The rdt_resource's domain list is updated when this happens. Readers of
+ * the domain list must either take cpus_read_lock(), or rely on an RCU
+ * read-side critical section, to avoid observing concurrent modification.
+ * All writers take this mutex:
+ */
+static DEFINE_MUTEX(domain_list_lock);
/*
* The cached resctrl_pqr_state is strictly per CPU and can never be
@@ -48,16 +55,12 @@ int max_name_width, max_data_width;
*/
bool rdt_alloc_capable;
-static void
-mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
-static void
-cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
-static void
-mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
+static void mba_wrmsr_intel(struct msr_param *m);
+static void cat_wrmsr(struct msr_param *m);
+static void mba_wrmsr_amd(struct msr_param *m);
-#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.domains)
+#define ctrl_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.ctrl_domains)
+#define mon_domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].r_resctrl.mon_domains)
struct rdt_hw_resource rdt_resources_all[] = {
[RDT_RESOURCE_L3] =
@@ -65,8 +68,10 @@ struct rdt_hw_resource rdt_resources_all[] = {
.r_resctrl = {
.rid = RDT_RESOURCE_L3,
.name = "L3",
- .cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_L3),
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .mon_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L3),
+ .mon_domains = mon_domain_init(RDT_RESOURCE_L3),
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
.fflags = RFTYPE_RES_CACHE,
@@ -79,8 +84,8 @@ struct rdt_hw_resource rdt_resources_all[] = {
.r_resctrl = {
.rid = RDT_RESOURCE_L2,
.name = "L2",
- .cache_level = 2,
- .domains = domain_init(RDT_RESOURCE_L2),
+ .ctrl_scope = RESCTRL_L2_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2),
.parse_ctrlval = parse_cbm,
.format_str = "%d=%0*x",
.fflags = RFTYPE_RES_CACHE,
@@ -93,8 +98,8 @@ struct rdt_hw_resource rdt_resources_all[] = {
.r_resctrl = {
.rid = RDT_RESOURCE_MBA,
.name = "MB",
- .cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_MBA),
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA),
.parse_ctrlval = parse_bw,
.format_str = "%d=%*u",
.fflags = RFTYPE_RES_MB,
@@ -105,8 +110,8 @@ struct rdt_hw_resource rdt_resources_all[] = {
.r_resctrl = {
.rid = RDT_RESOURCE_SMBA,
.name = "SMBA",
- .cache_level = 3,
- .domains = domain_init(RDT_RESOURCE_SMBA),
+ .ctrl_scope = RESCTRL_L3_CACHE,
+ .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA),
.parse_ctrlval = parse_bw,
.format_str = "%d=%*u",
.fflags = RFTYPE_RES_MB,
@@ -114,6 +119,14 @@ struct rdt_hw_resource rdt_resources_all[] = {
},
};
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ /* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
+ return r->num_rmid;
+}
+
/*
* cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
* as they do not have CPUID enumeration support for Cache allocation.
@@ -136,15 +149,15 @@ static inline void cache_alloc_hsw_probe(void)
{
struct rdt_hw_resource *hw_res = &rdt_resources_all[RDT_RESOURCE_L3];
struct rdt_resource *r = &hw_res->r_resctrl;
- u32 l, h, max_cbm = BIT_MASK(20) - 1;
+ u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0;
- if (wrmsr_safe(MSR_IA32_L3_CBM_BASE, max_cbm, 0))
+ if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm))
return;
- rdmsr(MSR_IA32_L3_CBM_BASE, l, h);
+ rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0);
/* If all the bits were set in MSR, return success */
- if (l != max_cbm)
+ if (l3_cbm_0 != max_cbm)
return;
hw_res->num_closid = 4;
@@ -194,7 +207,7 @@ static inline bool rdt_get_mb_table(struct rdt_resource *r)
return false;
}
-static bool __get_mem_config_intel(struct rdt_resource *r)
+static __init bool __get_mem_config_intel(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
union cpuid_0x10_3_eax eax;
@@ -221,19 +234,19 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
else
r->membw.throttle_mode = THREAD_THROTTLE_MAX;
- thread_throttle_mode_init();
+
+ resctrl_file_fflags_init("thread_throttle_mode",
+ RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
r->alloc_capable = true;
return true;
}
-static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
+static __init bool __rdt_get_mem_config_amd(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- union cpuid_0x10_3_eax eax;
- union cpuid_0x10_x_edx edx;
- u32 ebx, ecx, subleaf;
+ u32 eax, ebx, ecx, edx, subleaf;
/*
* Query CPUID_Fn80000020_EDX_x01 for MBA and
@@ -241,9 +254,9 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
*/
subleaf = (r->rid == RDT_RESOURCE_SMBA) ? 2 : 1;
- cpuid_count(0x80000020, subleaf, &eax.full, &ebx, &ecx, &edx.full);
- hw_res->num_closid = edx.split.cos_max + 1;
- r->default_ctrl = MAX_MBA_BW_AMD;
+ cpuid_count(0x80000020, subleaf, &eax, &ebx, &ecx, &edx);
+ hw_res->num_closid = edx + 1;
+ r->default_ctrl = 1 << eax;
/* AMD does not use delay */
r->membw.delay_linear = false;
@@ -303,12 +316,11 @@ static void rdt_get_cdp_l2_config(void)
rdt_get_cdp_config(RDT_RESOURCE_L2);
}
-static void
-mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+static void mba_wrmsr_amd(struct msr_param *m)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
@@ -328,37 +340,51 @@ static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
return r->default_ctrl;
}
-static void
-mba_wrmsr_intel(struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r)
+static void mba_wrmsr_intel(struct msr_param *m)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
/* Write the delay values for mba. */
for (i = m->low; i < m->high; i++)
- wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], r));
+ wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res));
}
-static void
-cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
+static void cat_wrmsr(struct msr_param *m)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(m->dom);
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
unsigned int i;
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
for (i = m->low; i < m->high; i++)
wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]);
}
-struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
+struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r)
{
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
- list_for_each_entry(d, &r->domains, list) {
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
+struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r)
+{
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
/* Find the domain that contains this CPU */
- if (cpumask_test_cpu(cpu, &d->cpu_mask))
+ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
return d;
}
@@ -372,40 +398,29 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r)
void rdt_ctrl_update(void *arg)
{
+ struct rdt_hw_resource *hw_res;
struct msr_param *m = arg;
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(m->res);
- struct rdt_resource *r = m->res;
- int cpu = smp_processor_id();
- struct rdt_domain *d;
- d = get_domain_from_cpu(cpu, r);
- if (d) {
- hw_res->msr_update(d, m, r);
- return;
- }
- pr_warn_once("cpu %d not found in any domain for resource %s\n",
- cpu, r->name);
+ hw_res = resctrl_to_arch_res(m->res);
+ hw_res->msr_update(m);
}
/*
- * rdt_find_domain - Find a domain in a resource that matches input resource id
+ * rdt_find_domain - Search for a domain id in a resource domain list.
*
- * Search resource r's domain list to find the resource id. If the resource
- * id is found in a domain, return the domain. Otherwise, if requested by
- * caller, return the first domain whose id is bigger than the input id.
- * The domain list is sorted by id in ascending order.
+ * Search the domain list to find the domain id. If the domain id is
+ * found, return the domain. NULL otherwise. If the domain id is not
+ * found (and NULL returned) then the first domain with id bigger than
+ * the input id can be returned to the caller via @pos.
*/
-struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
- struct list_head **pos)
+struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id,
+ struct list_head **pos)
{
- struct rdt_domain *d;
+ struct rdt_domain_hdr *d;
struct list_head *l;
- if (id < 0)
- return ERR_PTR(-ENODEV);
-
- list_for_each(l, &r->domains) {
- d = list_entry(l, struct rdt_domain, list);
+ list_for_each(l, h) {
+ d = list_entry(l, struct rdt_domain_hdr, list);
/* When id is found, return its domain. */
if (id == d->id)
return d;
@@ -434,18 +449,23 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
*dc = r->default_ctrl;
}
-static void domain_free(struct rdt_hw_domain *hw_dom)
+static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom)
+{
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom);
+}
+
+static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom)
{
kfree(hw_dom->arch_mbm_total);
kfree(hw_dom->arch_mbm_local);
- kfree(hw_dom->ctrl_val);
kfree(hw_dom);
}
-static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
+static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
struct msr_param m;
u32 *dc;
@@ -457,9 +477,11 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
hw_dom->ctrl_val = dc;
setup_default_ctrlval(r, dc);
+ m.res = r;
+ m.dom = d;
m.low = 0;
m.high = hw_res->num_closid;
- hw_res->msr_update(d, &m, r);
+ hw_res->msr_update(&m);
return 0;
}
@@ -468,7 +490,7 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
* @num_rmid: The size of the MBM counter array
* @hw_dom: The domain that owns the allocated arrays
*/
-static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
+static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom)
{
size_t tsize;
@@ -491,35 +513,45 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
return 0;
}
-/*
- * domain_add_cpu - Add a cpu to a resource's domain list.
- *
- * If an existing domain in the resource r's domain list matches the cpu's
- * resource id, add the cpu in the domain.
- *
- * Otherwise, a new domain is allocated and inserted into the right position
- * in the domain list sorted by id in ascending order.
- *
- * The order in the domain list is visible to users when we print entries
- * in the schemata file and schemata input is validated to have the same order
- * as this list.
- */
-static void domain_add_cpu(int cpu, struct rdt_resource *r)
+static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope)
+{
+ switch (scope) {
+ case RESCTRL_L2_CACHE:
+ case RESCTRL_L3_CACHE:
+ return get_cpu_cacheinfo_id(cpu, scope);
+ case RESCTRL_L3_NODE:
+ return cpu_to_node(cpu);
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
+static void domain_add_cpu_ctrl(int cpu, struct rdt_resource *r)
{
- int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
struct list_head *add_pos = NULL;
- struct rdt_hw_domain *hw_dom;
- struct rdt_domain *d;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
int err;
- d = rdt_find_domain(r, id, &add_pos);
- if (IS_ERR(d)) {
- pr_warn("Couldn't find cache id for CPU %d\n", cpu);
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
return;
}
- if (d) {
- cpumask_set_cpu(cpu, &d->cpu_mask);
+ hdr = rdt_find_domain(&r->ctrl_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
if (r->cache.arch_has_per_cpu_cfg)
rdt_domain_reconfigure_cdp(r);
return;
@@ -530,125 +562,226 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
return;
d = &hw_dom->d_resctrl;
- d->id = id;
- cpumask_set_cpu(cpu, &d->cpu_mask);
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_CTRL_DOMAIN;
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
rdt_domain_reconfigure_cdp(r);
- if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
- domain_free(hw_dom);
+ if (domain_setup_ctrlval(r, d)) {
+ ctrl_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_ctrl_domain(r, d);
+ if (err) {
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ ctrl_domain_free(hw_dom);
+ }
+}
+
+static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct list_head *add_pos = NULL;
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+ int err;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
return;
}
- if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
- domain_free(hw_dom);
+ hdr = rdt_find_domain(&r->mon_domains, id, &add_pos);
+ if (hdr) {
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
return;
}
- list_add_tail(&d->list, add_pos);
+ hw_dom = kzalloc_node(sizeof(*hw_dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!hw_dom)
+ return;
+
+ d = &hw_dom->d_resctrl;
+ d->hdr.id = id;
+ d->hdr.type = RESCTRL_MON_DOMAIN;
+ d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!d->ci) {
+ pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
+ mon_domain_free(hw_dom);
+ return;
+ }
+ cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
- err = resctrl_online_domain(r, d);
+ arch_mon_domain_online(r, d);
+
+ if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
+ mon_domain_free(hw_dom);
+ return;
+ }
+
+ list_add_tail_rcu(&d->hdr.list, add_pos);
+
+ err = resctrl_online_mon_domain(r, d);
if (err) {
- list_del(&d->list);
- domain_free(hw_dom);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
}
}
-static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
{
- int id = get_cpu_cacheinfo_id(cpu, r->cache_level);
- struct rdt_hw_domain *hw_dom;
- struct rdt_domain *d;
+ if (r->alloc_capable)
+ domain_add_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_add_cpu_mon(cpu, r);
+}
- d = rdt_find_domain(r, id, NULL);
- if (IS_ERR_OR_NULL(d)) {
- pr_warn("Couldn't find cache id for CPU %d\n", cpu);
+static void domain_remove_cpu_ctrl(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->ctrl_scope);
+ struct rdt_hw_ctrl_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find control domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->ctrl_scope, r->name);
+ return;
+ }
+
+ hdr = rdt_find_domain(&r->ctrl_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find control domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
return;
}
- hw_dom = resctrl_to_arch_dom(d);
- cpumask_clear_cpu(cpu, &d->cpu_mask);
- if (cpumask_empty(&d->cpu_mask)) {
- resctrl_offline_domain(r, d);
- list_del(&d->list);
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_CTRL_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_ctrl_domain, hdr);
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_ctrl_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
/*
- * rdt_domain "d" is going to be freed below, so clear
+ * rdt_ctrl_domain "d" is going to be freed below, so clear
* its pointer from pseudo_lock_region struct.
*/
if (d->plr)
d->plr->d = NULL;
- domain_free(hw_dom);
+ ctrl_domain_free(hw_dom);
return;
}
+}
- if (r == &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl) {
- if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
- cancel_delayed_work(&d->mbm_over);
- mbm_setup_overflow_handler(d, 0);
- }
- if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
- has_busy_rmid(r, d)) {
- cancel_delayed_work(&d->cqm_limbo);
- cqm_setup_limbo_handler(d, 0);
- }
+static void domain_remove_cpu_mon(int cpu, struct rdt_resource *r)
+{
+ int id = get_domain_id_from_scope(cpu, r->mon_scope);
+ struct rdt_hw_mon_domain *hw_dom;
+ struct rdt_domain_hdr *hdr;
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ if (id < 0) {
+ pr_warn_once("Can't find monitor domain id for CPU:%d scope:%d for resource %s\n",
+ cpu, r->mon_scope, r->name);
+ return;
}
+
+ hdr = rdt_find_domain(&r->mon_domains, id, NULL);
+ if (!hdr) {
+ pr_warn("Can't find monitor domain for id=%d for CPU %d for resource %s\n",
+ id, cpu, r->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN))
+ return;
+
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+ hw_dom = resctrl_to_arch_mon_dom(d);
+
+ cpumask_clear_cpu(cpu, &d->hdr.cpu_mask);
+ if (cpumask_empty(&d->hdr.cpu_mask)) {
+ resctrl_offline_mon_domain(r, d);
+ list_del_rcu(&d->hdr.list);
+ synchronize_rcu();
+ mon_domain_free(hw_dom);
+
+ return;
+ }
+}
+
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ if (r->alloc_capable)
+ domain_remove_cpu_ctrl(cpu, r);
+ if (r->mon_capable)
+ domain_remove_cpu_mon(cpu, r);
}
static void clear_closid_rmid(int cpu)
{
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
- state->default_closid = 0;
- state->default_rmid = 0;
- state->cur_closid = 0;
- state->cur_rmid = 0;
- wrmsr(MSR_IA32_PQR_ASSOC, 0, 0);
+ state->default_closid = RESCTRL_RESERVED_CLOSID;
+ state->default_rmid = RESCTRL_RESERVED_RMID;
+ state->cur_closid = RESCTRL_RESERVED_CLOSID;
+ state->cur_rmid = RESCTRL_RESERVED_RMID;
+ wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID,
+ RESCTRL_RESERVED_CLOSID);
}
-static int resctrl_online_cpu(unsigned int cpu)
+static int resctrl_arch_online_cpu(unsigned int cpu)
{
struct rdt_resource *r;
- mutex_lock(&rdtgroup_mutex);
+ mutex_lock(&domain_list_lock);
for_each_capable_rdt_resource(r)
domain_add_cpu(cpu, r);
- /* The cpu is set in default rdtgroup after online. */
- cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+ mutex_unlock(&domain_list_lock);
+
clear_closid_rmid(cpu);
- mutex_unlock(&rdtgroup_mutex);
+ resctrl_online_cpu(cpu);
return 0;
}
-static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
-{
- struct rdtgroup *cr;
-
- list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
- if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) {
- break;
- }
- }
-}
-
-static int resctrl_offline_cpu(unsigned int cpu)
+static int resctrl_arch_offline_cpu(unsigned int cpu)
{
- struct rdtgroup *rdtgrp;
struct rdt_resource *r;
- mutex_lock(&rdtgroup_mutex);
+ resctrl_offline_cpu(cpu);
+
+ mutex_lock(&domain_list_lock);
for_each_capable_rdt_resource(r)
domain_remove_cpu(cpu, r);
- list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
- if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
- clear_childcpus(rdtgrp, cpu);
- break;
- }
- }
+ mutex_unlock(&domain_list_lock);
+
clear_closid_rmid(cpu);
- mutex_unlock(&rdtgroup_mutex);
return 0;
}
@@ -830,23 +963,28 @@ static __init bool get_rdt_mon_resources(void)
if (!rdt_mon_features)
return false;
+ if (is_mbm_local_enabled())
+ mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+ else if (is_mbm_total_enabled())
+ mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+
return !rdt_get_mon_l3_config(r);
}
static __init void __check_quirks_intel(void)
{
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_HASWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_HASWELL_X:
if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
cache_alloc_hsw_probe();
break;
- case INTEL_FAM6_SKYLAKE_X:
+ case INTEL_SKYLAKE_X:
if (boot_cpu_data.x86_stepping <= 4)
set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
else
set_rdt_options("!l3cat");
fallthrough;
- case INTEL_FAM6_BROADWELL_X:
+ case INTEL_BROADWELL_X:
intel_rdt_mbm_apply_quirk();
break;
}
@@ -968,7 +1106,8 @@ static int __init resctrl_late_init(void)
state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"x86/resctrl/cat:online:",
- resctrl_online_cpu, resctrl_offline_cpu);
+ resctrl_arch_online_cpu,
+ resctrl_arch_offline_cpu);
if (state < 0)
return state;
@@ -992,8 +1131,14 @@ late_initcall(resctrl_late_init);
static void __exit resctrl_exit(void)
{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
cpuhp_remove_state(rdt_online);
+
rdtgroup_exit();
+
+ if (r->mon_capable)
+ rdt_put_mon_l3_config();
}
__exitcall(resctrl_exit);
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index beccb0e87ba7..536351159cc2 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -19,6 +19,8 @@
#include <linux/kernfs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
+#include <linux/tick.h>
+
#include "internal.h"
/*
@@ -27,10 +29,10 @@
* hardware. The allocated bandwidth percentage is rounded to the next
* control step available on the hardware.
*/
-static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
+static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r)
{
- unsigned long bw;
int ret;
+ u32 bw;
/*
* Only linear delay values are supported for current Intel SKUs.
@@ -40,16 +42,21 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
return false;
}
- ret = kstrtoul(buf, 10, &bw);
+ ret = kstrtou32(buf, 10, &bw);
if (ret) {
- rdt_last_cmd_printf("Non-decimal digit in MB value %s\n", buf);
+ rdt_last_cmd_printf("Invalid MB value %s\n", buf);
return false;
}
- if ((bw < r->membw.min_bw || bw > r->default_ctrl) &&
- !is_mba_sc(r)) {
- rdt_last_cmd_printf("MB value %ld out of range [%d,%d]\n", bw,
- r->membw.min_bw, r->default_ctrl);
+ /* Nothing else to do if software controller is enabled. */
+ if (is_mba_sc(r)) {
+ *data = bw;
+ return true;
+ }
+
+ if (bw < r->membw.min_bw || bw > r->default_ctrl) {
+ rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n",
+ bw, r->membw.min_bw, r->default_ctrl);
return false;
}
@@ -58,16 +65,16 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
}
int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d)
+ struct rdt_ctrl_domain *d)
{
struct resctrl_staged_config *cfg;
u32 closid = data->rdtgrp->closid;
struct rdt_resource *r = s->res;
- unsigned long bw_val;
+ u32 bw_val;
cfg = &d->staged_config[s->conf_type];
if (cfg->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
+ rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
return -EINVAL;
}
@@ -137,7 +144,7 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
* resource type.
*/
int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d)
+ struct rdt_ctrl_domain *d)
{
struct rdtgroup *rdtgrp = data->rdtgrp;
struct resctrl_staged_config *cfg;
@@ -146,7 +153,7 @@ int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
cfg = &d->staged_config[s->conf_type];
if (cfg->have_new_ctrl) {
- rdt_last_cmd_printf("Duplicate domain %d\n", d->id);
+ rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
return -EINVAL;
}
@@ -206,10 +213,13 @@ static int parse_line(char *line, struct resctrl_schema *s,
struct resctrl_staged_config *cfg;
struct rdt_resource *r = s->res;
struct rdt_parse_data data;
+ struct rdt_ctrl_domain *d;
char *dom = NULL, *id;
- struct rdt_domain *d;
unsigned long dom_id;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
(r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
@@ -226,8 +236,8 @@ next:
return -EINVAL;
}
dom = strim(dom);
- list_for_each_entry(d, &r->domains, list) {
- if (d->id == dom_id) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ if (d->hdr.id == dom_id) {
data.buf = dom;
data.rdtgrp = rdtgrp;
if (r->parse_ctrlval(&data, s, d))
@@ -267,39 +277,24 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
}
}
-static bool apply_config(struct rdt_hw_domain *hw_dom,
- struct resctrl_staged_config *cfg, u32 idx,
- cpumask_var_t cpu_mask)
-{
- struct rdt_domain *dom = &hw_dom->d_resctrl;
-
- if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) {
- cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask);
- hw_dom->ctrl_val[idx] = cfg->new_ctrl;
-
- return true;
- }
-
- return false;
-}
-
-int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type t, u32 cfg_val)
{
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
u32 idx = get_config_index(closid, t);
struct msr_param msr_param;
- if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
+ if (!cpumask_test_cpu(smp_processor_id(), &d->hdr.cpu_mask))
return -EINVAL;
hw_dom->ctrl_val[idx] = cfg_val;
msr_param.res = r;
+ msr_param.dom = d;
msr_param.low = idx;
msr_param.high = idx + 1;
- hw_res->msr_update(d, &msr_param, r);
+ hw_res->msr_update(&msr_param);
return 0;
}
@@ -307,48 +302,42 @@ int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
{
struct resctrl_staged_config *cfg;
- struct rdt_hw_domain *hw_dom;
+ struct rdt_hw_ctrl_domain *hw_dom;
struct msr_param msr_param;
+ struct rdt_ctrl_domain *d;
enum resctrl_conf_type t;
- cpumask_var_t cpu_mask;
- struct rdt_domain *d;
u32 idx;
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
- msr_param.res = NULL;
- list_for_each_entry(d, &r->domains, list) {
- hw_dom = resctrl_to_arch_dom(d);
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
+ msr_param.res = NULL;
for (t = 0; t < CDP_NUM_TYPES; t++) {
cfg = &hw_dom->d_resctrl.staged_config[t];
if (!cfg->have_new_ctrl)
continue;
idx = get_config_index(closid, t);
- if (!apply_config(hw_dom, cfg, idx, cpu_mask))
+ if (cfg->new_ctrl == hw_dom->ctrl_val[idx])
continue;
+ hw_dom->ctrl_val[idx] = cfg->new_ctrl;
if (!msr_param.res) {
msr_param.low = idx;
msr_param.high = msr_param.low + 1;
msr_param.res = r;
+ msr_param.dom = d;
} else {
msr_param.low = min(msr_param.low, idx);
msr_param.high = max(msr_param.high, idx + 1);
}
}
+ if (msr_param.res)
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- if (cpumask_empty(cpu_mask))
- goto done;
-
- /* Update resource control msr on all the CPUs. */
- on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
-
-done:
- free_cpumask_var(cpu_mask);
-
return 0;
}
@@ -379,11 +368,9 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
return -EINVAL;
buf[nbytes - 1] = '\0';
- cpus_read_lock();
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
rdtgroup_kn_unlock(of->kn);
- cpus_read_unlock();
return -ENOENT;
}
rdt_last_cmd_clear();
@@ -445,14 +432,13 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
out:
rdt_staged_configs_clear();
rdtgroup_kn_unlock(of->kn);
- cpus_read_unlock();
return ret ?: nbytes;
}
-u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
u32 closid, enum resctrl_conf_type type)
{
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct rdt_hw_ctrl_domain *hw_dom = resctrl_to_arch_ctrl_dom(d);
u32 idx = get_config_index(closid, type);
return hw_dom->ctrl_val[idx];
@@ -461,12 +447,15 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
{
struct rdt_resource *r = schema->res;
- struct rdt_domain *dom;
+ struct rdt_ctrl_domain *dom;
bool sep = false;
u32 ctrl_val;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
seq_printf(s, "%*s:", max_name_width, schema->name);
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
if (sep)
seq_puts(s, ";");
@@ -476,7 +465,7 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo
ctrl_val = resctrl_arch_get_config(r, dom, closid,
schema->conf_type);
- seq_printf(s, r->format_str, dom->id, max_data_width,
+ seq_printf(s, r->format_str, dom->hdr.id, max_data_width,
ctrl_val);
sep = true;
}
@@ -505,7 +494,7 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
} else {
seq_printf(s, "%s:%d=%x\n",
rdtgrp->plr->s->res->name,
- rdtgrp->plr->d->id,
+ rdtgrp->plr->d->hdr.id,
rdtgrp->plr->cbm);
}
} else {
@@ -522,32 +511,132 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
return ret;
}
+static int smp_mon_event_count(void *arg)
+{
+ mon_event_count(arg);
+
+ return 0;
+}
+
+ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+ rdt_last_cmd_clear();
+
+ if (!strcmp(buf, "mbm_local_bytes")) {
+ if (is_mbm_local_enabled())
+ rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+ else
+ ret = -EINVAL;
+ } else if (!strcmp(buf, "mbm_total_bytes")) {
+ if (is_mbm_total_enabled())
+ rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+ else
+ ret = -EINVAL;
+ } else {
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ rdt_last_cmd_printf("Unsupported event id '%s'\n", buf);
+
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ if (rdtgrp) {
+ switch (rdtgrp->mba_mbps_event) {
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ seq_puts(s, "mbm_local_bytes\n");
+ break;
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ seq_puts(s, "mbm_total_bytes\n");
+ break;
+ default:
+ pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event);
+ ret = -EINVAL;
+ break;
+ }
+ } else {
+ ret = -ENOENT;
+ }
+
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
- struct rdt_domain *d, struct rdtgroup *rdtgrp,
- int evtid, int first)
+ struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+ cpumask_t *cpumask, int evtid, int first)
{
+ int cpu;
+
+ /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
/*
- * setup the parameters to send to the IPI to read the data.
+ * Setup the parameters to pass to mon_event_count() to read the data.
*/
rr->rgrp = rdtgrp;
rr->evtid = evtid;
rr->r = r;
rr->d = d;
- rr->val = 0;
rr->first = first;
+ rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
+ if (IS_ERR(rr->arch_mon_ctx)) {
+ rr->err = -EINVAL;
+ return;
+ }
+
+ cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU);
+
+ /*
+ * cpumask_any_housekeeping() prefers housekeeping CPUs, but
+ * are all the CPUs nohz_full? If yes, pick a CPU to IPI.
+ * MPAM's resctrl_arch_rmid_read() is unable to read the
+ * counters on some platforms if it is called in IRQ context.
+ */
+ if (tick_nohz_full_cpu(cpu))
+ smp_call_function_any(cpumask, mon_event_count, rr, 1);
+ else
+ smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
- smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
+ resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
}
int rdtgroup_mondata_show(struct seq_file *m, void *arg)
{
struct kernfs_open_file *of = m->private;
+ struct rdt_domain_hdr *hdr;
+ struct rmid_read rr = {0};
+ struct rdt_mon_domain *d;
u32 resid, evtid, domid;
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
union mon_data_bits md;
- struct rdt_domain *d;
- struct rmid_read rr;
int ret = 0;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
@@ -560,15 +649,40 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
resid = md.u.rid;
domid = md.u.domid;
evtid = md.u.evtid;
-
r = &rdt_resources_all[resid].r_resctrl;
- d = rdt_find_domain(r, domid, NULL);
- if (IS_ERR_OR_NULL(d)) {
+
+ if (md.u.sum) {
+ /*
+ * This file requires summing across all domains that share
+ * the L3 cache id that was provided in the "domid" field of the
+ * mon_data_bits union. Search all domains in the resource for
+ * one that matches this cache id.
+ */
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ if (d->ci->id == domid) {
+ rr.ci = d->ci;
+ mon_event_read(&rr, r, NULL, rdtgrp,
+ &d->ci->shared_cpu_map, evtid, false);
+ goto checkresult;
+ }
+ }
ret = -ENOENT;
goto out;
+ } else {
+ /*
+ * This file provides data from a single domain. Search
+ * the resource to find the domain with "domid".
+ */
+ hdr = rdt_find_domain(&r->mon_domains, domid, NULL);
+ if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+ mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false);
}
- mon_event_read(&rr, r, d, rdtgrp, evtid, false);
+checkresult:
if (rr.err == -EIO)
seq_puts(m, "Error\n");
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index a4f1aa15f0a2..20c898f09b7e 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -7,6 +7,9 @@
#include <linux/kernfs.h>
#include <linux/fs_context.h>
#include <linux/jump_label.h>
+#include <linux/tick.h>
+
+#include <asm/resctrl.h>
#define L3_QOS_CDP_ENABLE 0x01ULL
@@ -18,7 +21,6 @@
#define MBM_OVERFLOW_INTERVAL 1000
#define MAX_MBA_BW 100u
#define MBA_IS_LINEAR 0x4
-#define MAX_MBA_BW_AMD 0x800
#define MBM_CNTR_WIDTH_OFFSET_AMD 20
#define RMID_VAL_ERROR BIT_ULL(63)
@@ -54,6 +56,47 @@
/* Max event bits supported */
#define MAX_EVT_CONFIG_BITS GENMASK(6, 0)
+/**
+ * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
+ * aren't marked nohz_full
+ * @mask: The mask to pick a CPU from.
+ * @exclude_cpu: The CPU to avoid picking.
+ *
+ * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping
+ * CPUs that don't use nohz_full, these are preferred. Pass
+ * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs.
+ *
+ * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available.
+ */
+static inline unsigned int
+cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu)
+{
+ unsigned int cpu, hk_cpu;
+
+ if (exclude_cpu == RESCTRL_PICK_ANY_CPU)
+ cpu = cpumask_any(mask);
+ else
+ cpu = cpumask_any_but(mask, exclude_cpu);
+
+ /* Only continue if tick_nohz_full_mask has been initialized. */
+ if (!tick_nohz_full_enabled())
+ return cpu;
+
+ /* If the CPU picked isn't marked nohz_full nothing more needs doing. */
+ if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu))
+ return cpu;
+
+ /* Try to find a CPU that isn't nohz_full to use in preference */
+ hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask);
+ if (hk_cpu == exclude_cpu)
+ hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask);
+
+ if (hk_cpu < nr_cpu_ids)
+ cpu = hk_cpu;
+
+ return cpu;
+}
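A minimal usage sketch of the helper above, assuming a caller that picks a CPU for per-domain work; the function name is made up and struct rdt_mon_domain is assumed to be visible via <linux/resctrl.h>:

static void example_pick_worker_cpu(struct rdt_mon_domain *d, int exclude_cpu)
{
	unsigned int cpu;

	/* Prefer a housekeeping CPU in the domain, never @exclude_cpu */
	cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, exclude_cpu);
	if (cpu >= nr_cpu_ids)
		return;		/* excluding @exclude_cpu left nothing usable */

	/* ... queue the limbo/overflow worker on @cpu ... */
}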
+
struct rdt_fs_context {
struct kernfs_fs_context kfc;
bool enable_cdpl2;
@@ -69,9 +112,6 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
return container_of(kfc, struct rdt_fs_context, kfc);
}
-DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
-DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
-
/**
* struct mon_evt - Entry in the event list of a resource
* @evtid: event id
@@ -87,37 +127,62 @@ struct mon_evt {
};
/**
- * union mon_data_bits - Monitoring details for each event file
+ * union mon_data_bits - Monitoring details for each event file.
* @priv: Used to store monitoring event data in @u
- * as kernfs private data
- * @rid: Resource id associated with the event file
- * @evtid: Event id associated with the event file
- * @domid: The domain to which the event file belongs
- * @u: Name of the bit fields struct
+ * as kernfs private data.
+ * @u.rid: Resource id associated with the event file.
+ * @u.evtid: Event id associated with the event file.
+ * @u.sum: Set when event must be summed across multiple
+ * domains.
+ * @u.domid: When @u.sum is zero this is the domain to which
+ * the event file belongs. When @u.sum is one this
+ * is the id of the L3 cache that all domains to be
+ * summed share.
+ * @u: Name of the bit fields struct.
*/
union mon_data_bits {
void *priv;
struct {
unsigned int rid : 10;
- enum resctrl_event_id evtid : 8;
+ enum resctrl_event_id evtid : 7;
+ unsigned int sum : 1;
unsigned int domid : 14;
} u;
};
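A short sketch of how these bit-fields can be packed into a kernfs private pointer; the helper name is invented, and rdtgroup_mondata_show() (earlier, in ctrlmondata.c) performs the matching unpack via the md.u.* fields:

static void *example_mon_data_pack(unsigned int rid, enum resctrl_event_id evtid,
				   bool sum, unsigned int domid)
{
	union mon_data_bits md = { 0 };

	md.u.rid = rid;
	md.u.evtid = evtid;
	md.u.sum = sum;
	md.u.domid = domid;	/* domain id, or L3 cache id when @sum is set */

	return md.priv;		/* stored as the event file's kernfs private data */
}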
+/**
+ * struct rmid_read - Data passed across smp_call*() to read event count.
+ * @rgrp: Resource group for which the counter is being read. If it is a parent
+ * resource group then its event count is summed with the count from all
+ * its child resource groups.
+ * @r: Resource describing the properties of the event being read.
+ * @d: Domain that the counter should be read from. If NULL then sum all
+ * domains in @r sharing L3 @ci.id
+ * @evtid: Which monitor event to read.
+ * @first: Initialize MBM counter when true.
+ * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
+ * @err: Error encountered when reading counter.
+ * @val: Returned value of event counter. If @rgrp is a parent resource group,
+ * @val includes the sum of event counts from its child resource groups.
+ * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id,
+ * (summed across child resource groups if @rgrp is a parent resource group).
+ * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only).
+ */
struct rmid_read {
struct rdtgroup *rgrp;
struct rdt_resource *r;
- struct rdt_domain *d;
+ struct rdt_mon_domain *d;
enum resctrl_event_id evtid;
bool first;
+ struct cacheinfo *ci;
int err;
u64 val;
+ void *arch_mon_ctx;
};
-extern bool rdt_alloc_capable;
-extern bool rdt_mon_capable;
extern unsigned int rdt_mon_features;
extern struct list_head resctrl_schema_all;
+extern bool resctrl_mounted;
enum rdt_group_type {
RDTCTRL_GROUP = 0,
@@ -192,7 +257,7 @@ struct mongroup {
*/
struct pseudo_lock_region {
struct resctrl_schema *s;
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
u32 cbm;
wait_queue_head_t lock_thread_wq;
int thread_done;
@@ -218,6 +283,7 @@ struct pseudo_lock_region {
* monitor only or ctrl_mon group
* @mon: mongroup related data
* @mode: mode of resource group
+ * @mba_mbps_event: input monitoring event id when mba_sc is enabled
* @plr: pseudo-locked region
*/
struct rdtgroup {
@@ -230,6 +296,7 @@ struct rdtgroup {
enum rdt_group_type type;
struct mongroup mon;
enum rdtgrp_mode mode;
+ enum resctrl_event_id mba_mbps_event;
struct pseudo_lock_region *plr;
};
@@ -296,14 +363,10 @@ struct rftype {
* struct mbm_state - status for each MBM counter in each domain
* @prev_bw_bytes: Previous bytes value read for bandwidth calculation
* @prev_bw: The most recent bandwidth in MBps
- * @delta_bw: Difference between the current and previous bandwidth
- * @delta_comp: Indicates whether to compute the delta_bw
*/
struct mbm_state {
u64 prev_bw_bytes;
u32 prev_bw;
- u32 delta_bw;
- bool delta_comp;
};
/**
@@ -319,35 +382,53 @@ struct arch_mbm_state {
};
/**
- * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share
- * a resource
+ * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a control function
* @d_resctrl: Properties exposed to the resctrl file system
* @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
+ *
+ * Members of this structure are accessed via helpers that provide abstraction.
+ */
+struct rdt_hw_ctrl_domain {
+ struct rdt_ctrl_domain d_resctrl;
+ u32 *ctrl_val;
+};
+
+/**
+ * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share
+ * a resource for a monitor function
+ * @d_resctrl: Properties exposed to the resctrl file system
* @arch_mbm_total: arch private state for MBM total bandwidth
* @arch_mbm_local: arch private state for MBM local bandwidth
*
* Members of this structure are accessed via helpers that provide abstraction.
*/
-struct rdt_hw_domain {
- struct rdt_domain d_resctrl;
- u32 *ctrl_val;
+struct rdt_hw_mon_domain {
+ struct rdt_mon_domain d_resctrl;
struct arch_mbm_state *arch_mbm_total;
struct arch_mbm_state *arch_mbm_local;
};
-static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r)
+static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r)
+{
+ return container_of(r, struct rdt_hw_ctrl_domain, d_resctrl);
+}
+
+static inline struct rdt_hw_mon_domain *resctrl_to_arch_mon_dom(struct rdt_mon_domain *r)
{
- return container_of(r, struct rdt_hw_domain, d_resctrl);
+ return container_of(r, struct rdt_hw_mon_domain, d_resctrl);
}
/**
* struct msr_param - set a range of MSRs from a domain
* @res: The resource to use
+ * @dom: The domain to update
* @low: Beginning index from base MSR
* @high: End index
*/
struct msr_param {
struct rdt_resource *res;
+ struct rdt_ctrl_domain *dom;
u32 low;
u32 high;
};
@@ -395,6 +476,8 @@ struct rdt_parse_data {
* @msr_update: Function pointer to update QOS MSRs
* @mon_scale: cqm counter * mon_scale = occupancy in bytes
* @mbm_width: Monitor width, to detect and correct for overflow.
+ * @mbm_cfg_mask: Bandwidth sources that can be tracked when Bandwidth
+ * Monitoring Event Configuration (BMEC) is supported.
* @cdp_enabled: CDP state of this resource
*
* Members of this structure are either private to the architecture
@@ -405,10 +488,10 @@ struct rdt_hw_resource {
struct rdt_resource r_resctrl;
u32 num_closid;
unsigned int msr_base;
- void (*msr_update) (struct rdt_domain *d, struct msr_param *m,
- struct rdt_resource *r);
+ void (*msr_update)(struct msr_param *m);
unsigned int mon_scale;
unsigned int mbm_width;
+ unsigned int mbm_cfg_mask;
bool cdp_enabled;
};
@@ -418,17 +501,16 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r
}
int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d);
+ struct rdt_ctrl_domain *d);
int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
- struct rdt_domain *d);
+ struct rdt_ctrl_domain *d);
extern struct mutex rdtgroup_mutex;
extern struct rdt_hw_resource rdt_resources_all[];
extern struct rdtgroup rdtgroup_default;
-DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
-
extern struct dentry *debugfs_resctrl;
+extern enum resctrl_event_id mba_mbps_default_event;
enum resctrl_res_level {
RDT_RESOURCE_L3,
@@ -455,6 +537,8 @@ static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l)
int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d);
+
/*
* To return the common struct rdt_resource, which is contained in struct
* rdt_hw_resource, walk the resctrl member of struct rdt_hw_resource.
@@ -520,50 +604,58 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn);
int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
umode_t mask);
-struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
- struct list_head **pos);
+struct rdt_domain_hdr *rdt_find_domain(struct list_head *h, int id,
+ struct list_head **pos);
ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_schemata_show(struct kernfs_open_file *of,
struct seq_file *s, void *v);
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
+ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v);
+bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid, bool exclusive);
-unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d,
unsigned long cbm);
enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
int rdtgroup_tasks_assigned(struct rdtgroup *r);
int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm);
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d);
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm);
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d);
int rdt_pseudo_lock_init(void);
void rdt_pseudo_lock_release(void);
int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
-struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
+struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, struct rdt_resource *r);
+struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, struct rdt_resource *r);
int closids_supported(void);
void closid_free(int closid);
-int alloc_rmid(void);
-void free_rmid(u32 rmid);
+int alloc_rmid(u32 closid);
+void free_rmid(u32 closid, u32 rmid);
int rdt_get_mon_l3_config(struct rdt_resource *r);
+void __exit rdt_put_mon_l3_config(void);
bool __init rdt_cpu_has(int flag);
void mon_event_count(void *info);
int rdtgroup_mondata_show(struct seq_file *m, void *arg);
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
- struct rdt_domain *d, struct rdtgroup *rdtgrp,
- int evtid, int first);
-void mbm_setup_overflow_handler(struct rdt_domain *dom,
- unsigned long delay_ms);
+ struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+ cpumask_t *cpumask, int evtid, int first);
+void mbm_setup_overflow_handler(struct rdt_mon_domain *dom,
+ unsigned long delay_ms,
+ int exclude_cpu);
void mbm_handle_overflow(struct work_struct *work);
void __init intel_rdt_mbm_apply_quirk(void);
bool is_mba_sc(struct rdt_resource *r);
-void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
+void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
+ int exclude_cpu);
void cqm_handle_limbo(struct work_struct *work);
-bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
-void __check_limbo(struct rdt_domain *d, bool force_free);
+bool has_busy_rmid(struct rdt_mon_domain *d);
+void __check_limbo(struct rdt_mon_domain *d, bool force_free);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
-void __init thread_throttle_mode_init(void);
-void __init mbm_config_rftype_init(const char *config);
+void resctrl_file_fflags_init(const char *config, unsigned long fflags);
void rdt_staged_configs_clear(void);
-
+bool closid_allocated(unsigned int closid);
+int resctrl_find_cleanest_closid(void);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index f136ac046851..94a1d9780461 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -15,6 +15,9 @@
* Software Developer Manual June 2016, volume 3, section 17.17.
*/
+#define pr_fmt(fmt) "resctrl: " fmt
+
+#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/sizes.h>
#include <linux/slab.h>
@@ -23,8 +26,22 @@
#include <asm/resctrl.h>
#include "internal.h"
-
+#include "trace.h"
+
+/**
+ * struct rmid_entry - dirty tracking for all RMID.
+ * @closid: The CLOSID for this entry.
+ * @rmid: The RMID for this entry.
+ * @busy: The number of domains with cached data using this RMID.
+ * @list: Member of the rmid_free_lru list when busy == 0.
+ *
+ * Depending on the architecture the correct monitor is accessed using
+ * both @closid and @rmid, or @rmid only.
+ *
+ * Take the rdtgroup_mutex when accessing.
+ */
struct rmid_entry {
+ u32 closid;
u32 rmid;
int busy;
struct list_head list;
@@ -38,6 +55,13 @@ struct rmid_entry {
static LIST_HEAD(rmid_free_lru);
/*
+ * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has.
+ * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
+ * Indexed by CLOSID. Protected by rdtgroup_mutex.
+ */
+static u32 *closid_num_dirty_rmid;
+
+/*
* @rmid_limbo_count - count of currently unused but (potentially)
* dirty RMIDs.
* This counts RMIDs that no one is currently using but that
@@ -75,6 +99,8 @@ unsigned int resctrl_rmid_realloc_limit;
#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
+static int snc_nodes_per_l3_cache = 1;
+
/*
* The correction factor table is documented in Documentation/arch/x86/resctrl.rst.
* If rmid > rmid threshold, MBM total and local values should be multiplied
@@ -136,17 +162,70 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
return val;
}
-static inline struct rmid_entry *__rmid_entry(u32 rmid)
+/*
+ * x86 and arm64 differ in their handling of monitoring.
+ * x86's RMID are independent numbers, there is only one source of traffic
+ * with an RMID value of '1'.
+ * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
+ * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
+ * value is no longer unique.
+ * To account for this, resctrl uses an index. On x86 this is just the RMID,
+ * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
+ *
+ * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
+ * must accept an attempt to read every index.
+ */
+static inline struct rmid_entry *__rmid_entry(u32 idx)
{
struct rmid_entry *entry;
+ u32 closid, rmid;
+
+ entry = &rmid_ptrs[idx];
+ resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);
- entry = &rmid_ptrs[rmid];
- WARN_ON(entry->rmid != rmid);
+ WARN_ON_ONCE(entry->closid != closid);
+ WARN_ON_ONCE(entry->rmid != rmid);
return entry;
}
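For illustration only, the identity mapping described in the comment above could look like this on x86 (placeholder names; the real resctrl_arch_rmid_idx_encode()/decode() helpers are defined in the arch header, not in this hunk):

static inline u32 example_rmid_idx_encode(u32 ignored_closid, u32 rmid)
{
	/* x86: the index is simply the RMID, the CLOSID plays no part */
	return rmid;
}

static inline void example_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
{
	/* x86: no CLOSID is encoded in the index */
	*closid = X86_RESCTRL_EMPTY_CLOSID;
	*rmid = idx;
}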
-static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
+/*
+ * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by
+ * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is
+ * needed. The physical RMID is the same as the logical RMID.
+ *
+ * On a platform with SNC mode enabled, Linux enables RMID sharing mode
+ * via MSR 0xCA0 (see the "RMID Sharing Mode" section in the "Intel
+ * Resource Director Technology Architecture Specification" for a full
+ * description of RMID sharing mode).
+ *
+ * In RMID sharing mode there are fewer "logical RMID" values available
+ * to accumulate data ("physical RMIDs" are divided evenly between SNC
+ * nodes that share an L3 cache). Linux creates an rdt_mon_domain for
+ * each SNC node.
+ *
+ * The value loaded into IA32_PQR_ASSOC is the "logical RMID".
+ *
+ * Data is collected independently on each SNC node and can be retrieved
+ * using the "physical RMID" value computed by this function and loaded
+ * into IA32_QM_EVTSEL. @cpu can be any CPU in the SNC node.
+ *
+ * The scope of the IA32_QM_EVTSEL and IA32_QM_CTR MSRs is at the L3
+ * cache. So a "physical RMID" may be read from any CPU that shares
+ * the L3 cache with the desired SNC node, not just from a CPU in
+ * the specific SNC node.
+ */
+static int logical_rmid_to_physical_rmid(int cpu, int lrmid)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+
+ if (snc_nodes_per_l3_cache == 1)
+ return lrmid;
+
+ return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid;
+}
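A worked example of the translation, with the numbers assumed purely for illustration:

/*
 * Assume snc_nodes_per_l3_cache == 2 and r->num_rmid == 256. A CPU on
 * the second SNC node of its L3 cache (cpu_to_node(cpu) % 2 == 1) that
 * monitors with logical RMID 5 reads its counters via physical RMID
 *
 *	5 + (1 * 256) = 261
 *
 * which is the value loaded into IA32_QM_EVTSEL.
 */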
+
+static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val)
{
u64 msr_val;
@@ -158,7 +237,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
* IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
* are error bits.
*/
- wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid);
rdmsrl(MSR_IA32_QM_CTR, msr_val);
if (msr_val & RMID_VAL_ERROR)
@@ -170,7 +249,7 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
return 0;
}
-static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_dom,
u32 rmid,
enum resctrl_event_id eventid)
{
@@ -189,18 +268,22 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
return NULL;
}
-void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
- u32 rmid, enum resctrl_event_id eventid)
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid,
+ enum resctrl_event_id eventid)
{
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
struct arch_mbm_state *am;
+ u32 prmid;
am = get_arch_mbm_state(hw_dom, rmid, eventid);
if (am) {
memset(am, 0, sizeof(*am));
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
/* Record any initial, non-zero count value. */
- __rmid_read(rmid, eventid, &am->prev_msr);
+ __rmid_read_phys(prmid, eventid, &am->prev_msr);
}
}
@@ -208,9 +291,9 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
* Assumes that hardware counters are also reset and thus that there is
* no need to record initial non-zero counts.
*/
-void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_domain *d)
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d)
{
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
if (is_mbm_total_enabled())
memset(hw_dom->arch_mbm_total, 0,
@@ -229,19 +312,22 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
return chunks >> shift;
}
-int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
- u32 rmid, enum resctrl_event_id eventid, u64 *val)
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 unused, u32 rmid, enum resctrl_event_id eventid,
+ u64 *val, void *ignored)
{
+ struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d);
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
struct arch_mbm_state *am;
u64 msr_val, chunks;
+ u32 prmid;
int ret;
- if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
- return -EINVAL;
+ resctrl_arch_rmid_read_context_check();
- ret = __rmid_read(rmid, eventid, &msr_val);
+ prmid = logical_rmid_to_physical_rmid(cpu, rmid);
+ ret = __rmid_read_phys(prmid, eventid, &msr_val);
if (ret)
return ret;
@@ -260,20 +346,40 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
return 0;
}
+static void limbo_release_entry(struct rmid_entry *entry)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ rmid_limbo_count--;
+ list_add_tail(&entry->list, &rmid_free_lru);
+
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+ closid_num_dirty_rmid[entry->closid]--;
+}
+
/*
* Check the RMIDs that are marked as busy for this domain. If the
* reported LLC occupancy is below the threshold clear the busy bit and
* decrement the count. If the busy count gets to zero on an RMID, we
* free the RMID
*/
-void __check_limbo(struct rdt_domain *d, bool force_free)
+void __check_limbo(struct rdt_mon_domain *d, bool force_free)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
struct rmid_entry *entry;
- u32 crmid = 1, nrmid;
+ u32 idx, cur_idx = 1;
+ void *arch_mon_ctx;
bool rmid_dirty;
u64 val = 0;
+ arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
+ if (IS_ERR(arch_mon_ctx)) {
+ pr_warn_ratelimited("Failed to allocate monitor context: %ld",
+ PTR_ERR(arch_mon_ctx));
+ return;
+ }
+
/*
* Skip RMID 0 and start from RMID 1 and check all the RMIDs that
* are marked as busy for occupancy < threshold. If the occupancy
@@ -281,101 +387,187 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
* RMID and move it to the free list when the counter reaches 0.
*/
for (;;) {
- nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid);
- if (nrmid >= r->num_rmid)
+ idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
+ if (idx >= idx_limit)
break;
- entry = __rmid_entry(nrmid);
-
- if (resctrl_arch_rmid_read(r, d, entry->rmid,
- QOS_L3_OCCUP_EVENT_ID, &val)) {
+ entry = __rmid_entry(idx);
+ if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
+ QOS_L3_OCCUP_EVENT_ID, &val,
+ arch_mon_ctx)) {
rmid_dirty = true;
} else {
rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
+
+ /*
+ * x86's CLOSID and RMID are independent numbers, so the entry's
+ * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
+ * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
+ * used to select the configuration. It is thus necessary to track both
+ * CLOSID and RMID because there may be dependencies between them
+ * on some architectures.
+ */
+ trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val);
}
if (force_free || !rmid_dirty) {
- clear_bit(entry->rmid, d->rmid_busy_llc);
- if (!--entry->busy) {
- rmid_limbo_count--;
- list_add_tail(&entry->list, &rmid_free_lru);
- }
+ clear_bit(idx, d->rmid_busy_llc);
+ if (!--entry->busy)
+ limbo_release_entry(entry);
}
- crmid = nrmid + 1;
+ cur_idx = idx + 1;
}
+
+ resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
}
-bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
+bool has_busy_rmid(struct rdt_mon_domain *d)
{
- return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid;
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+
+ return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
+}
+
+static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
+{
+ struct rmid_entry *itr;
+ u32 itr_idx, cmp_idx;
+
+ if (list_empty(&rmid_free_lru))
+ return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);
+
+ list_for_each_entry(itr, &rmid_free_lru, list) {
+ /*
+ * Get the index of this free RMID, and the index it would need
+ * to be if it were used with this CLOSID.
+ * If the CLOSID is irrelevant on this architecture, the two
+ * index values are always the same on every entry and thus the
+ * very first entry will be returned.
+ */
+ itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
+ cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);
+
+ if (itr_idx == cmp_idx)
+ return itr;
+ }
+
+ return ERR_PTR(-ENOSPC);
+}
+
+/**
+ * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
+ * RMID are clean, or the CLOSID that has
+ * the most clean RMID.
+ *
+ * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID
+ * may not be able to allocate clean RMID. To avoid this the allocator will
+ * choose the CLOSID with the most clean RMID.
+ *
+ * When the CLOSID and RMID are independent numbers, the first free CLOSID will
+ * be returned.
+ */
+int resctrl_find_cleanest_closid(void)
+{
+ u32 cleanest_closid = ~0;
+ int i = 0;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+ return -EIO;
+
+ for (i = 0; i < closids_supported(); i++) {
+ int num_dirty;
+
+ if (closid_allocated(i))
+ continue;
+
+ num_dirty = closid_num_dirty_rmid[i];
+ if (num_dirty == 0)
+ return i;
+
+ if (cleanest_closid == ~0)
+ cleanest_closid = i;
+
+ if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
+ cleanest_closid = i;
+ }
+
+ if (cleanest_closid == ~0)
+ return -ENOSPC;
+
+ return cleanest_closid;
}
/*
- * As of now the RMIDs allocation is global.
- * However we keep track of which packages the RMIDs
- * are used to optimize the limbo list management.
+ * For MPAM the RMID value is not unique, and has to be considered with
+ * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
+ * allows all domains to be managed by a single free list.
+ * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
*/
-int alloc_rmid(void)
+int alloc_rmid(u32 closid)
{
struct rmid_entry *entry;
lockdep_assert_held(&rdtgroup_mutex);
- if (list_empty(&rmid_free_lru))
- return rmid_limbo_count ? -EBUSY : -ENOSPC;
+ entry = resctrl_find_free_rmid(closid);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
- entry = list_first_entry(&rmid_free_lru,
- struct rmid_entry, list);
list_del(&entry->list);
-
return entry->rmid;
}
static void add_rmid_to_limbo(struct rmid_entry *entry)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- struct rdt_domain *d;
- int cpu, err;
- u64 val = 0;
+ struct rdt_mon_domain *d;
+ u32 idx;
- entry->busy = 0;
- cpu = get_cpu();
- list_for_each_entry(d, &r->domains, list) {
- if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
- err = resctrl_arch_rmid_read(r, d, entry->rmid,
- QOS_L3_OCCUP_EVENT_ID,
- &val);
- if (err || val <= resctrl_rmid_realloc_threshold)
- continue;
- }
+ lockdep_assert_held(&rdtgroup_mutex);
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
+
+ entry->busy = 0;
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
/*
* For the first limbo RMID in the domain,
* set up the limbo worker.
*/
- if (!has_busy_rmid(r, d))
- cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
- set_bit(entry->rmid, d->rmid_busy_llc);
+ if (!has_busy_rmid(d))
+ cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
+ RESCTRL_PICK_ANY_CPU);
+ set_bit(idx, d->rmid_busy_llc);
entry->busy++;
}
- put_cpu();
- if (entry->busy)
- rmid_limbo_count++;
- else
- list_add_tail(&entry->list, &rmid_free_lru);
+ rmid_limbo_count++;
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+ closid_num_dirty_rmid[entry->closid]++;
}
-void free_rmid(u32 rmid)
+void free_rmid(u32 closid, u32 rmid)
{
+ u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
struct rmid_entry *entry;
- if (!rmid)
- return;
-
lockdep_assert_held(&rdtgroup_mutex);
- entry = __rmid_entry(rmid);
+ /*
+ * Do not allow the default rmid to be freed. Comparing by index
+ * allows architectures that ignore the closid parameter to avoid an
+ * unnecessary check.
+ */
+ if (!resctrl_arch_mon_capable() ||
+ idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
+ RESCTRL_RESERVED_RMID))
+ return;
+
+ entry = __rmid_entry(idx);
if (is_llc_occupancy_enabled())
add_rmid_to_limbo(entry);
@@ -383,44 +575,84 @@ void free_rmid(u32 rmid)
list_add_tail(&entry->list, &rmid_free_lru);
}
-static struct mbm_state *get_mbm_state(struct rdt_domain *d, u32 rmid,
- enum resctrl_event_id evtid)
+static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
+ u32 rmid, enum resctrl_event_id evtid)
{
+ u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
+
switch (evtid) {
case QOS_L3_MBM_TOTAL_EVENT_ID:
- return &d->mbm_total[rmid];
+ return &d->mbm_total[idx];
case QOS_L3_MBM_LOCAL_EVENT_ID:
- return &d->mbm_local[rmid];
+ return &d->mbm_local[idx];
default:
return NULL;
}
}
-static int __mon_event_count(u32 rmid, struct rmid_read *rr)
+static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
+ int cpu = smp_processor_id();
+ struct rdt_mon_domain *d;
struct mbm_state *m;
+ int err, ret;
u64 tval = 0;
if (rr->first) {
- resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
- m = get_mbm_state(rr->d, rmid, rr->evtid);
+ resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
+ m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
if (m)
memset(m, 0, sizeof(struct mbm_state));
return 0;
}
- rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval);
- if (rr->err)
- return rr->err;
+ if (rr->d) {
+ /* Reading a single domain, must be on a CPU in that domain. */
+ if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
+ return -EINVAL;
+ rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
+ rr->evtid, &tval, rr->arch_mon_ctx);
+ if (rr->err)
+ return rr->err;
- rr->val += tval;
+ rr->val += tval;
- return 0;
+ return 0;
+ }
+
+ /* Summing domains that share a cache, must be on a CPU for that cache. */
+ if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map))
+ return -EINVAL;
+
+ /*
+ * Legacy files must report the sum of an event across all
+ * domains that share the same L3 cache instance.
+ * Report success if a read from any domain succeeds, -EINVAL
+ * (translated to "Unavailable" for user space) if reading from
+ * all domains fails for any reason.
+ */
+ ret = -EINVAL;
+ list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
+ if (d->ci->id != rr->ci->id)
+ continue;
+ err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
+ rr->evtid, &tval, rr->arch_mon_ctx);
+ if (!err) {
+ rr->val += tval;
+ ret = 0;
+ }
+ }
+
+ if (ret)
+ rr->err = ret;
+
+ return ret;
}
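With Sub-NUMA Clustering, several monitor domains share one L3 instance and the legacy event files report their sum; a read counts as successful if any constituent domain could be read. A rough user-space model of that policy (illustrative types and values, not the kernel structures):

#include <stdio.h>

struct mon_domain {
	int ci_id;		/* id of the L3 cache this SNC node belongs to */
	long long val;		/* counter value; < 0 models a failed read */
};

/* Sum one event over all domains that share cache instance @ci_id. */
static int sum_l3_event(const struct mon_domain *doms, int n, int ci_id,
			long long *sum)
{
	int i, ret = -1;	/* stays "Unavailable" until one read works */

	*sum = 0;
	for (i = 0; i < n; i++) {
		if (doms[i].ci_id != ci_id)
			continue;
		if (doms[i].val < 0)	/* this node's read failed */
			continue;
		*sum += doms[i].val;
		ret = 0;
	}
	return ret;
}

int main(void)
{
	/* Two SNC nodes under L3 #0, one under L3 #1. */
	struct mon_domain doms[] = {
		{ .ci_id = 0, .val = 1000 },
		{ .ci_id = 0, .val = -1 },	/* e.g. counter still unavailable */
		{ .ci_id = 1, .val = 500 },
	};
	long long sum;

	if (!sum_l3_event(doms, 3, 0, &sum))
		printf("L3 #0 total: %lld\n", sum);	/* 1000 */
	return 0;
}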
/*
* mbm_bw_count() - Update bw count from values previously read by
* __mon_event_count().
+ * @closid: The closid used to identify the cached mbm_state.
* @rmid: The rmid used to identify the cached mbm_state.
* @rr: The struct rmid_read populated by __mon_event_count().
*
@@ -429,10 +661,14 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
* __mon_event_count() is compared with the chunks value from the previous
* invocation. This must be called once per second to maintain values in MBps.
*/
-static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
+static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
- struct mbm_state *m = &rr->d->mbm_local[rmid];
u64 cur_bw, bytes, cur_bytes;
+ struct mbm_state *m;
+
+ m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
+ if (WARN_ON_ONCE(!m))
+ return;
cur_bytes = rr->val;
bytes = cur_bytes - m->prev_bw_bytes;
@@ -440,14 +676,11 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
cur_bw = bytes / SZ_1M;
- if (m->delta_comp)
- m->delta_bw = abs(cur_bw - m->prev_bw);
- m->delta_comp = false;
m->prev_bw = cur_bw;
}
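Because the overflow handler fires once per second, the byte delta over one interval is already a per-second rate and only needs scaling to MB. A back-of-the-envelope sketch with made-up readings:

#include <stdio.h>

#define SZ_1M (1024 * 1024)

int main(void)
{
	/* Two successive once-per-second MBM byte readings (assumed values). */
	unsigned long long prev_bytes = 10ULL * SZ_1M;
	unsigned long long cur_bytes  = 250ULL * SZ_1M;

	/* One-second sampling interval, so bytes per interval is already MB/s. */
	unsigned long long cur_bw = (cur_bytes - prev_bytes) / SZ_1M;

	printf("cur_bw = %llu MBps\n", cur_bw);	/* 240 */
	return 0;
}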
/*
- * This is called via IPI to read the CQM/MBM counters
+ * This is scheduled by mon_event_read() to read the CQM/MBM counters
* on a domain.
*/
void mon_event_count(void *info)
@@ -459,7 +692,7 @@ void mon_event_count(void *info)
rdtgrp = rr->rgrp;
- ret = __mon_event_count(rdtgrp->mon.rmid, rr);
+ ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
/*
* For Ctrl groups read data from child monitor groups and
@@ -470,7 +703,8 @@ void mon_event_count(void *info)
if (rdtgrp->type == RDTCTRL_GROUP) {
list_for_each_entry(entry, head, mon.crdtgrp_list) {
- if (__mon_event_count(entry->mon.rmid, rr) == 0)
+ if (__mon_event_count(entry->closid, entry->mon.rmid,
+ rr) == 0)
ret = 0;
}
}
@@ -516,26 +750,27 @@ void mon_event_count(void *info)
* throttle MSRs already have low percentage values. To avoid
* unnecessarily restricting such rdtgroups, we also increase the bandwidth.
*/
-static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
{
u32 closid, rmid, cur_msr_val, new_msr_val;
struct mbm_state *pmbm_data, *cmbm_data;
- u32 cur_bw, delta_bw, user_bw;
+ struct rdt_ctrl_domain *dom_mba;
+ enum resctrl_event_id evt_id;
struct rdt_resource *r_mba;
- struct rdt_domain *dom_mba;
struct list_head *head;
struct rdtgroup *entry;
-
- if (!is_mbm_local_enabled())
- return;
+ u32 cur_bw, user_bw;
r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+ evt_id = rgrp->mba_mbps_event;
closid = rgrp->closid;
rmid = rgrp->mon.rmid;
- pmbm_data = &dom_mbm->mbm_local[rmid];
+ pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
+ if (WARN_ON_ONCE(!pmbm_data))
+ return;
- dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+ dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
if (!dom_mba) {
pr_warn_once("Failure to get domain for MBA update\n");
return;
@@ -543,7 +778,6 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
cur_bw = pmbm_data->prev_bw;
user_bw = dom_mba->mbps_val[closid];
- delta_bw = pmbm_data->delta_bw;
/* MBA resource doesn't support CDP */
cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
@@ -553,83 +787,76 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
*/
head = &rgrp->mon.crdtgrp_list;
list_for_each_entry(entry, head, mon.crdtgrp_list) {
- cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+ cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
+ if (WARN_ON_ONCE(!cmbm_data))
+ return;
cur_bw += cmbm_data->prev_bw;
- delta_bw += cmbm_data->delta_bw;
}
/*
* Scale up/down the bandwidth linearly for the ctrl group. The
* bandwidth step is the bandwidth granularity specified by the
* hardware.
- *
- * The delta_bw is used when increasing the bandwidth so that we
- * dont alternately increase and decrease the control values
- * continuously.
- *
- * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
- * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
- * switching between 90 and 110 continuously if we only check
- * cur_bw < user_bw.
+ * Always increase throttling if current bandwidth is above the
+ * target set by the user.
+ * But avoid thrashing up and down on every poll by checking
+ * whether a decrease in throttling is likely to push the group
+ * back over target. E.g. if currently throttling to 30% of bandwidth
+ * on a system with 10% granularity steps, check whether moving to
+ * 40% would go past the limit by multiplying current bandwidth by
+ * "(30 + 10) / 30".
*/
if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
} else if (cur_msr_val < MAX_MBA_BW &&
- (user_bw > (cur_bw + delta_bw))) {
+ (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
} else {
return;
}
resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
-
- /*
- * Delta values are updated dynamically package wise for each
- * rdtgrp every time the throttle MSR changes value.
- *
- * This is because (1)the increase in bandwidth is not perfectly
- * linear and only "approximately" linear even when the hardware
- * says it is linear.(2)Also since MBA is a core specific
- * mechanism, the delta values vary based on number of cores used
- * by the rdtgrp.
- */
- pmbm_data->delta_comp = true;
- list_for_each_entry(entry, head, mon.crdtgrp_list) {
- cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
- cmbm_data->delta_comp = true;
- }
}
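Plugging numbers into the example in the comment: throttled to 30% with a 10% step and 900 MBps measured, relaxing one step projects roughly 900 * (30 + 10) / 30 = 1200 MBps, so a 1100 MBps target keeps the current setting while a 1300 MBps target allows the step. A small worked sketch with those assumed values (it mirrors the comment's arithmetic, not the exact kernel expression):

#include <stdio.h>

int main(void)
{
	unsigned int cur_msr_val = 30;	/* currently throttled to 30% */
	unsigned int step = 10;		/* hardware bandwidth granularity */
	unsigned int cur_bw = 900;	/* measured MBps at 30% */
	unsigned int user_bw = 1100;	/* user's mba_MBps target */

	/* Projected bandwidth if throttling is relaxed by one step. */
	unsigned int projected = cur_bw * (cur_msr_val + step) / cur_msr_val;

	if (user_bw > projected)
		printf("relax to %u%%\n", cur_msr_val + step);
	else
		printf("stay at %u%% (projected %u MBps exceeds target)\n",
		       cur_msr_val, projected);
	return 0;
}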
-static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
+static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 closid, u32 rmid, enum resctrl_event_id evtid)
{
- struct rmid_read rr;
+ struct rmid_read rr = {0};
- rr.first = false;
rr.r = r;
rr.d = d;
+ rr.evtid = evtid;
+ rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
+ if (IS_ERR(rr.arch_mon_ctx)) {
+ pr_warn_ratelimited("Failed to allocate monitor context: %ld",
+ PTR_ERR(rr.arch_mon_ctx));
+ return;
+ }
+
+ __mon_event_count(closid, rmid, &rr);
/*
- * This is protected from concurrent reads from user
- * as both the user and we hold the global mutex.
+ * If the software controller is enabled, compute the
+ * bandwidth for this event id.
*/
- if (is_mbm_total_enabled()) {
- rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
- rr.val = 0;
- __mon_event_count(rmid, &rr);
- }
- if (is_mbm_local_enabled()) {
- rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
- rr.val = 0;
- __mon_event_count(rmid, &rr);
+ if (is_mba_sc(NULL))
+ mbm_bw_count(closid, rmid, &rr);
- /*
- * Call the MBA software controller only for the
- * control groups and when user has enabled
- * the software controller explicitly.
- */
- if (is_mba_sc(NULL))
- mbm_bw_count(rmid, &rr);
- }
+ resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
+}
+
+static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 closid, u32 rmid)
+{
+ /*
+ * This is protected from concurrent reads from user as both
+ * the user and overflow handler hold the global mutex.
+ */
+ if (is_mbm_total_enabled())
+ mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+ if (is_mbm_local_enabled())
+ mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
}
/*
@@ -639,106 +866,193 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
void cqm_handle_limbo(struct work_struct *work)
{
unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
- int cpu = smp_processor_id();
- struct rdt_resource *r;
- struct rdt_domain *d;
+ struct rdt_mon_domain *d;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- d = container_of(work, struct rdt_domain, cqm_limbo.work);
+ d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);
__check_limbo(d, false);
- if (has_busy_rmid(r, d))
- schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
+ if (has_busy_rmid(d)) {
+ d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
+ RESCTRL_PICK_ANY_CPU);
+ schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
+ delay);
+ }
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
}
-void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
+/**
+ * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
+ * domain.
+ * @dom: The domain the limbo handler should run for.
+ * @delay_ms: How far in the future the handler should run.
+ * @exclude_cpu: Which CPU the handler should not run on,
+ * RESCTRL_PICK_ANY_CPU to pick any CPU.
+ */
+void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
+ int exclude_cpu)
{
unsigned long delay = msecs_to_jiffies(delay_ms);
int cpu;
- cpu = cpumask_any(&dom->cpu_mask);
+ cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
dom->cqm_work_cpu = cpu;
- schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
+ if (cpu < nr_cpu_ids)
+ schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
}
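The cpu < nr_cpu_ids guard covers the case where every housekeeping CPU in the domain is excluded and there is nowhere left to schedule the work. A toy "pick any set bit except the excluded one" model of that selection (plain bitmask, not the kernel cpumask API):

#include <stdio.h>

#define NR_CPUS 8

/* Return the first CPU set in @mask other than @exclude, or NR_CPUS if none. */
static int pick_any_cpu(unsigned int mask, int exclude)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu == exclude)
			continue;
		if (mask & (1u << cpu))
			return cpu;
	}
	return NR_CPUS;
}

int main(void)
{
	unsigned int domain_mask = 0x0c;	/* CPUs 2 and 3 */

	printf("%d\n", pick_any_cpu(domain_mask, -1));	/* 2 */
	printf("%d\n", pick_any_cpu(domain_mask, 2));	/* 3 */
	/* Only CPU 2 in the domain and it is excluded: nothing to schedule on. */
	printf("%d\n", pick_any_cpu(0x04, 2));		/* 8 == NR_CPUS */
	return 0;
}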
void mbm_handle_overflow(struct work_struct *work)
{
unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
struct rdtgroup *prgrp, *crgrp;
- int cpu = smp_processor_id();
+ struct rdt_mon_domain *d;
struct list_head *head;
struct rdt_resource *r;
- struct rdt_domain *d;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
- if (!static_branch_likely(&rdt_mon_enable_key))
+ /*
+ * If the filesystem has been unmounted this work no longer needs to
+ * run.
+ */
+ if (!resctrl_mounted || !resctrl_arch_mon_capable())
goto out_unlock;
r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- d = container_of(work, struct rdt_domain, mbm_over.work);
+ d = container_of(work, struct rdt_mon_domain, mbm_over.work);
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- mbm_update(r, d, prgrp->mon.rmid);
+ mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list)
- mbm_update(r, d, crgrp->mon.rmid);
+ mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
if (is_mba_sc(NULL))
update_mba_bw(prgrp, d);
}
- schedule_delayed_work_on(cpu, &d->mbm_over, delay);
+ /*
+ * Re-check for housekeeping CPUs. This allows the overflow handler to
+ * move off a nohz_full CPU quickly.
+ */
+ d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
+ RESCTRL_PICK_ANY_CPU);
+ schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
out_unlock:
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
}
-void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
+/**
+ * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
+ * domain.
+ * @dom: The domain the overflow handler should run for.
+ * @delay_ms: How far in the future the handler should run.
+ * @exclude_cpu: Which CPU the handler should not run on,
+ * RESCTRL_PICK_ANY_CPU to pick any CPU.
+ */
+void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
+ int exclude_cpu)
{
unsigned long delay = msecs_to_jiffies(delay_ms);
int cpu;
- if (!static_branch_likely(&rdt_mon_enable_key))
+ /*
+ * When a domain comes online there is no guarantee the filesystem is
+ * mounted. If not, there is no need to catch counter overflow.
+ */
+ if (!resctrl_mounted || !resctrl_arch_mon_capable())
return;
- cpu = cpumask_any(&dom->cpu_mask);
+ cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
dom->mbm_work_cpu = cpu;
- schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
+
+ if (cpu < nr_cpu_ids)
+ schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
}
static int dom_data_init(struct rdt_resource *r)
{
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+ u32 num_closid = resctrl_arch_get_num_closid(r);
struct rmid_entry *entry = NULL;
- int i, nr_rmids;
+ int err = 0, i;
+ u32 idx;
- nr_rmids = r->num_rmid;
- rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
- if (!rmid_ptrs)
- return -ENOMEM;
+ mutex_lock(&rdtgroup_mutex);
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ u32 *tmp;
- for (i = 0; i < nr_rmids; i++) {
+ /*
+ * If the architecture hasn't provided a sanitised value here,
+ * this may result in larger arrays than necessary. Resctrl will
+ * use a smaller system wide value based on the resources in
+ * use.
+ */
+ tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ closid_num_dirty_rmid = tmp;
+ }
+
+ rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
+ if (!rmid_ptrs) {
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ kfree(closid_num_dirty_rmid);
+ closid_num_dirty_rmid = NULL;
+ }
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < idx_limit; i++) {
entry = &rmid_ptrs[i];
INIT_LIST_HEAD(&entry->list);
- entry->rmid = i;
+ resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
list_add_tail(&entry->list, &rmid_free_lru);
}
/*
- * RMID 0 is special and is always allocated. It's used for all
- * tasks that are not monitored.
+ * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
+ * are always allocated. These are used for the rdtgroup_default
+ * control group, which will be set up later in rdtgroup_init().
*/
- entry = __rmid_entry(0);
+ idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
+ RESCTRL_RESERVED_RMID);
+ entry = __rmid_entry(idx);
list_del(&entry->list);
- return 0;
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return err;
+}
+
+static void __exit dom_data_exit(void)
+{
+ mutex_lock(&rdtgroup_mutex);
+
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ kfree(closid_num_dirty_rmid);
+ closid_num_dirty_rmid = NULL;
+ }
+
+ kfree(rmid_ptrs);
+ rmid_ptrs = NULL;
+
+ mutex_unlock(&rdtgroup_mutex);
}
static struct mon_evt llc_occupancy_event = {
@@ -775,6 +1089,89 @@ static void l3_mon_evt_init(struct rdt_resource *r)
list_add_tail(&mbm_local_event.list, &r->evt_list);
}
+/*
+ * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1
+ * which indicates that RMIDs are configured in legacy mode.
+ * This mode is incompatible with Linux resctrl semantics
+ * as RMIDs are partitioned between SNC nodes, which requires
+ * a user to know which RMID is allocated to a task.
+ * Clearing bit 0 reconfigures the RMID counters for use
+ * in RMID sharing mode. This mode is better for Linux.
+ * The RMID space is divided between all SNC nodes with the
+ * RMIDs renumbered to start from zero in each node when
+ * counting operations from tasks. Code to read the counters
+ * must adjust RMID counter numbers based on SNC node. See
+ * logical_rmid_to_physical_rmid() for code that does this.
+ */
+void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ if (snc_nodes_per_l3_cache > 1)
+ msr_clear_bit(MSR_RMID_SNC_CONFIG, 0);
+}
+
+/* CPU models that support MSR_RMID_SNC_CONFIG */
+static const struct x86_cpu_id snc_cpu_ids[] __initconst = {
+ X86_MATCH_VFM(INTEL_ICELAKE_X, 0),
+ X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, 0),
+ X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, 0),
+ {}
+};
+
+/*
+ * There isn't a simple hardware bit that indicates whether a CPU is running
+ * in Sub-NUMA Cluster (SNC) mode. Infer the state by comparing the
+ * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in
+ * the same NUMA node as CPU0.
+ * It is not possible to accurately determine SNC state if the system is
+ * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes
+ * to L3 caches. It will be OK if the system is booted with hyperthreading
+ * disabled (since this doesn't affect the ratio).
+ */
+static __init int snc_get_config(void)
+{
+ struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE);
+ const cpumask_t *node0_cpumask;
+ int cpus_per_node, cpus_per_l3;
+ int ret;
+
+ if (!x86_match_cpu(snc_cpu_ids) || !ci)
+ return 1;
+
+ cpus_read_lock();
+ if (num_online_cpus() != num_present_cpus())
+ pr_warn("Some CPUs offline, SNC detection may be incorrect\n");
+ cpus_read_unlock();
+
+ node0_cpumask = cpumask_of_node(cpu_to_node(0));
+
+ cpus_per_node = cpumask_weight(node0_cpumask);
+ cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map);
+
+ if (!cpus_per_node || !cpus_per_l3)
+ return 1;
+
+ ret = cpus_per_l3 / cpus_per_node;
+
+ /* sanity check: Only valid results are 1, 2, 3, 4, 6 */
+ switch (ret) {
+ case 1:
+ break;
+ case 2 ... 4:
+ case 6:
+ pr_info("Sub-NUMA Cluster mode detected with %d nodes per L3 cache\n", ret);
+ rdt_resources_all[RDT_RESOURCE_L3].r_resctrl.mon_scope = RESCTRL_L3_NODE;
+ break;
+ default:
+ pr_warn("Ignore improbable SNC node count %d\n", ret);
+ ret = 1;
+ break;
+ }
+
+ return ret;
+}
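As a worked example of the ratio: a socket with 48 cores, hyperthreading enabled and SNC2 on would give cpus_per_l3 = 96 and cpus_per_node = 48, i.e. two SNC nodes per L3; with SNC off the ratio is 96 / 96 = 1. A trivial sketch of the division plus the plausibility filter, with hypothetical CPU counts:

#include <stdio.h>

static int snc_nodes(int cpus_per_l3, int cpus_per_node)
{
	int ret = cpus_per_l3 / cpus_per_node;

	switch (ret) {
	case 1: case 2: case 3: case 4: case 6:
		return ret;		/* plausible SNC configurations */
	default:
		return 1;		/* improbable ratio, assume no SNC */
	}
}

int main(void)
{
	printf("%d\n", snc_nodes(96, 48));	/* SNC2 -> 2 */
	printf("%d\n", snc_nodes(96, 96));	/* SNC off -> 1 */
	printf("%d\n", snc_nodes(96, 19));	/* ratio 5, ignored -> 1 */
	return 0;
}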
+
int __init rdt_get_mon_l3_config(struct rdt_resource *r)
{
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
@@ -782,9 +1179,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
unsigned int threshold;
int ret;
+ snc_nodes_per_l3_cache = snc_get_config();
+
resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
- hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
- r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
+ hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache;
+ r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache;
hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
@@ -813,13 +1212,21 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
return ret;
if (rdt_cpu_has(X86_FEATURE_BMEC)) {
+ u32 eax, ebx, ecx, edx;
+
+ /* Detect list of bandwidth sources that can be tracked */
+ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx);
+ hw_res->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS;
+
if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
mbm_total_event.configurable = true;
- mbm_config_rftype_init("mbm_total_bytes_config");
+ resctrl_file_fflags_init("mbm_total_bytes_config",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
}
if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
mbm_local_event.configurable = true;
- mbm_config_rftype_init("mbm_local_bytes_config");
+ resctrl_file_fflags_init("mbm_local_bytes_config",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
}
}
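Sub-leaf 3 of leaf 0x80000020 enumerates which bandwidth sources BMEC can track, one bit per source in ECX, and the stored mask later bounds writes to the mbm_*_bytes_config files. The same sub-leaf can be peeked at from user space with the compiler's cpuid.h helper; a sketch that assumes hardware implementing the leaf and uses an illustrative 7-bit mask in place of MAX_EVT_CONFIG_BITS:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID Fn8000_0020, sub-function 3: supported bandwidth sources. */
	if (!__get_cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx)) {
		puts("leaf not supported");
		return 1;
	}
	/* Keep the low event-configuration bits, as the kernel does with its mask. */
	printf("bandwidth source mask: 0x%02x\n", ecx & 0x7f);
	return 0;
}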
@@ -830,6 +1237,11 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)
return 0;
}
+void __exit rdt_put_mon_l3_config(void)
+{
+ dom_data_exit();
+}
+
void __init intel_rdt_mbm_apply_quirk(void)
{
int cf_index;
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 8f559eeae08e..42cc162f7fc9 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -11,7 +11,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
@@ -23,7 +22,7 @@
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/resctrl.h>
#include <asm/perf_event.h>
@@ -31,7 +30,7 @@
#include "internal.h"
#define CREATE_TRACE_POINTS
-#include "pseudo_lock_event.h"
+#include "trace.h"
/*
* The bits needed to disable hardware prefetching varies based on the
@@ -88,8 +87,8 @@ static u64 get_prefetch_disable_bits(void)
boot_cpu_data.x86 != 6)
return 0;
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -100,8 +99,8 @@ static u64 get_prefetch_disable_bits(void)
* 63:4 Reserved
*/
return 0xF;
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
/*
* SDM defines bits of MSR_MISC_FEATURE_CONTROL register
* as:
@@ -221,7 +220,7 @@ static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
int cpu;
int ret;
- for_each_cpu(cpu, &plr->d->cpu_mask) {
+ for_each_cpu(cpu, &plr->d->hdr.cpu_mask) {
pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
if (!pm_req) {
rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
@@ -292,12 +291,15 @@ static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
*/
static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
{
- struct cpu_cacheinfo *ci;
+ enum resctrl_scope scope = plr->s->res->ctrl_scope;
+ struct cacheinfo *ci;
int ret;
- int i;
+
+ if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE))
+ return -ENODEV;
/* Pick the first cpu we find that is associated with the cache. */
- plr->cpu = cpumask_first(&plr->d->cpu_mask);
+ plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask);
if (!cpu_online(plr->cpu)) {
rdt_last_cmd_printf("CPU %u associated with cache not online\n",
@@ -306,15 +308,11 @@ static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
goto out_region;
}
- ci = get_cpu_cacheinfo(plr->cpu);
-
- plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
-
- for (i = 0; i < ci->num_leaves; i++) {
- if (ci->info_list[i].level == plr->s->res->cache_level) {
- plr->line_size = ci->info_list[i].coherency_line_size;
- return 0;
- }
+ ci = get_cpu_cacheinfo_level(plr->cpu, scope);
+ if (ci) {
+ plr->line_size = ci->coherency_line_size;
+ plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
+ return 0;
}
ret = -1;
@@ -461,7 +459,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
* increase likelihood that allocated cache portion will be filled
* with associated memory.
*/
- native_wbinvd();
+ wbinvd();
/*
* Always called with interrupts enabled. By disabling interrupts
@@ -581,7 +579,7 @@ static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
if (ret)
goto err_cpus;
- if (rdt_mon_capable) {
+ if (resctrl_arch_mon_capable()) {
ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
if (ret)
goto err_cpus_list;
@@ -628,7 +626,7 @@ static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
if (ret)
goto err_cpus;
- if (rdt_mon_capable) {
+ if (resctrl_arch_mon_capable()) {
ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
if (ret)
goto err_cpus_list;
@@ -752,7 +750,7 @@ int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
* anymore when this group would be used for pseudo-locking. This
* is safe to call on platforms not capable of monitoring.
*/
- free_rmid(rdtgrp->mon.rmid);
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
ret = 0;
goto out;
@@ -776,8 +774,8 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
{
int ret;
- if (rdt_mon_capable) {
- ret = alloc_rmid();
+ if (resctrl_arch_mon_capable()) {
+ ret = alloc_rmid(rdtgrp->closid);
if (ret < 0) {
rdt_last_cmd_puts("Out of RMIDs\n");
return ret;
@@ -787,7 +785,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
ret = rdtgroup_locksetup_user_restore(rdtgrp);
if (ret) {
- free_rmid(rdtgrp->mon.rmid);
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
return ret;
}
@@ -810,7 +808,7 @@ int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
* Return: true if @cbm overlaps with pseudo-locked region on @d, false
* otherwise.
*/
-bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm)
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
{
unsigned int cbm_len;
unsigned long cbm_b;
@@ -837,13 +835,16 @@ bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, unsigned long cbm
* if it is not possible to test due to memory allocation issue,
* false otherwise.
*/
-bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
{
+ struct rdt_ctrl_domain *d_i;
cpumask_var_t cpu_with_psl;
struct rdt_resource *r;
- struct rdt_domain *d_i;
bool ret = false;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
return true;
@@ -852,10 +853,10 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
* associated with them.
*/
for_each_alloc_capable_rdt_resource(r) {
- list_for_each_entry(d_i, &r->domains, list) {
+ list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) {
if (d_i->plr)
cpumask_or(cpu_with_psl, cpu_with_psl,
- &d_i->cpu_mask);
+ &d_i->hdr.cpu_mask);
}
}
@@ -863,7 +864,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
* Next test if new pseudo-locked region would intersect with
* existing region.
*/
- if (cpumask_intersects(&d->cpu_mask, cpu_with_psl))
+ if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl))
ret = true;
free_cpumask_var(cpu_with_psl);
@@ -1081,9 +1082,9 @@ static int measure_l2_residency(void *_plr)
* L2_HIT 02H
* L2_MISS 10H
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_ATOM_GOLDMONT:
- case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_ATOM_GOLDMONT:
+ case INTEL_ATOM_GOLDMONT_PLUS:
perf_miss_attr.config = X86_CONFIG(.event = 0xd1,
.umask = 0x10);
perf_hit_attr.config = X86_CONFIG(.event = 0xd1,
@@ -1120,8 +1121,8 @@ static int measure_l3_residency(void *_plr)
* MISS 41H
*/
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_BROADWELL_X:
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_BROADWELL_X:
/* On BDW the hit event counts references, not hits */
perf_hit_attr.config = X86_CONFIG(.event = 0x2e,
.umask = 0x4f);
@@ -1139,7 +1140,7 @@ static int measure_l3_residency(void *_plr)
*/
counts.miss_after -= counts.miss_before;
- if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X) {
+ if (boot_cpu_data.x86_vfm == INTEL_BROADWELL_X) {
/*
* On BDW references and misses are counted, need to adjust.
* Sometimes the "hits" counter is a bit more than the
@@ -1195,7 +1196,7 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
}
plr->thread_done = 0;
- cpu = cpumask_first(&plr->d->cpu_mask);
+ cpu = cpumask_first(&plr->d->hdr.cpu_mask);
if (!cpu_online(cpu)) {
ret = -ENODEV;
goto out;
@@ -1204,20 +1205,14 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
plr->cpu = cpu;
if (sel == 1)
- thread = kthread_create_on_node(measure_cycles_lat_fn, plr,
- cpu_to_node(cpu),
- "pseudo_lock_measure/%u",
- cpu);
+ thread = kthread_run_on_cpu(measure_cycles_lat_fn, plr,
+ cpu, "pseudo_lock_measure/%u");
else if (sel == 2)
- thread = kthread_create_on_node(measure_l2_residency, plr,
- cpu_to_node(cpu),
- "pseudo_lock_measure/%u",
- cpu);
+ thread = kthread_run_on_cpu(measure_l2_residency, plr,
+ cpu, "pseudo_lock_measure/%u");
else if (sel == 3)
- thread = kthread_create_on_node(measure_l3_residency, plr,
- cpu_to_node(cpu),
- "pseudo_lock_measure/%u",
- cpu);
+ thread = kthread_run_on_cpu(measure_l3_residency, plr,
+ cpu, "pseudo_lock_measure/%u");
else
goto out;
@@ -1225,8 +1220,6 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
ret = PTR_ERR(thread);
goto out;
}
- kthread_bind(thread, cpu);
- wake_up_process(thread);
ret = wait_event_interruptible(plr->lock_thread_wq,
plr->thread_done == 1);
@@ -1314,18 +1307,14 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
plr->thread_done = 0;
- thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp,
- cpu_to_node(plr->cpu),
- "pseudo_lock/%u", plr->cpu);
+ thread = kthread_run_on_cpu(pseudo_lock_fn, rdtgrp,
+ plr->cpu, "pseudo_lock/%u");
if (IS_ERR(thread)) {
ret = PTR_ERR(thread);
rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
goto out_cstates;
}
- kthread_bind(thread, plr->cpu);
- wake_up_process(thread);
-
ret = wait_event_interruptible(plr->lock_thread_wq,
plr->thread_done == 1);
if (ret < 0) {
@@ -1525,7 +1514,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
* may be scheduled elsewhere and invalidate entries in the
* pseudo-locked region.
*/
- if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
+ if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) {
mutex_unlock(&rdtgroup_mutex);
return -EINVAL;
}
@@ -1566,7 +1555,6 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
static const struct file_operations pseudo_lock_dev_fops = {
.owner = THIS_MODULE,
- .llseek = no_llseek,
.read = NULL,
.write = NULL,
.open = pseudo_lock_dev_open,
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 69a1de92384a..6419e04d8a7b 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -12,7 +12,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
@@ -35,6 +34,10 @@
DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
static struct kernfs_root *rdt_root;
struct rdtgroup rdtgroup_default;
LIST_HEAD(rdt_all_groups);
@@ -42,6 +45,9 @@ LIST_HEAD(rdt_all_groups);
/* list of entries for the schemata file */
LIST_HEAD(resctrl_schema_all);
+/* The filesystem can only be mounted once. */
+bool resctrl_mounted;
+
/* Kernel fs node for "info" directory under root */
static struct kernfs_node *kn_info;
@@ -59,6 +65,15 @@ static void rdtgroup_destroy_root(void);
struct dentry *debugfs_resctrl;
+/*
+ * Memory bandwidth monitoring event to use for the default CTRL_MON group
+ * and each new CTRL_MON group created by the user. Only relevant when
+ * the filesystem is mounted with the "mba_MBps" option so it does not
+ * matter that it remains uninitialized on systems that do not support
+ * the "mba_MBps" option.
+ */
+enum resctrl_event_id mba_mbps_default_event;
+
static bool resctrl_debug;
void rdt_last_cmd_clear(void)
@@ -85,13 +100,13 @@ void rdt_last_cmd_printf(const char *fmt, ...)
void rdt_staged_configs_clear(void)
{
+ struct rdt_ctrl_domain *dom;
struct rdt_resource *r;
- struct rdt_domain *dom;
lockdep_assert_held(&rdtgroup_mutex);
for_each_alloc_capable_rdt_resource(r) {
- list_for_each_entry(dom, &r->domains, list)
+ list_for_each_entry(dom, &r->ctrl_domains, hdr.list)
memset(dom->staged_config, 0, sizeof(dom->staged_config));
}
}
@@ -102,7 +117,7 @@ void rdt_staged_configs_clear(void)
*
* Using a global CLOSID across all resources has some advantages and
* some drawbacks:
- * + We can simply set "current->closid" to assign a task to a resource
+ * + We can simply set current's closid to assign a task to a resource
* group.
* + Context switch code can avoid extra memory references deciding which
* CLOSID to load into the PQR_ASSOC MSR
@@ -111,7 +126,7 @@ void rdt_staged_configs_clear(void)
* - Our choices on how to configure each resource become progressively more
* limited as the number of resources grows.
*/
-static int closid_free_map;
+static unsigned long closid_free_map;
static int closid_free_map_len;
int closids_supported(void)
@@ -130,26 +145,39 @@ static void closid_init(void)
closid_free_map = BIT_MASK(rdt_min_closid) - 1;
- /* CLOSID 0 is always reserved for the default group */
- closid_free_map &= ~1;
+ /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
+ __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map);
closid_free_map_len = rdt_min_closid;
}
static int closid_alloc(void)
{
- u32 closid = ffs(closid_free_map);
+ int cleanest_closid;
+ u32 closid;
- if (closid == 0)
- return -ENOSPC;
- closid--;
- closid_free_map &= ~(1 << closid);
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ cleanest_closid = resctrl_find_cleanest_closid();
+ if (cleanest_closid < 0)
+ return cleanest_closid;
+ closid = cleanest_closid;
+ } else {
+ closid = ffs(closid_free_map);
+ if (closid == 0)
+ return -ENOSPC;
+ closid--;
+ }
+ __clear_bit(closid, &closid_free_map);
return closid;
}
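When RMIDs do not depend on CLOSIDs, allocation stays the classic "lowest free bit" scheme that the bitmap helpers express, and freeing sets the bit back. A minimal stand-alone version using compiler builtins rather than the kernel bitmap API:

#include <stdio.h>

static unsigned long closid_free_map;

static void closid_init(unsigned int num_closid)
{
	closid_free_map = (1UL << num_closid) - 1;
	closid_free_map &= ~1UL;	/* CLOSID 0 stays with the default group */
}

static int closid_alloc(void)
{
	int closid = __builtin_ffsl(closid_free_map);

	if (closid == 0)
		return -1;		/* no free CLOSID */
	closid--;
	closid_free_map &= ~(1UL << closid);
	return closid;
}

static void closid_free(int closid)
{
	closid_free_map |= 1UL << closid;
}

int main(void)
{
	int a, b, c;

	closid_init(4);
	a = closid_alloc();
	b = closid_alloc();
	c = closid_alloc();
	printf("%d %d %d\n", a, b, c);		/* 1 2 3 */
	printf("%d\n", closid_alloc());		/* -1: map exhausted */
	closid_free(2);
	printf("%d\n", closid_alloc());		/* 2 */
	return 0;
}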
void closid_free(int closid)
{
- closid_free_map |= 1 << closid;
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ __set_bit(closid, &closid_free_map);
}
/**
@@ -159,9 +187,11 @@ void closid_free(int closid)
* Return: true if @closid is currently associated with a resource group,
* false if @closid is free
*/
-static bool closid_allocated(unsigned int closid)
+bool closid_allocated(unsigned int closid)
{
- return (closid_free_map & (1 << closid)) == 0;
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ return !test_bit(closid, &closid_free_map);
}
/**
@@ -295,7 +325,7 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
rdt_last_cmd_puts("Cache domain offline\n");
ret = -ENODEV;
} else {
- mask = &rdtgrp->plr->d->cpu_mask;
+ mask = &rdtgrp->plr->d->hdr.cpu_mask;
seq_printf(s, is_cpu_list(of) ?
"%*pbl\n" : "%*pb\n",
cpumask_pr_args(mask));
@@ -559,14 +589,26 @@ static void update_task_closid_rmid(struct task_struct *t)
_update_task_closid_rmid(t);
}
+static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
+{
+ u32 closid, rmid = rdtgrp->mon.rmid;
+
+ if (rdtgrp->type == RDTCTRL_GROUP)
+ closid = rdtgrp->closid;
+ else if (rdtgrp->type == RDTMON_GROUP)
+ closid = rdtgrp->mon.parent->closid;
+ else
+ return false;
+
+ return resctrl_arch_match_closid(tsk, closid) &&
+ resctrl_arch_match_rmid(tsk, closid, rmid);
+}
+
static int __rdtgroup_move_task(struct task_struct *tsk,
struct rdtgroup *rdtgrp)
{
/* If the task is already in rdtgrp, no need to move the task. */
- if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid &&
- tsk->rmid == rdtgrp->mon.rmid) ||
- (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid &&
- tsk->closid == rdtgrp->mon.parent->closid))
+ if (task_in_rdtgroup(tsk, rdtgrp))
return 0;
/*
@@ -577,19 +619,19 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
* For monitor groups, can move the tasks only from
* their parent CTRL group.
*/
-
- if (rdtgrp->type == RDTCTRL_GROUP) {
- WRITE_ONCE(tsk->closid, rdtgrp->closid);
- WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
- } else if (rdtgrp->type == RDTMON_GROUP) {
- if (rdtgrp->mon.parent->closid == tsk->closid) {
- WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
- } else {
- rdt_last_cmd_puts("Can't move task to different control group\n");
- return -EINVAL;
- }
+ if (rdtgrp->type == RDTMON_GROUP &&
+ !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
+ rdt_last_cmd_puts("Can't move task to different control group\n");
+ return -EINVAL;
}
+ if (rdtgrp->type == RDTMON_GROUP)
+ resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
+ rdtgrp->mon.rmid);
+ else
+ resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
+ rdtgrp->mon.rmid);
+
/*
* Ensure the task's closid and rmid are written before determining if
* the task is current that will decide if it will be interrupted.
@@ -611,14 +653,15 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
{
- return (rdt_alloc_capable &&
- (r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
+ return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
+ resctrl_arch_match_closid(t, r->closid));
}
static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
{
- return (rdt_mon_capable &&
- (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
+ return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
+ resctrl_arch_match_rmid(t, r->mon.parent->closid,
+ r->mon.rmid));
}
/**
@@ -853,7 +896,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
mutex_lock(&rdtgroup_mutex);
/* Return empty if resctrl has not been mounted. */
- if (!static_branch_unlikely(&rdt_enable_key)) {
+ if (!resctrl_mounted) {
seq_puts(s, "res:\nmon:\n");
goto unlock;
}
@@ -869,7 +912,7 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
rdtg->mode != RDT_MODE_EXCLUSIVE)
continue;
- if (rdtg->closid != tsk->closid)
+ if (!resctrl_arch_match_closid(tsk, rdtg->closid))
continue;
seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
@@ -877,7 +920,8 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
seq_puts(s, "mon:");
list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
mon.crdtgrp_list) {
- if (tsk->rmid != crg->mon.rmid)
+ if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
+ crg->mon.rmid))
continue;
seq_printf(s, "%s", crg->kn->name);
break;
@@ -976,20 +1020,21 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
unsigned long sw_shareable = 0, hw_shareable = 0;
unsigned long exclusive = 0, pseudo_locked = 0;
struct rdt_resource *r = s->res;
- struct rdt_domain *dom;
+ struct rdt_ctrl_domain *dom;
int i, hwb, swb, excl, psl;
enum rdtgrp_mode mode;
bool sep = false;
u32 ctrl_val;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
hw_shareable = r->cache.shareable_bits;
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
if (sep)
seq_putc(seq, ';');
sw_shareable = 0;
exclusive = 0;
- seq_printf(seq, "%d=", dom->id);
+ seq_printf(seq, "%d=", dom->hdr.id);
for (i = 0; i < closids_supported(); i++) {
if (!closid_allocated(i))
continue;
@@ -1042,6 +1087,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
}
seq_putc(seq, '\n');
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return 0;
}
@@ -1205,7 +1251,7 @@ static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
*
* Return: false if CBM does not overlap, true if it does.
*/
-static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
+static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid,
enum resctrl_conf_type type, bool exclusive)
{
@@ -1260,7 +1306,7 @@ static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d
*
* Return: true if CBM overlap detected, false if there is no overlap
*/
-bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
+bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid, bool exclusive)
{
enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
@@ -1291,18 +1337,21 @@ bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_domain *d,
static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
{
int closid = rdtgrp->closid;
+ struct rdt_ctrl_domain *d;
struct resctrl_schema *s;
struct rdt_resource *r;
bool has_cache = false;
- struct rdt_domain *d;
u32 ctrl;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
continue;
has_cache = true;
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
ctrl = resctrl_arch_get_config(r, d, closid,
s->conf_type);
if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
@@ -1407,20 +1456,19 @@ out:
* bitmap functions work correctly.
*/
unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
- struct rdt_domain *d, unsigned long cbm)
+ struct rdt_ctrl_domain *d, unsigned long cbm)
{
- struct cpu_cacheinfo *ci;
unsigned int size = 0;
- int num_b, i;
+ struct cacheinfo *ci;
+ int num_b;
+
+ if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
+ return size;
num_b = bitmap_weight(&cbm, r->cache.cbm_len);
- ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
- for (i = 0; i < ci->num_leaves; i++) {
- if (ci->info_list[i].level == r->cache_level) {
- size = ci->info_list[i].size / r->cache.cbm_len * num_b;
- break;
- }
- }
+ ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
+ if (ci)
+ size = ci->size / r->cache.cbm_len * num_b;
return size;
}
@@ -1436,9 +1484,9 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
{
struct resctrl_schema *schema;
enum resctrl_conf_type type;
+ struct rdt_ctrl_domain *d;
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
- struct rdt_domain *d;
unsigned int size;
int ret = 0;
u32 closid;
@@ -1462,7 +1510,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
rdtgrp->plr->d,
rdtgrp->plr->cbm);
- seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
+ seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
}
goto out;
}
@@ -1474,7 +1522,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
type = schema->conf_type;
sep = false;
seq_printf(s, "%*s:", max_name_width, schema->name);
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
if (sep)
seq_putc(s, ';');
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
@@ -1492,7 +1540,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
else
size = rdtgroup_cbm_to_size(r, d, ctrl);
}
- seq_printf(s, "%d=%u", d->id, size);
+ seq_printf(s, "%d=%u", d->hdr.id, size);
sep = true;
}
seq_putc(s, '\n');
@@ -1550,20 +1598,21 @@ static void mon_event_config_read(void *info)
mon_info->mon_config = msrval & MAX_EVT_CONFIG_BITS;
}
-static void mondata_config_read(struct rdt_domain *d, struct mon_config_info *mon_info)
+static void mondata_config_read(struct rdt_mon_domain *d, struct mon_config_info *mon_info)
{
- smp_call_function_any(&d->cpu_mask, mon_event_config_read, mon_info, 1);
+ smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_read, mon_info, 1);
}
static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
{
- struct mon_config_info mon_info = {0};
- struct rdt_domain *dom;
+ struct mon_config_info mon_info;
+ struct rdt_mon_domain *dom;
bool sep = false;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
- list_for_each_entry(dom, &r->domains, list) {
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
if (sep)
seq_puts(s, ";");
@@ -1571,12 +1620,13 @@ static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid
mon_info.evtid = evtid;
mondata_config_read(dom, &mon_info);
- seq_printf(s, "%d=0x%02x", dom->id, mon_info.mon_config);
+ seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
sep = true;
}
seq_puts(s, "\n");
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return 0;
}
@@ -1614,17 +1664,10 @@ static void mon_event_config_write(void *info)
wrmsr(MSR_IA32_EVT_CFG_BASE + index, mon_info->mon_config, 0);
}
-static int mbm_config_write_domain(struct rdt_resource *r,
- struct rdt_domain *d, u32 evtid, u32 val)
+static void mbm_config_write_domain(struct rdt_resource *r,
+ struct rdt_mon_domain *d, u32 evtid, u32 val)
{
struct mon_config_info mon_info = {0};
- int ret = 0;
-
- /* mon_config cannot be more than the supported set of events */
- if (val > MAX_EVT_CONFIG_BITS) {
- rdt_last_cmd_puts("Invalid event configuration\n");
- return -EINVAL;
- }
/*
* Read the current config value first. If both are the same then
@@ -1633,7 +1676,7 @@ static int mbm_config_write_domain(struct rdt_resource *r,
mon_info.evtid = evtid;
mondata_config_read(d, &mon_info);
if (mon_info.mon_config == val)
- goto out;
+ return;
mon_info.mon_config = val;
@@ -1643,7 +1686,7 @@ static int mbm_config_write_domain(struct rdt_resource *r,
* are scoped at the domain level. Writing any of these MSRs
* on one CPU is observed by all the CPUs in the domain.
*/
- smp_call_function_any(&d->cpu_mask, mon_event_config_write,
+ smp_call_function_any(&d->hdr.cpu_mask, mon_event_config_write,
&mon_info, 1);
/*
@@ -1656,17 +1699,17 @@ static int mbm_config_write_domain(struct rdt_resource *r,
* mbm_local and mbm_total counts for all the RMIDs.
*/
resctrl_arch_reset_rmid_all(r, d);
-
-out:
- return ret;
}
static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
char *dom_str = NULL, *id_str;
unsigned long dom_id, val;
- struct rdt_domain *d;
- int ret = 0;
+ struct rdt_mon_domain *d;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
next:
if (!tok || tok[0] == '\0')
@@ -1686,11 +1729,16 @@ next:
return -EINVAL;
}
- list_for_each_entry(d, &r->domains, list) {
- if (d->id == dom_id) {
- ret = mbm_config_write_domain(r, d, evtid, val);
- if (ret)
- return -EINVAL;
+ /* Value from user cannot be more than the supported set of events */
+ if ((val & hw_res->mbm_cfg_mask) != val) {
+ rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
+ hw_res->mbm_cfg_mask);
+ return -EINVAL;
+ }
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ if (d->hdr.id == dom_id) {
+ mbm_config_write_domain(r, d, evtid, val);
goto next;
}
}
@@ -1709,6 +1757,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
if (nbytes == 0 || buf[nbytes - 1] != '\n')
return -EINVAL;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
rdt_last_cmd_clear();
@@ -1718,6 +1767,7 @@ static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return ret ?: nbytes;
}
@@ -1733,6 +1783,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
if (nbytes == 0 || buf[nbytes - 1] != '\n')
return -EINVAL;
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
rdt_last_cmd_clear();
@@ -1742,6 +1793,7 @@ static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
return ret ?: nbytes;
}
@@ -1899,6 +1951,13 @@ static struct rftype res_common_files[] = {
.fflags = RFTYPE_CTRL_BASE,
},
{
+ .name = "mba_MBps_event",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_mba_mbps_event_write,
+ .seq_show = rdtgroup_mba_mbps_event_show,
+ },
+ {
.name = "mode",
.mode = 0644,
.kf_ops = &rdtgroup_kf_single_ops,
@@ -1977,24 +2036,13 @@ static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
return NULL;
}
-void __init thread_throttle_mode_init(void)
-{
- struct rftype *rft;
-
- rft = rdtgroup_get_rftype_by_name("thread_throttle_mode");
- if (!rft)
- return;
-
- rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB;
-}
-
-void __init mbm_config_rftype_init(const char *config)
+void resctrl_file_fflags_init(const char *config, unsigned long fflags)
{
struct rftype *rft;
rft = rdtgroup_get_rftype_by_name(config);
if (rft)
- rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE;
+ rft->fflags = fflags;
}
/**
@@ -2213,11 +2261,14 @@ static inline bool is_mba_linear(void)
static int set_cache_qos_cfg(int level, bool enable)
{
void (*update)(void *arg);
+ struct rdt_ctrl_domain *d;
struct rdt_resource *r_l;
cpumask_var_t cpu_mask;
- struct rdt_domain *d;
int cpu;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
if (level == RDT_RESOURCE_L3)
update = l3_qos_cfg_update;
else if (level == RDT_RESOURCE_L2)
@@ -2229,14 +2280,14 @@ static int set_cache_qos_cfg(int level, bool enable)
return -ENOMEM;
r_l = &rdt_resources_all[level].r_resctrl;
- list_for_each_entry(d, &r_l->domains, list) {
+ list_for_each_entry(d, &r_l->ctrl_domains, hdr.list) {
if (r_l->cache.arch_has_per_cpu_cfg)
/* Pick all the CPUs in the domain instance */
- for_each_cpu(cpu, &d->cpu_mask)
+ for_each_cpu(cpu, &d->hdr.cpu_mask)
cpumask_set_cpu(cpu, cpu_mask);
else
/* Pick one CPU from each domain instance to update MSR */
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ cpumask_set_cpu(cpumask_any(&d->hdr.cpu_mask), cpu_mask);
}
/* Update QOS_CFG MSR on all the CPUs in cpu_mask */
@@ -2262,10 +2313,10 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
l3_qos_cfg_update(&hw_res->cdp_enabled);
}
-static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d)
+static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
u32 num_closid = resctrl_arch_get_num_closid(r);
- int cpu = cpumask_any(&d->cpu_mask);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
int i;
d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
@@ -2280,7 +2331,7 @@ static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d)
}
static void mba_sc_domain_destroy(struct rdt_resource *r,
- struct rdt_domain *d)
+ struct rdt_ctrl_domain *d)
{
kfree(d->mbps_val);
d->mbps_val = NULL;
@@ -2288,14 +2339,18 @@ static void mba_sc_domain_destroy(struct rdt_resource *r,
/*
* MBA software controller is supported only if
- * MBM is supported and MBA is in linear scale.
+ * MBM is supported and MBA is in linear scale,
+ * and the MBM monitor scope is the same as MBA
+ * control scope.
*/
static bool supports_mba_mbps(void)
{
+ struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
- return (is_mbm_local_enabled() &&
- r->alloc_capable && is_mba_linear());
+ return (is_mbm_enabled() &&
+ r->alloc_capable && is_mba_linear() &&
+ r->ctrl_scope == rmbm->mon_scope);
}
/*
@@ -2306,7 +2361,8 @@ static int set_mba_sc(bool mba_sc)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
u32 num_closid = resctrl_arch_get_num_closid(r);
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
+ unsigned long fflags;
int i;
if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
@@ -2314,11 +2370,16 @@ static int set_mba_sc(bool mba_sc)
r->membw.mba_sc = mba_sc;
- list_for_each_entry(d, &r->domains, list) {
+ rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
for (i = 0; i < num_closid; i++)
d->mbps_val[i] = MBA_MAX_MBPS;
}
+ fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
+ resctrl_file_fflags_init("mba_MBps_event", fflags);
+
return 0;
}
@@ -2417,6 +2478,7 @@ struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
rdtgroup_kn_get(rdtgrp, kn);
+ cpus_read_lock();
mutex_lock(&rdtgroup_mutex);
/* Was this group deleted while we waited? */
@@ -2434,6 +2496,8 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
return;
mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
rdtgroup_kn_put(rdtgrp, kn);
}
@@ -2575,7 +2639,7 @@ static int rdt_get_tree(struct fs_context *fc)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
unsigned long flags = RFTYPE_CTRL_BASE;
- struct rdt_domain *dom;
+ struct rdt_mon_domain *dom;
struct rdt_resource *r;
int ret;
@@ -2584,7 +2648,7 @@ static int rdt_get_tree(struct fs_context *fc)
/*
* resctrl file system can only be mounted once.
*/
- if (static_branch_unlikely(&rdt_enable_key)) {
+ if (resctrl_mounted) {
ret = -EBUSY;
goto out;
}
@@ -2605,7 +2669,7 @@ static int rdt_get_tree(struct fs_context *fc)
closid_init();
- if (rdt_mon_capable)
+ if (resctrl_arch_mon_capable())
flags |= RFTYPE_MON;
ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
@@ -2618,7 +2682,7 @@ static int rdt_get_tree(struct fs_context *fc)
if (ret < 0)
goto out_schemata_free;
- if (rdt_mon_capable) {
+ if (resctrl_arch_mon_capable()) {
ret = mongroup_create_dir(rdtgroup_default.kn,
&rdtgroup_default, "mon_groups",
&kn_mongrp);
@@ -2640,18 +2704,19 @@ static int rdt_get_tree(struct fs_context *fc)
if (ret < 0)
goto out_psl;
- if (rdt_alloc_capable)
- static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
- if (rdt_mon_capable)
- static_branch_enable_cpuslocked(&rdt_mon_enable_key);
+ if (resctrl_arch_alloc_capable())
+ resctrl_arch_enable_alloc();
+ if (resctrl_arch_mon_capable())
+ resctrl_arch_enable_mon();
- if (rdt_alloc_capable || rdt_mon_capable)
- static_branch_enable_cpuslocked(&rdt_enable_key);
+ if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
+ resctrl_mounted = true;
if (is_mbm_enabled()) {
r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
- list_for_each_entry(dom, &r->domains, list)
- mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
+ list_for_each_entry(dom, &r->mon_domains, hdr.list)
+ mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
+ RESCTRL_PICK_ANY_CPU);
}
goto out;
@@ -2659,10 +2724,10 @@ static int rdt_get_tree(struct fs_context *fc)
out_psl:
rdt_pseudo_lock_release();
out_mondata:
- if (rdt_mon_capable)
+ if (resctrl_arch_mon_capable())
kernfs_remove(kn_mondata);
out_mongrp:
- if (rdt_mon_capable)
+ if (resctrl_arch_mon_capable())
kernfs_remove(kn_mongrp);
out_info:
kernfs_remove(kn_info);
@@ -2699,6 +2764,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
struct fs_parse_result result;
+ const char *msg;
int opt;
opt = fs_parse(fc, rdt_fs_parameters, param, &result);
@@ -2713,8 +2779,9 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->enable_cdpl2 = true;
return 0;
case Opt_mba_mbps:
+ msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
if (!supports_mba_mbps())
- return -EINVAL;
+ return invalfc(fc, msg);
ctx->enable_mba_mbps = true;
return 0;
case Opt_debug:
@@ -2759,14 +2826,13 @@ static int rdt_init_fs_context(struct fs_context *fc)
static int reset_all_ctrls(struct rdt_resource *r)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- struct rdt_hw_domain *hw_dom;
+ struct rdt_hw_ctrl_domain *hw_dom;
struct msr_param msr_param;
- cpumask_var_t cpu_mask;
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
int i;
- if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
- return -ENOMEM;
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
msr_param.res = r;
msr_param.low = 0;
@@ -2774,22 +2840,18 @@ static int reset_all_ctrls(struct rdt_resource *r)
/*
* Disable resource control for this resource by setting all
- * CBMs in all domains to the maximum mask value. Pick one CPU
+ * CBMs in all ctrl_domains to the maximum mask value. Pick one CPU
* from each domain to update the MSRs below.
*/
- list_for_each_entry(d, &r->domains, list) {
- hw_dom = resctrl_to_arch_dom(d);
- cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ hw_dom = resctrl_to_arch_ctrl_dom(d);
for (i = 0; i < hw_res->num_closid; i++)
hw_dom->ctrl_val[i] = r->default_ctrl;
+ msr_param.dom = d;
+ smp_call_function_any(&d->hdr.cpu_mask, rdt_ctrl_update, &msr_param, 1);
}
- /* Update CBM on all the CPUs in cpu_mask */
- on_each_cpu_mask(cpu_mask, rdt_ctrl_update, &msr_param, 1);
-
- free_cpumask_var(cpu_mask);
-
return 0;
}
@@ -2810,8 +2872,8 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
for_each_process_thread(p, t) {
if (!from || is_closid_match(t, from) ||
is_rmid_match(t, from)) {
- WRITE_ONCE(t->closid, to->closid);
- WRITE_ONCE(t->rmid, to->mon.rmid);
+ resctrl_arch_set_closid_rmid(t, to->closid,
+ to->mon.rmid);
/*
* Order the closid/rmid stores above before the loads
@@ -2842,7 +2904,7 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
head = &rdtgrp->mon.crdtgrp_list;
list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
- free_rmid(sentry->mon.rmid);
+ free_rmid(sentry->closid, sentry->mon.rmid);
list_del(&sentry->mon.crdtgrp_list);
if (atomic_read(&sentry->waitcount) != 0)
@@ -2882,7 +2944,7 @@ static void rmdir_all_sub(void)
cpumask_or(&rdtgroup_default.cpu_mask,
&rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
- free_rmid(rdtgrp->mon.rmid);
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
kernfs_remove(rdtgrp->kn);
list_del(&rdtgrp->rdtgroup_list);
@@ -2917,9 +2979,11 @@ static void rdt_kill_sb(struct super_block *sb)
rdtgroup_default.mode = RDT_MODE_SHAREABLE;
schemata_list_destroy();
rdtgroup_destroy_root();
- static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
- static_branch_disable_cpuslocked(&rdt_mon_enable_key);
- static_branch_disable_cpuslocked(&rdt_enable_key);
+ if (resctrl_arch_alloc_capable())
+ resctrl_arch_disable_alloc();
+ if (resctrl_arch_mon_capable())
+ resctrl_arch_disable_mon();
+ resctrl_mounted = false;
kernfs_kill_sb(sb);
mutex_unlock(&rdtgroup_mutex);
cpus_read_unlock();
@@ -2953,62 +3017,126 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
return ret;
}
+static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
+{
+ struct kernfs_node *kn;
+
+ kn = kernfs_find_and_get(pkn, name);
+ if (!kn)
+ return;
+ kernfs_put(kn);
+
+ if (kn->dir.subdirs <= 1)
+ kernfs_remove(kn);
+ else
+ kernfs_remove_by_name(kn, subname);
+}
+
/*
* Remove all subdirectories of mon_data of ctrl_mon groups
- * and monitor groups with given domain id.
+ * and monitor groups for the given domain.
+ * Remove files and directories containing "sum" of domain data
+ * when the last domain being summed is removed.
*/
static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- unsigned int dom_id)
+ struct rdt_mon_domain *d)
{
struct rdtgroup *prgrp, *crgrp;
+ char subname[32];
+ bool snc_mode;
char name[32];
+ snc_mode = r->mon_scope == RESCTRL_L3_NODE;
+ sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
+ if (snc_mode)
+ sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
+
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
- sprintf(name, "mon_%s_%02d", r->name, dom_id);
- kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
+ mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
- kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
+ mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
}
}
-static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
- struct rdt_domain *d,
- struct rdt_resource *r, struct rdtgroup *prgrp)
+static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
+ struct rdt_resource *r, struct rdtgroup *prgrp,
+ bool do_sum)
{
+ struct rmid_read rr = {0};
union mon_data_bits priv;
- struct kernfs_node *kn;
struct mon_evt *mevt;
- struct rmid_read rr;
- char name[32];
int ret;
- sprintf(name, "mon_%s_%02d", r->name, d->id);
- /* create the directory */
- kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
- if (IS_ERR(kn))
- return PTR_ERR(kn);
-
- ret = rdtgroup_kn_set_ugid(kn);
- if (ret)
- goto out_destroy;
-
- if (WARN_ON(list_empty(&r->evt_list))) {
- ret = -EPERM;
- goto out_destroy;
- }
+ if (WARN_ON(list_empty(&r->evt_list)))
+ return -EPERM;
priv.u.rid = r->rid;
- priv.u.domid = d->id;
+ priv.u.domid = do_sum ? d->ci->id : d->hdr.id;
+ priv.u.sum = do_sum;
list_for_each_entry(mevt, &r->evt_list, list) {
priv.u.evtid = mevt->evtid;
ret = mon_addfile(kn, mevt->name, priv.priv);
if (ret)
+ return ret;
+
+ if (!do_sum && is_mbm_event(mevt->evtid))
+ mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
+ }
+
+ return 0;
+}
+
+static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
+ struct rdt_mon_domain *d,
+ struct rdt_resource *r, struct rdtgroup *prgrp)
+{
+ struct kernfs_node *kn, *ckn;
+ char name[32];
+ bool snc_mode;
+ int ret = 0;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ snc_mode = r->mon_scope == RESCTRL_L3_NODE;
+ sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
+ kn = kernfs_find_and_get(parent_kn, name);
+ if (kn) {
+ /*
+ * rdtgroup_mutex will prevent this directory from being
+ * removed. No need to keep this hold.
+ */
+ kernfs_put(kn);
+ } else {
+ kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+ ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
+ if (ret)
goto out_destroy;
+ }
- if (is_mbm_event(mevt->evtid))
- mon_event_read(&rr, r, d, prgrp, mevt->evtid, true);
+ if (snc_mode) {
+ sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id);
+ ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(ckn)) {
+ ret = -EINVAL;
+ goto out_destroy;
+ }
+
+ ret = rdtgroup_kn_set_ugid(ckn);
+ if (ret)
+ goto out_destroy;
+
+ ret = mon_add_all_files(ckn, d, r, prgrp, false);
+ if (ret)
+ goto out_destroy;
}
+
kernfs_activate(kn);
return 0;
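For illustration only, a minimal sketch of how the SNC directory names above are composed: the parent directory takes the L3 cache instance ID and each SNC node gets a "mon_sub_" child. The cache and node IDs below are hypothetical and the user-space framing is an assumption, not part of this patch.

/* Standalone sketch of the SNC naming scheme (hypothetical IDs). */
#include <stdio.h>

int main(void)
{
	char name[32], subname[32];
	int cache_id = 0;	/* stands in for d->ci->id  */
	int node_id = 1;	/* stands in for d->hdr.id  */

	sprintf(name, "mon_%s_%02d", "L3", cache_id);
	sprintf(subname, "mon_sub_%s_%02d", "L3", node_id);
	printf("%s/%s\n", name, subname);	/* prints: mon_L3_00/mon_sub_L3_01 */
	return 0;
}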
@@ -3022,7 +3150,7 @@ out_destroy:
* and "monitor" groups with given domain id.
*/
static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- struct rdt_domain *d)
+ struct rdt_mon_domain *d)
{
struct kernfs_node *parent_kn;
struct rdtgroup *prgrp, *crgrp;
@@ -3044,10 +3172,13 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
struct rdt_resource *r,
struct rdtgroup *prgrp)
{
- struct rdt_domain *dom;
+ struct rdt_mon_domain *dom;
int ret;
- list_for_each_entry(dom, &r->domains, list) {
+	/* Walking r->mon_domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
if (ret)
return ret;
@@ -3146,7 +3277,7 @@ static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
* Set the RDT domain up to start off with all usable allocations. That is,
* all shareable and unused bits. All-zero CBM is invalid.
*/
-static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
+static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
u32 closid)
{
enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
@@ -3206,7 +3337,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
*/
tmp_cbm = cfg->new_ctrl;
if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
- rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->id);
+ rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
return -ENOSPC;
}
cfg->have_new_ctrl = true;
@@ -3226,10 +3357,10 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct resctrl_schema *s,
*/
static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
{
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
int ret;
- list_for_each_entry(d, &s->res->domains, list) {
+ list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) {
ret = __init_one_rdt_domain(d, s, closid);
if (ret < 0)
return ret;
@@ -3242,9 +3373,9 @@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
{
struct resctrl_staged_config *cfg;
- struct rdt_domain *d;
+ struct rdt_ctrl_domain *d;
- list_for_each_entry(d, &r->domains, list) {
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
if (is_mba_sc(r)) {
d->mbps_val[closid] = MBA_MAX_MBPS;
continue;
@@ -3293,6 +3424,36 @@ out:
return ret;
}
+static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ if (!resctrl_arch_mon_capable())
+ return 0;
+
+ ret = alloc_rmid(rdtgrp->closid);
+ if (ret < 0) {
+ rdt_last_cmd_puts("Out of RMIDs\n");
+ return ret;
+ }
+ rdtgrp->mon.rmid = ret;
+
+ ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs subdir error\n");
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
+{
+ if (resctrl_arch_mon_capable())
+ free_rmid(rgrp->closid, rgrp->mon.rmid);
+}
+
static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
const char *name, umode_t mode,
enum rdt_group_type rtype, struct rdtgroup **r)
@@ -3353,7 +3514,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
if (rtype == RDTCTRL_GROUP) {
files = RFTYPE_BASE | RFTYPE_CTRL;
- if (rdt_mon_capable)
+ if (resctrl_arch_mon_capable())
files |= RFTYPE_MON;
} else {
files = RFTYPE_BASE | RFTYPE_MON;
@@ -3365,29 +3526,11 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
goto out_destroy;
}
- if (rdt_mon_capable) {
- ret = alloc_rmid();
- if (ret < 0) {
- rdt_last_cmd_puts("Out of RMIDs\n");
- goto out_destroy;
- }
- rdtgrp->mon.rmid = ret;
-
- ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
- if (ret) {
- rdt_last_cmd_puts("kernfs subdir error\n");
- goto out_idfree;
- }
- }
- kernfs_activate(kn);
-
/*
* The caller unlocks the parent_kn upon success.
*/
return 0;
-out_idfree:
- free_rmid(rdtgrp->mon.rmid);
out_destroy:
kernfs_put(rdtgrp->kn);
kernfs_remove(rdtgrp->kn);
@@ -3401,7 +3544,6 @@ out_unlock:
static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
{
kernfs_remove(rgrp->kn);
- free_rmid(rgrp->mon.rmid);
rdtgroup_remove(rgrp);
}
@@ -3423,12 +3565,21 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
prgrp = rdtgrp->mon.parent;
rdtgrp->closid = prgrp->closid;
+ ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
+ if (ret) {
+ mkdir_rdt_prepare_clean(rdtgrp);
+ goto out_unlock;
+ }
+
+ kernfs_activate(rdtgrp->kn);
+
/*
* Add the rdtgrp to the list of rdtgrps the parent
* ctrl_mon group has to track.
*/
list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
+out_unlock:
rdtgroup_kn_unlock(parent_kn);
return ret;
}
@@ -3459,13 +3610,20 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
ret = 0;
rdtgrp->closid = closid;
+
+ ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
+ if (ret)
+ goto out_closid_free;
+
+ kernfs_activate(rdtgrp->kn);
+
ret = rdtgroup_init_alloc(rdtgrp);
if (ret < 0)
- goto out_id_free;
+ goto out_rmid_free;
list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
- if (rdt_mon_capable) {
+ if (resctrl_arch_mon_capable()) {
/*
* Create an empty mon_groups directory to hold the subset
* of tasks and cpus to monitor.
@@ -3475,13 +3633,17 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
rdt_last_cmd_puts("kernfs subdir error\n");
goto out_del_list;
}
+ if (is_mba_sc(NULL))
+ rdtgrp->mba_mbps_event = mba_mbps_default_event;
}
goto out_unlock;
out_del_list:
list_del(&rdtgrp->rdtgroup_list);
-out_id_free:
+out_rmid_free:
+ mkdir_rdt_prepare_rmid_free(rdtgrp);
+out_closid_free:
closid_free(closid);
out_common_fail:
mkdir_rdt_prepare_clean(rdtgrp);
@@ -3518,14 +3680,14 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
* allocation is supported, add a control and monitoring
* subdirectory
*/
- if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
+ if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
/*
* If RDT monitoring is supported and the parent directory is a valid
* "mon_groups" directory, add a monitoring subdirectory.
*/
- if (rdt_mon_capable && is_mon_groups(parent_kn, name))
+ if (resctrl_arch_mon_capable() && is_mon_groups(parent_kn, name))
return rdtgroup_mkdir_mon(parent_kn, name, mode);
return -EPERM;
@@ -3550,7 +3712,7 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
update_closid_rmid(tmpmask, NULL);
rdtgrp->flags = RDT_DELETED;
- free_rmid(rdtgrp->mon.rmid);
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
/*
* Remove the rdtgrp from the parent ctrl_mon group's list
@@ -3596,8 +3758,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
update_closid_rmid(tmpmask, NULL);
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
closid_free(rdtgrp->closid);
- free_rmid(rdtgrp->mon.rmid);
rdtgroup_ctrl_remove(rdtgrp);
@@ -3829,8 +3991,8 @@ static void __init rdtgroup_setup_default(void)
{
mutex_lock(&rdtgroup_mutex);
- rdtgroup_default.closid = 0;
- rdtgroup_default.mon.rmid = 0;
+ rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
+ rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
rdtgroup_default.type = RDTCTRL_GROUP;
INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
@@ -3839,33 +4001,37 @@ static void __init rdtgroup_setup_default(void)
mutex_unlock(&rdtgroup_mutex);
}
-static void domain_destroy_mon_state(struct rdt_domain *d)
+static void domain_destroy_mon_state(struct rdt_mon_domain *d)
{
bitmap_free(d->rmid_busy_llc);
kfree(d->mbm_total);
kfree(d->mbm_local);
}
-void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
+void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
- lockdep_assert_held(&rdtgroup_mutex);
+ mutex_lock(&rdtgroup_mutex);
if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
mba_sc_domain_destroy(r, d);
- if (!r->mon_capable)
- return;
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ mutex_lock(&rdtgroup_mutex);
/*
* If resctrl is mounted, remove all the
* per domain monitor data directories.
*/
- if (static_branch_unlikely(&rdt_mon_enable_key))
- rmdir_mondata_subdir_allrdtgrp(r, d->id);
+ if (resctrl_mounted && resctrl_arch_mon_capable())
+ rmdir_mondata_subdir_allrdtgrp(r, d);
if (is_mbm_enabled())
cancel_delayed_work(&d->mbm_over);
- if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
+ if (is_llc_occupancy_enabled() && has_busy_rmid(d)) {
/*
* When a package is going down, forcefully
* decrement rmid->ebusy. There is no way to know
@@ -3879,20 +4045,23 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
}
domain_destroy_mon_state(d);
+
+ mutex_unlock(&rdtgroup_mutex);
}
-static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
{
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
size_t tsize;
if (is_llc_occupancy_enabled()) {
- d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL);
+ d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
if (!d->rmid_busy_llc)
return -ENOMEM;
}
if (is_mbm_total_enabled()) {
tsize = sizeof(*d->mbm_total);
- d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+ d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
if (!d->mbm_total) {
bitmap_free(d->rmid_busy_llc);
return -ENOMEM;
@@ -3900,7 +4069,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
}
if (is_mbm_local_enabled()) {
tsize = sizeof(*d->mbm_local);
- d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+ d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
if (!d->mbm_local) {
bitmap_free(d->rmid_busy_llc);
kfree(d->mbm_total);
@@ -3911,36 +4080,106 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
return 0;
}
-int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
+int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
{
- int err;
+ int err = 0;
- lockdep_assert_held(&rdtgroup_mutex);
+ mutex_lock(&rdtgroup_mutex);
- if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
+ if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
/* RDT_RESOURCE_MBA is never mon_capable */
- return mba_sc_domain_allocate(r, d);
+ err = mba_sc_domain_allocate(r, d);
+ }
- if (!r->mon_capable)
- return 0;
+ mutex_unlock(&rdtgroup_mutex);
+
+ return err;
+}
+
+int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ int err;
+
+ mutex_lock(&rdtgroup_mutex);
err = domain_setup_mon_state(r, d);
if (err)
- return err;
+ goto out_unlock;
if (is_mbm_enabled()) {
INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
- mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
+ mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
+ RESCTRL_PICK_ANY_CPU);
}
if (is_llc_occupancy_enabled())
INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
- /* If resctrl is mounted, add per domain monitor data directories. */
- if (static_branch_unlikely(&rdt_mon_enable_key))
+ /*
+ * If the filesystem is not mounted then only the default resource group
+ * exists. Creation of its directories is deferred until mount time
+ * by rdt_get_tree() calling mkdir_mondata_all().
+ * If resctrl is mounted, add per domain monitor data directories.
+ */
+ if (resctrl_mounted && resctrl_arch_mon_capable())
mkdir_mondata_subdir_allrdtgrp(r, d);
- return 0;
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return err;
+}
+
+void resctrl_online_cpu(unsigned int cpu)
+{
+ mutex_lock(&rdtgroup_mutex);
+ /* The CPU is set in default rdtgroup after online. */
+ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
+{
+ struct rdtgroup *cr;
+
+ list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
+ break;
+ }
+}
+
+void resctrl_offline_cpu(unsigned int cpu)
+{
+ struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ struct rdt_mon_domain *d;
+ struct rdtgroup *rdtgrp;
+
+ mutex_lock(&rdtgroup_mutex);
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
+ clear_childcpus(rdtgrp, cpu);
+ break;
+ }
+ }
+
+ if (!l3->mon_capable)
+ goto out_unlock;
+
+ d = get_mon_domain_from_cpu(cpu, l3);
+ if (d) {
+ if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
+ cancel_delayed_work(&d->mbm_over);
+ mbm_setup_overflow_handler(d, 0, cpu);
+ }
+ if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
+ has_busy_rmid(d)) {
+ cancel_delayed_work(&d->cqm_limbo);
+ cqm_setup_limbo_handler(d, 0, cpu);
+ }
+ }
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
}
/*
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h b/arch/x86/kernel/cpu/resctrl/trace.h
index 428ebbd4270b..2a506316b303 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock_event.h
+++ b/arch/x86/kernel/cpu/resctrl/trace.h
@@ -2,8 +2,8 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM resctrl
-#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_PSEUDO_LOCK_H
+#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RESCTRL_H
#include <linux/tracepoint.h>
@@ -35,9 +35,25 @@ TRACE_EVENT(pseudo_lock_l3,
TP_printk("hits=%llu miss=%llu",
__entry->l3_hits, __entry->l3_miss));
-#endif /* _TRACE_PSEUDO_LOCK_H */
+TRACE_EVENT(mon_llc_occupancy_limbo,
+ TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
+ TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
+ TP_STRUCT__entry(__field(u32, ctrl_hw_id)
+ __field(u32, mon_hw_id)
+ __field(int, domain_id)
+ __field(u64, llc_occupancy_bytes)),
+ TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
+ __entry->mon_hw_id = mon_hw_id;
+ __entry->domain_id = domain_id;
+ __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
+ TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
+ __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
+ __entry->llc_occupancy_bytes)
+ );
+
+#endif /* _TRACE_RESCTRL_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE pseudo_lock_event
+#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>
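As a usage illustration, a hedged sketch of how the new tracepoint might be emitted from the limbo scan. Only the trace_mon_llc_occupancy_limbo() helper generated by the TRACE_EVENT() definition above is implied by this patch; the surrounding variable names and the comparison against resctrl_rmid_realloc_threshold are assumptions for illustration.

/* Illustrative call site, not taken verbatim from this series: report an
 * RMID whose LLC occupancy keeps it parked on the limbo list. */
if (llc_occupancy_bytes > resctrl_rmid_realloc_threshold)
	trace_mon_llc_occupancy_limbo(closid, rmid, d->hdr.id,
				      llc_occupancy_bytes);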
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 0dad49a09b7a..16f3ca30626a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -24,31 +24,36 @@ struct cpuid_bit {
* levels are different and there is a separate entry for each.
*/
static const struct cpuid_bit cpuid_bits[] = {
- { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
- { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
- { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
- { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
- { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
- { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
- { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
- { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
- { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
- { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
- { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
- { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
- { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
- { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
- { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
- { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
- { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 },
- { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
- { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
- { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
- { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
- { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
- { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
- { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
- { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
+ { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 },
+ { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 },
+ { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 },
+ { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 },
+ { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 },
+ { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 },
+ { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
+ { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
+ { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
+ { X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
+ { X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
+ { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
+ { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
+ { X86_FEATURE_SGX_EDECCSSA, CPUID_EAX, 11, 0x00000012, 0 },
+ { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
+ { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 },
+ { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
+ { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 },
+ { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 },
+ { X86_FEATURE_AMD_WORKLOAD_CLASS, CPUID_EAX, 22, 0x80000021, 0 },
+ { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 },
+ { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 },
{ 0, 0, 0, 0, 0 }
};
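For context, a simplified sketch (not part of this patch) of how an entry in a table like cpuid_bits[] is typically consumed: query the (leaf, sub-leaf) pair, pick the register named by the entry, test the bit and set the synthetic feature flag. The helper name is hypothetical; the real walk also checks that the leaf is actually implemented before using it.

/* Sketch only: evaluate one scattered-feature table entry for @c. */
static void __init sketch_apply_scattered(struct cpuinfo_x86 *c,
					  const struct cpuid_bit *cb)
{
	u32 regs[4];

	cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX], &regs[CPUID_EBX],
		    &regs[CPUID_ECX], &regs[CPUID_EDX]);

	if (regs[cb->reg] & BIT(cb->bit))
		set_cpu_cap(c, cb->feature);
}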
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 262f5fb18d74..7f8d1e11dbee 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file,
if (flags & MAP_FIXED)
return addr;
- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+ return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
#ifdef CONFIG_COMPAT
@@ -150,13 +150,15 @@ int __init sgx_drv_init(void)
u64 xfrm_mask;
int ret;
- if (!cpu_feature_enabled(X86_FEATURE_SGX_LC))
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ pr_info("SGX disabled: SGX launch control CPU feature is not available, /dev/sgx_enclave disabled.\n");
return -ENODEV;
+ }
cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
if (!(eax & 1)) {
- pr_err("SGX disabled: SGX1 instruction support not available.\n");
+ pr_info("SGX disabled: SGX1 instruction support not available, /dev/sgx_enclave disabled.\n");
return -ENODEV;
}
@@ -173,8 +175,10 @@ int __init sgx_drv_init(void)
}
ret = misc_register(&sgx_dev_enclave);
- if (ret)
+ if (ret) {
+ pr_info("SGX disabled: Unable to register the /dev/sgx_enclave driver (%d).\n", ret);
return ret;
+ }
return 0;
}
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 5d390df21440..776a20172867 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -64,6 +64,13 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
struct file *backing;
long ret;
+ /*
+ * ECREATE would detect this too, but checking here also ensures
+ * that the 'encl_size' calculations below can never overflow.
+ */
+ if (!is_power_of_2(secs->size))
+ return -EINVAL;
+
va_page = sgx_encl_grow(encl, true);
if (IS_ERR(va_page))
return PTR_ERR(va_page);
@@ -581,7 +588,7 @@ err_out:
*
* Flush any outstanding enqueued EADD operations and perform EINIT. The
* Launch Enclave Public Key Hash MSRs are rewritten as necessary to match
- * the enclave's MRSIGNER, which is caculated from the provided sigstruct.
+ * the enclave's MRSIGNER, which is calculated from the provided sigstruct.
*
* Return:
* - 0: Success.
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 166692f2d501..8ce352fc72ac 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -13,6 +13,7 @@
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
@@ -474,24 +475,25 @@ struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
struct sgx_epc_page *page;
int nid_of_current = numa_node_id();
- int nid = nid_of_current;
+ int nid_start, nid;
- if (node_isset(nid_of_current, sgx_numa_mask)) {
- page = __sgx_alloc_epc_page_from_node(nid_of_current);
- if (page)
- return page;
- }
-
- /* Fall back to the non-local NUMA nodes: */
- while (true) {
- nid = next_node_in(nid, sgx_numa_mask);
- if (nid == nid_of_current)
- break;
+ /*
+ * Try local node first. If it doesn't have an EPC section,
+ * fall back to the non-local NUMA nodes.
+ */
+ if (node_isset(nid_of_current, sgx_numa_mask))
+ nid_start = nid_of_current;
+ else
+ nid_start = next_node_in(nid_of_current, sgx_numa_mask);
+ nid = nid_start;
+ do {
page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
- }
+
+ nid = next_node_in(nid, sgx_numa_mask);
+ } while (nid != nid_start);
return ERR_PTR(-ENOMEM);
}
@@ -628,7 +630,7 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
if (!section->virt_addr)
return false;
- section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
+ section->pages = vmalloc_array(nr_pages, sizeof(struct sgx_epc_page));
if (!section->pages) {
memunmap(section->virt_addr);
return false;
@@ -731,7 +733,7 @@ out:
return 0;
}
-/**
+/*
* A section metric is concatenated in a way that @low bits 12-31 define the
* bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
* metric.
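The comment above describes a split encoding. As a worked sketch of what that combination amounts to (assuming GENMASK_ULL from <linux/bits.h> is available; the helper name is illustrative):

/* Bits 12-31 of the metric come from @low, bits 32-51 from the low 20
 * bits of @high. */
static inline u64 sketch_calc_section_metric(u64 low, u64 high)
{
	return (low & GENMASK_ULL(31, 12)) +
	       ((high & GENMASK_ULL(19, 0)) << 32);
}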
@@ -846,6 +848,13 @@ static bool __init sgx_page_cache_init(void)
return false;
}
+ for_each_online_node(nid) {
+ if (!node_isset(nid, sgx_numa_mask) &&
+ node_state(nid, N_MEMORY) && node_state(nid, N_CPU))
+ pr_info("node%d has both CPUs and memory but doesn't have an EPC section\n",
+ nid);
+ }
+
return true;
}
@@ -892,19 +901,15 @@ static struct miscdevice sgx_dev_provision = {
int sgx_set_attribute(unsigned long *allowed_attributes,
unsigned int attribute_fd)
{
- struct fd f = fdget(attribute_fd);
+ CLASS(fd, f)(attribute_fd);
- if (!f.file)
+ if (fd_empty(f))
return -EINVAL;
- if (f.file->f_op != &sgx_provision_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &sgx_provision_fops)
return -EINVAL;
- }
*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
-
- fdput(f);
return 0;
}
EXPORT_SYMBOL_GPL(sgx_set_attribute);
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index dc136703566f..01456236a6dd 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,167 +1,571 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * Check for extended topology enumeration cpuid leaf 0xb and if it
- * exists, use it for populating initial_apicid and cpu topology
- * detection.
+ * CPU/APIC topology
+ *
+ * The APIC IDs describe the system topology in multiple domain levels.
+ * The CPUID topology parser provides the information about which part of
+ * the APIC ID is associated with the individual levels:
+ *
+ * [PACKAGE][DIEGRP][DIE][TILE][MODULE][CORE][THREAD]
+ *
+ * The root space contains the package (socket) IDs.
+ *
+ * Levels which are not enumerated consume 0 bits of space, but conceptually
+ * they are always represented. If e.g. only CORE and THREAD levels are
+ * enumerated then the DIE, MODULE and TILE levels have the same physical ID
+ * as the PACKAGE.
+ *
+ * If SMT is not supported, then the THREAD domain is still used. It then
+ * has the same physical ID as the CORE domain and is the only child of
+ * the core domain.
+ *
+ * This allows a unified view on the system independent of the enumerated
+ * domain levels without requiring any conditionals in the code.
*/
-
+#define pr_fmt(fmt) "CPU topo: " fmt
#include <linux/cpu.h>
+
+#include <xen/xen.h>
+
#include <asm/apic.h>
-#include <asm/memtype.h>
-#include <asm/processor.h>
+#include <asm/hypervisor.h>
+#include <asm/io_apic.h>
+#include <asm/mpspec.h>
+#include <asm/smp.h>
#include "cpu.h"
-/* leaf 0xb SMT level */
-#define SMT_LEVEL 0
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, CPU_ACPIID_INVALID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid);
-/* extended topology sub-leaf types */
-#define INVALID_TYPE 0
-#define SMT_TYPE 1
-#define CORE_TYPE 2
-#define DIE_TYPE 5
+/* Bitmap of physically present CPUs. */
+DECLARE_BITMAP(phys_cpu_present_map, MAX_LOCAL_APIC) __read_mostly;
-#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
-#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
-#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
+/* Used for CPU number allocation and parallel CPU bringup */
+u32 cpuid_to_apicid[] __ro_after_init = { [0 ... NR_CPUS - 1] = BAD_APICID, };
-unsigned int __max_die_per_package __read_mostly = 1;
-EXPORT_SYMBOL(__max_die_per_package);
+/* Bitmaps to mark registered APICs at each topology domain */
+static struct { DECLARE_BITMAP(map, MAX_LOCAL_APIC); } apic_maps[TOPO_MAX_DOMAIN] __ro_after_init;
-#ifdef CONFIG_SMP
/*
- * Check if given CPUID extended topology "leaf" is implemented
+ * Keep track of assigned, disabled and rejected CPUs. Present assigned
+ * with 1 as CPU #0 is reserved for the boot CPU.
*/
-static int check_extended_topology_leaf(int leaf)
-{
- unsigned int eax, ebx, ecx, edx;
+static struct {
+ unsigned int nr_assigned_cpus;
+ unsigned int nr_disabled_cpus;
+ unsigned int nr_rejected_cpus;
+ u32 boot_cpu_apic_id;
+ u32 real_bsp_apic_id;
+} topo_info __ro_after_init = {
+ .nr_assigned_cpus = 1,
+ .boot_cpu_apic_id = BAD_APICID,
+ .real_bsp_apic_id = BAD_APICID,
+};
- cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+#define domain_weight(_dom) bitmap_weight(apic_maps[_dom].map, MAX_LOCAL_APIC)
- if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
- return -1;
+bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
+{
+ return phys_id == (u64)cpuid_to_apicid[cpu];
+}
- return 0;
+#ifdef CONFIG_SMP
+static void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid)
+{
+ if (!(apicid & (__max_threads_per_core - 1)))
+ cpumask_set_cpu(cpu, &__cpu_primary_thread_mask);
}
+#else
+static inline void cpu_mark_primary_thread(unsigned int cpu, unsigned int apicid) { }
+#endif
+
/*
- * Return best CPUID Extended Topology Leaf supported
+ * Convert the APIC ID to a domain level ID by masking out the low bits
+ * below the domain level @dom.
*/
-static int detect_extended_topology_leaf(struct cpuinfo_x86 *c)
+static inline u32 topo_apicid(u32 apicid, enum x86_topology_domains dom)
{
- if (c->cpuid_level >= 0x1f) {
- if (check_extended_topology_leaf(0x1f) == 0)
- return 0x1f;
- }
+ if (dom == TOPO_SMT_DOMAIN)
+ return apicid;
+ return apicid & (UINT_MAX << x86_topo_system.dom_shifts[dom - 1]);
+}
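As a worked illustration of the masking rule in topo_apicid(), here is a tiny user-space check. The shift values are hypothetical assumptions: 1 APIC ID bit for the SMT domain and 5 bits in total below the package level.

/* Hypothetical shifts: dom_shifts[SMT] = 1, dom_shifts[CORE..DIEGRP] = 5. */
#include <assert.h>

static unsigned int mask_below(unsigned int apicid, unsigned int shift)
{
	return apicid & (~0u << shift);
}

int main(void)
{
	assert(mask_below(0x37, 1) == 0x36);	/* CORE-level ID of APIC ID 0x37 */
	assert(mask_below(0x37, 5) == 0x20);	/* PKG-level ID of APIC ID 0x37  */
	return 0;
}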
- if (c->cpuid_level >= 0xb) {
- if (check_extended_topology_leaf(0xb) == 0)
- return 0xb;
- }
+static int topo_lookup_cpuid(u32 apic_id)
+{
+ int i;
- return -1;
+ /* CPU# to APICID mapping is persistent once it is established */
+ for (i = 0; i < topo_info.nr_assigned_cpus; i++) {
+ if (cpuid_to_apicid[i] == apic_id)
+ return i;
+ }
+ return -ENODEV;
}
-#endif
-int detect_extended_topology_early(struct cpuinfo_x86 *c)
+static __init int topo_get_cpunr(u32 apic_id)
{
-#ifdef CONFIG_SMP
- unsigned int eax, ebx, ecx, edx;
- int leaf;
+ int cpu = topo_lookup_cpuid(apic_id);
- leaf = detect_extended_topology_leaf(c);
- if (leaf < 0)
- return -1;
+ if (cpu >= 0)
+ return cpu;
- set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
+ return topo_info.nr_assigned_cpus++;
+}
- cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
- /*
- * initial apic id, which also represents 32-bit extended x2apic id.
- */
- c->topo.initial_apicid = edx;
- smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
+static void topo_set_cpuids(unsigned int cpu, u32 apic_id, u32 acpi_id)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
+ early_per_cpu(x86_cpu_to_apicid, cpu) = apic_id;
+ early_per_cpu(x86_cpu_to_acpiid, cpu) = acpi_id;
#endif
- return 0;
+ set_cpu_present(cpu, true);
}
-/*
- * Check for extended topology enumeration cpuid leaf, and if it
- * exists, use it for populating initial_apicid and cpu topology
- * detection.
- */
-int detect_extended_topology(struct cpuinfo_x86 *c)
+static __init bool check_for_real_bsp(u32 apic_id)
{
-#ifdef CONFIG_SMP
- unsigned int eax, ebx, ecx, edx, sub_index;
- unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width;
- unsigned int core_select_mask, core_level_siblings;
- unsigned int die_select_mask, die_level_siblings;
- unsigned int pkg_mask_width;
- bool die_level_present = false;
- int leaf;
-
- leaf = detect_extended_topology_leaf(c);
- if (leaf < 0)
- return -1;
+ bool is_bsp = false, has_apic_base = boot_cpu_data.x86 >= 6;
+ u64 msr;
/*
- * Populate HT related information from sub-leaf level 0.
+	 * There is no really good way to detect whether this is a kdump
+	 * kernel but, except on the Voyager SMP monstrosity which is no
+	 * longer supported, the real BSP APIC ID is the first one which is
+	 * enumerated by firmware. That makes it possible to detect whether
+	 * the boot CPU is the real BSP. If it is not, do not register the APIC
+ * because sending INIT to the real BSP would reset the whole
+ * system.
+ *
+ * The first APIC ID which is enumerated by firmware is detectable
+ * because the boot CPU APIC ID is registered before that without
+ * invoking this code.
*/
- cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
- c->topo.initial_apicid = edx;
- core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
- smp_num_siblings = max_t(int, smp_num_siblings, LEVEL_MAX_SIBLINGS(ebx));
- core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
- die_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
- pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
-
- sub_index = 1;
- while (true) {
- cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx);
+ if (topo_info.real_bsp_apic_id != BAD_APICID)
+ return false;
+ /*
+ * Check whether the enumeration order is broken by evaluating the
+ * BSP bit in the APICBASE MSR. If the CPU does not have the
+ * APICBASE MSR then the BSP detection is not possible and the
+ * kernel must rely on the firmware enumeration order.
+ */
+ if (has_apic_base) {
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ is_bsp = !!(msr & MSR_IA32_APICBASE_BSP);
+ }
+
+ if (apic_id == topo_info.boot_cpu_apic_id) {
/*
- * Check for the Core type in the implemented sub leaves.
+ * If the boot CPU has the APIC BSP bit set then the
+		 * firmware enumeration agrees. If the CPU does not
+ * have the APICBASE MSR then the only choice is to trust
+ * the enumeration order.
*/
- if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
- core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
- core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
- die_level_siblings = core_level_siblings;
- die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
- }
- if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) {
- die_level_present = true;
- die_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
- die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+ if (is_bsp || !has_apic_base) {
+ topo_info.real_bsp_apic_id = apic_id;
+ return false;
}
+ /*
+ * If the boot APIC is enumerated first, but the APICBASE
+ * MSR does not have the BSP bit set, then there is no way
+ * to discover the real BSP here. Assume a crash kernel and
+ * limit the number of CPUs to 1 as an INIT to the real BSP
+ * would reset the machine.
+ */
+ pr_warn("Enumerated BSP APIC %x is not marked in APICBASE MSR\n", apic_id);
+ pr_warn("Assuming crash kernel. Limiting to one CPU to prevent machine INIT\n");
+ set_nr_cpu_ids(1);
+ goto fwbug;
+ }
- if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE)
- pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
- else
- break;
+ pr_warn("Boot CPU APIC ID not the first enumerated APIC ID: %x != %x\n",
+ topo_info.boot_cpu_apic_id, apic_id);
- sub_index++;
+ if (is_bsp) {
+ /*
+ * The boot CPU has the APIC BSP bit set. Use it and complain
+ * about the broken firmware enumeration.
+ */
+ topo_info.real_bsp_apic_id = topo_info.boot_cpu_apic_id;
+ goto fwbug;
}
- core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width;
- die_select_mask = (~(-1 << die_plus_mask_width)) >>
- core_plus_mask_width;
+ pr_warn("Crash kernel detected. Disabling real BSP to prevent machine INIT\n");
+
+ topo_info.real_bsp_apic_id = apic_id;
+ return true;
+
+fwbug:
+ pr_warn(FW_BUG "APIC enumeration order not specification compliant\n");
+ return false;
+}
+
+static unsigned int topo_unit_count(u32 lvlid, enum x86_topology_domains at_level,
+ unsigned long *map)
+{
+ unsigned int id, end, cnt = 0;
+
+ /* Calculate the exclusive end */
+ end = lvlid + (1U << x86_topo_system.dom_shifts[at_level]);
+
+ /* Unfortunately there is no bitmap_weight_range() */
+ for (id = find_next_bit(map, end, lvlid); id < end; id = find_next_bit(map, end, ++id))
+ cnt++;
+ return cnt;
+}
+
+static __init void topo_register_apic(u32 apic_id, u32 acpi_id, bool present)
+{
+ int cpu, dom;
+
+ if (present) {
+ set_bit(apic_id, phys_cpu_present_map);
+
+ /*
+ * Double registration is valid in case of the boot CPU
+ * APIC because that is registered before the enumeration
+ * of the APICs via firmware parsers or VM guest
+ * mechanisms.
+ */
+ if (apic_id == topo_info.boot_cpu_apic_id)
+ cpu = 0;
+ else
+ cpu = topo_get_cpunr(apic_id);
+
+ cpuid_to_apicid[cpu] = apic_id;
+ topo_set_cpuids(cpu, apic_id, acpi_id);
+ } else {
+ u32 pkgid = topo_apicid(apic_id, TOPO_PKG_DOMAIN);
- c->topo.core_id = apic->phys_pkg_id(c->topo.initial_apicid,
- ht_mask_width) & core_select_mask;
+ /*
+ * Check for present APICs in the same package when running
+ * on bare metal. Allow the bogosity in a guest.
+ */
+ if (hypervisor_is_type(X86_HYPER_NATIVE) &&
+ topo_unit_count(pkgid, TOPO_PKG_DOMAIN, phys_cpu_present_map)) {
+ pr_info_once("Ignoring hot-pluggable APIC ID %x in present package.\n",
+ apic_id);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
- if (die_level_present) {
- c->topo.die_id = apic->phys_pkg_id(c->topo.initial_apicid,
- core_plus_mask_width) & die_select_mask;
+ topo_info.nr_disabled_cpus++;
}
- c->topo.pkg_id = apic->phys_pkg_id(c->topo.initial_apicid, pkg_mask_width);
/*
- * Reinit the apicid, now that we have extended initial_apicid.
+ * Register present and possible CPUs in the domain
+ * maps. cpu_possible_map will be updated in
+ * topology_init_possible_cpus() after enumeration is done.
*/
- c->topo.apicid = apic->phys_pkg_id(c->topo.initial_apicid, 0);
+ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++)
+ set_bit(topo_apicid(apic_id, dom), apic_maps[dom].map);
+}
+
+/**
+ * topology_register_apic - Register an APIC in early topology maps
+ * @apic_id: The APIC ID to set up
+ * @acpi_id: The ACPI ID associated to the APIC
+ * @present: True if the corresponding CPU is present
+ */
+void __init topology_register_apic(u32 apic_id, u32 acpi_id, bool present)
+{
+ if (apic_id >= MAX_LOCAL_APIC) {
+ pr_err_once("APIC ID %x exceeds kernel limit of: %x\n", apic_id, MAX_LOCAL_APIC - 1);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ if (check_for_real_bsp(apic_id)) {
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ /* CPU numbers exhausted? */
+ if (apic_id != topo_info.boot_cpu_apic_id && topo_info.nr_assigned_cpus >= nr_cpu_ids) {
+ pr_warn_once("CPU limit of %d reached. Ignoring further CPUs\n", nr_cpu_ids);
+ topo_info.nr_rejected_cpus++;
+ return;
+ }
+
+ topo_register_apic(apic_id, acpi_id, present);
+}
+
+/**
+ * topology_register_boot_apic - Register the boot CPU APIC
+ * @apic_id: The APIC ID to set up
+ *
+ * Separate function so that CPU #0 can be assigned.
+ */
+void __init topology_register_boot_apic(u32 apic_id)
+{
+ WARN_ON_ONCE(topo_info.boot_cpu_apic_id != BAD_APICID);
+
+ topo_info.boot_cpu_apic_id = apic_id;
+ topo_register_apic(apic_id, CPU_ACPIID_INVALID, true);
+}
- c->x86_max_cores = (core_level_siblings / smp_num_siblings);
- __max_die_per_package = (die_level_siblings / core_level_siblings);
+/**
+ * topology_get_logical_id - Retrieve the logical ID at a given topology domain level
+ * @apicid: The APIC ID for which to lookup the logical ID
+ * @at_level: The topology domain level to use
+ *
+ * @apicid must be a full APIC ID, not the normalized variant. It's valid for
+ * all bits below the domain level specified by @at_level to be clear. So both
+ * real APIC IDs and backshifted normalized APIC IDs work correctly.
+ *
+ * Returns:
+ * - >= 0: The requested logical ID
+ * - -ERANGE: @apicid is out of range
+ * - -ENODEV: @apicid is not registered
+ */
+int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level)
+{
+ /* Remove the bits below @at_level to get the proper level ID of @apicid */
+ unsigned int lvlid = topo_apicid(apicid, at_level);
+
+ if (lvlid >= MAX_LOCAL_APIC)
+ return -ERANGE;
+ if (!test_bit(lvlid, apic_maps[at_level].map))
+ return -ENODEV;
+ /* Get the number of set bits before @lvlid. */
+ return bitmap_weight(apic_maps[at_level].map, lvlid);
+}
+EXPORT_SYMBOL_GPL(topology_get_logical_id);
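A hypothetical usage sketch of topology_get_logical_id(); the wrapper name and the fallback policy are assumptions for illustration, not part of this patch.

/* Map an APIC ID to its logical package number, treating unknown or
 * out-of-range IDs as package 0. Illustrative only. */
static int sketch_logical_pkg_id(u32 apicid)
{
	int id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);

	return id < 0 ? 0 : id;
}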
+
+/**
+ * topology_unit_count - Retrieve the count of specified units at a given topology domain level
+ * @apicid: The APIC ID which specifies the search range
+ * @which_units: The domain level specifying the units to count
+ * @at_level: The domain level at which @which_units have to be counted
+ *
+ * This returns the number of possible units according to the enumerated
+ * information.
+ *
+ * E.g. topology_unit_count(apicid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN)
+ * counts the number of possible cores in the package to which @apicid
+ * belongs.
+ *
+ * @at_level must obviously be greater than @which_units to produce useful
+ * results. If @at_level is equal to @which_units the result is
+ * unsurprisingly 1. If @at_level is less than @which_units the result
+ * is by definition undefined and the function returns 0.
+ */
+unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level)
+{
+ /* Remove the bits below @at_level to get the proper level ID of @apicid */
+ unsigned int lvlid = topo_apicid(apicid, at_level);
+
+ if (lvlid >= MAX_LOCAL_APIC)
+ return 0;
+ if (!test_bit(lvlid, apic_maps[at_level].map))
+ return 0;
+ if (which_units > at_level)
+ return 0;
+ if (which_units == at_level)
+ return 1;
+ return topo_unit_count(lvlid, at_level, apic_maps[which_units].map);
+}
+
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+/**
+ * topology_hotplug_apic - Handle a physical hotplugged APIC after boot
+ * @apic_id: The APIC ID to set up
+ * @acpi_id: The ACPI ID associated to the APIC
+ */
+int topology_hotplug_apic(u32 apic_id, u32 acpi_id)
+{
+ int cpu;
+
+ if (apic_id >= MAX_LOCAL_APIC)
+ return -EINVAL;
+
+ /* Reject if the APIC ID was not registered during enumeration. */
+ if (!test_bit(apic_id, apic_maps[TOPO_SMT_DOMAIN].map))
+ return -ENODEV;
+
+ cpu = topo_lookup_cpuid(apic_id);
+ if (cpu < 0)
+ return -ENOSPC;
+
+ set_bit(apic_id, phys_cpu_present_map);
+ topo_set_cpuids(cpu, apic_id, acpi_id);
+ cpu_mark_primary_thread(cpu, apic_id);
+ return cpu;
+}
+
+/**
+ * topology_hotunplug_apic - Remove a physical hotplugged APIC after boot
+ * @cpu: The CPU number for which the APIC ID is removed
+ */
+void topology_hotunplug_apic(unsigned int cpu)
+{
+ u32 apic_id = cpuid_to_apicid[cpu];
+
+ if (apic_id == BAD_APICID)
+ return;
+
+ per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
+ clear_bit(apic_id, phys_cpu_present_map);
+ set_cpu_present(cpu, false);
+}
#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned int max_possible_cpus __initdata = NR_CPUS;
+
+/**
+ * topology_apply_cmdline_limits_early - Apply topology command line limits early
+ *
+ * Ensure that command line limits are in effect before firmware parsing
+ * takes place.
+ */
+void __init topology_apply_cmdline_limits_early(void)
+{
+ unsigned int possible = nr_cpu_ids;
+
+ /* 'maxcpus=0' 'nosmp' 'nolapic' */
+ if (!setup_max_cpus || apic_is_disabled)
+ possible = 1;
+
+ /* 'possible_cpus=N' */
+ possible = min_t(unsigned int, max_possible_cpus, possible);
+
+ if (possible < nr_cpu_ids) {
+ pr_info("Limiting to %u possible CPUs\n", possible);
+ set_nr_cpu_ids(possible);
+ }
+}
+
+static __init bool restrict_to_up(void)
+{
+ if (!smp_found_config)
+ return true;
+ /*
+ * XEN PV is special as it does not advertise the local APIC
+ * properly, but provides a fake topology for it so that the
+	 * infrastructure works. So don't apply the APIC-based restrictions
+	 * here.
+ */
+ if (xen_pv_domain())
+ return false;
+
+ return apic_is_disabled;
+}
+
+void __init topology_init_possible_cpus(void)
+{
+ unsigned int assigned = topo_info.nr_assigned_cpus;
+ unsigned int disabled = topo_info.nr_disabled_cpus;
+ unsigned int cnta, cntb, cpu, allowed = 1;
+ unsigned int total = assigned + disabled;
+ u32 apicid, firstid;
+
+ /*
+ * If there was no APIC registered, then fake one so that the
+ * topology bitmap is populated. That ensures that the code below
+ * is valid and the various query interfaces can be used
+ * unconditionally. This does not affect the actual APIC code in
+ * any way because either the local APIC address has not been
+ * registered or the local APIC was disabled on the command line.
+ */
+ if (topo_info.boot_cpu_apic_id == BAD_APICID)
+ topology_register_boot_apic(0);
+
+ if (!restrict_to_up()) {
+ if (WARN_ON_ONCE(assigned > nr_cpu_ids)) {
+ disabled += assigned - nr_cpu_ids;
+ assigned = nr_cpu_ids;
+ }
+ allowed = min_t(unsigned int, total, nr_cpu_ids);
+ }
+
+ if (total > allowed)
+ pr_warn("%u possible CPUs exceed the limit of %u\n", total, allowed);
+
+ assigned = min_t(unsigned int, allowed, assigned);
+ disabled = allowed - assigned;
+
+ topo_info.nr_assigned_cpus = assigned;
+ topo_info.nr_disabled_cpus = disabled;
+
+ total_cpus = allowed;
+ set_nr_cpu_ids(allowed);
+
+ cnta = domain_weight(TOPO_PKG_DOMAIN);
+ cntb = domain_weight(TOPO_DIE_DOMAIN);
+ __max_logical_packages = cnta;
+ __max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta));
+
+ pr_info("Max. logical packages: %3u\n", cnta);
+ pr_info("Max. logical dies: %3u\n", cntb);
+ pr_info("Max. dies per package: %3u\n", __max_dies_per_package);
+
+ cnta = domain_weight(TOPO_CORE_DOMAIN);
+ cntb = domain_weight(TOPO_SMT_DOMAIN);
+ /*
+	 * Can't use the order delta here as order(cnta) can be equal to
+	 * order(cntb) even if cnta != cntb.
+ */
+ __max_threads_per_core = DIV_ROUND_UP(cntb, cnta);
+ pr_info("Max. threads per core: %3u\n", __max_threads_per_core);
+
+ firstid = find_first_bit(apic_maps[TOPO_SMT_DOMAIN].map, MAX_LOCAL_APIC);
+ __num_cores_per_package = topology_unit_count(firstid, TOPO_CORE_DOMAIN, TOPO_PKG_DOMAIN);
+ pr_info("Num. cores per package: %3u\n", __num_cores_per_package);
+ __num_threads_per_package = topology_unit_count(firstid, TOPO_SMT_DOMAIN, TOPO_PKG_DOMAIN);
+ pr_info("Num. threads per package: %3u\n", __num_threads_per_package);
+
+ pr_info("Allowing %u present CPUs plus %u hotplug CPUs\n", assigned, disabled);
+ if (topo_info.nr_rejected_cpus)
+ pr_info("Rejected CPUs %u\n", topo_info.nr_rejected_cpus);
+
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
+
+ /* Assign CPU numbers to non-present CPUs */
+ for (apicid = 0; disabled; disabled--, apicid++) {
+ apicid = find_next_andnot_bit(apic_maps[TOPO_SMT_DOMAIN].map, phys_cpu_present_map,
+ MAX_LOCAL_APIC, apicid);
+ if (apicid >= MAX_LOCAL_APIC)
+ break;
+ cpuid_to_apicid[topo_info.nr_assigned_cpus++] = apicid;
+ }
+
+ for (cpu = 0; cpu < allowed; cpu++) {
+ apicid = cpuid_to_apicid[cpu];
+
+ set_cpu_possible(cpu, true);
+
+ if (apicid == BAD_APICID)
+ continue;
+
+ cpu_mark_primary_thread(cpu, apicid);
+ set_cpu_present(cpu, test_bit(apicid, phys_cpu_present_map));
+ }
+}
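A worked example of the sizing arithmetic above, using hypothetical domain weights (the counts are assumptions, not measured values):

/*
 * Hypothetical counts: 2 packages and 8 dies give
 *   __max_dies_per_package = 1 << (get_count_order(8) - get_count_order(2))
 *                          = 1 << (3 - 1) = 4.
 * 64 cores and 128 threads give
 *   __max_threads_per_core = DIV_ROUND_UP(128, 64) = 2,
 * and the DIV_ROUND_UP form stays correct when the weights are not powers
 * of two, e.g. DIV_ROUND_UP(20, 12) = 2.
 */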
+
+/*
+ * Late SMP disable after sizing CPU masks when APIC/IOAPIC setup failed.
+ */
+void __init topology_reset_possible_cpus_up(void)
+{
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
+
+ bitmap_zero(phys_cpu_present_map, MAX_LOCAL_APIC);
+ if (topo_info.boot_cpu_apic_id != BAD_APICID)
+ set_bit(topo_info.boot_cpu_apic_id, phys_cpu_present_map);
+}
+
+static int __init setup_possible_cpus(char *str)
+{
+ get_option(&str, &max_possible_cpus);
return 0;
}
+early_param("possible_cpus", setup_possible_cpus);
+#endif
diff --git a/arch/x86/kernel/cpu/topology.h b/arch/x86/kernel/cpu/topology.h
new file mode 100644
index 000000000000..37326297f80c
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_TOPOLOGY_H
+#define ARCH_X86_TOPOLOGY_H
+
+struct topo_scan {
+ struct cpuinfo_x86 *c;
+ unsigned int dom_shifts[TOPO_MAX_DOMAIN];
+ unsigned int dom_ncpus[TOPO_MAX_DOMAIN];
+
+ /* Legacy CPUID[1]:EBX[23:16] number of logical processors */
+ unsigned int ebx1_nproc_shift;
+
+ /* AMD specific node ID which cannot be mapped into APIC space. */
+ u16 amd_nodes_per_pkg;
+ u16 amd_node_id;
+};
+
+void cpu_init_topology(struct cpuinfo_x86 *c);
+void cpu_parse_topology(struct cpuinfo_x86 *c);
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus);
+bool cpu_parse_topology_ext(struct topo_scan *tscan);
+void cpu_parse_topology_amd(struct topo_scan *tscan);
+void cpu_topology_fixup_amd(struct topo_scan *tscan);
+
+static inline u32 topo_shift_apicid(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom == TOPO_SMT_DOMAIN)
+ return apicid;
+ return apicid >> x86_topo_system.dom_shifts[dom - 1];
+}
+
+static inline u32 topo_relative_domain_id(u32 apicid, enum x86_topology_domains dom)
+{
+ if (dom != TOPO_SMT_DOMAIN)
+ apicid >>= x86_topo_system.dom_shifts[dom - 1];
+ return apicid & (x86_topo_system.dom_size[dom] - 1);
+}
+
+static inline u32 topo_domain_mask(enum x86_topology_domains dom)
+{
+ return (1U << x86_topo_system.dom_shifts[dom]) - 1;
+}
+
+/*
+ * Update a domain level after the fact without propagating. Used to fixup
+ * broken CPUID enumerations.
+ */
+static inline void topology_update_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus)
+{
+ tscan->dom_shifts[dom] = shift;
+ tscan->dom_ncpus[dom] = ncpus;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level);
+#else
+static inline unsigned int topology_unit_count(u32 apicid, enum x86_topology_domains which_units,
+ enum x86_topology_domains at_level)
+{
+ return 1;
+}
+#endif
+
+#endif /* ARCH_X86_TOPOLOGY_H */
diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c
new file mode 100644
index 000000000000..03b3c9c3a45e
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_amd.c
@@ -0,0 +1,220 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/memtype.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+static bool parse_8000_0008(struct topo_scan *tscan)
+{
+ struct {
+ // ecx
+ u32 cpu_nthreads : 8, // Number of physical threads - 1
+ : 4, // Reserved
+ apicid_coreid_len : 4, // Number of thread core ID bits (shift) in APIC ID
+ perf_tsc_len : 2, // Performance time-stamp counter size
+ : 14; // Reserved
+ } ecx;
+ unsigned int sft;
+
+ if (tscan->c->extended_cpuid_level < 0x80000008)
+ return false;
+
+ cpuid_leaf_reg(0x80000008, CPUID_ECX, &ecx);
+
+ /* If the thread bits are 0, then get the shift value from ecx.cpu_nthreads */
+ sft = ecx.apicid_coreid_len;
+ if (!sft)
+ sft = get_count_order(ecx.cpu_nthreads + 1);
+
+ /*
+ * cpu_nthreads describes the number of threads in the package
+ * sft is the number of APIC ID bits per package
+ *
+ * As the number of actual threads per core is not described in
+ * this leaf, just set the CORE domain shift and let the later
+ * parsers set SMT shift. Assume one thread per core by default
+	 * which is correct if there are no other CPUID leaves to parse.
+ */
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, sft, ecx.cpu_nthreads + 1);
+ return true;
+}
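A worked example of the fallback shift computation in parse_8000_0008(), using hypothetical CPUID values:

/*
 * Hypothetical values: apicid_coreid_len = 0 and cpu_nthreads = 11
 * (i.e. 12 threads in the package) yield sft = get_count_order(12) = 4,
 * so the CORE domain consumes 4 APIC ID bits and has room for 16 threads.
 */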
+
+static void store_node(struct topo_scan *tscan, u16 nr_nodes, u16 node_id)
+{
+ /*
+ * Starting with Fam 17h the DIE domain could probably be used to
+ * retrieve the node info on AMD/HYGON. Analysis of CPUID dumps
+ * suggests it's the topmost bit(s) of the CPU cores area, but
+	 * that's guesswork and is neither enumerated nor documented.
+ *
+ * Up to Fam 16h this does not work at all and the legacy node ID
+ * has to be used.
+ */
+ tscan->amd_nodes_per_pkg = nr_nodes;
+ tscan->amd_node_id = node_id;
+}
+
+static bool parse_8000_001e(struct topo_scan *tscan, bool has_topoext)
+{
+ struct {
+ // eax
+ u32 ext_apic_id : 32; // Extended APIC ID
+ // ebx
+ u32 core_id : 8, // Unique per-socket logical core unit ID
+ core_nthreads : 8, // #Threads per core (zero-based)
+ : 16; // Reserved
+ // ecx
+ u32 node_id : 8, // Node (die) ID of invoking logical CPU
+ nnodes_per_socket : 3, // #nodes in invoking logical CPU's package/socket
+ : 21; // Reserved
+ // edx
+ u32 : 32; // Reserved
+ } leaf;
+
+ if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
+ return false;
+
+ cpuid_leaf(0x8000001e, &leaf);
+
+ tscan->c->topo.initial_apicid = leaf.ext_apic_id;
+
+ /*
+	 * If leaf 0xb is available, then the domain shifts are already set
+	 * and there is nothing to do here. This is only valid for family >= 0x17.
+ */
+ if (!has_topoext && tscan->c->x86 >= 0x17) {
+ /*
+ * Leaf 0x80000008 set the CORE domain shift already.
+ * Update the SMT domain, but do not propagate it.
+ */
+ unsigned int nthreads = leaf.core_nthreads + 1;
+
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, get_count_order(nthreads), nthreads);
+ }
+
+ store_node(tscan, leaf.nnodes_per_socket + 1, leaf.node_id);
+
+ if (tscan->c->x86_vendor == X86_VENDOR_AMD) {
+ if (tscan->c->x86 == 0x15)
+ tscan->c->topo.cu_id = leaf.core_id;
+
+ cacheinfo_amd_init_llc_id(tscan->c, leaf.node_id);
+ } else {
+ /*
+ * Package ID is ApicId[6..] on certain Hygon CPUs. See
+ * commit e0ceeae708ce for explanation. The topology info
+ * is screwed up: the package shift is always 6 and the
+ * node ID is in bits [5:4].
+ */
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR) && tscan->c->x86_model <= 0x3) {
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, 6,
+ tscan->dom_ncpus[TOPO_CORE_DOMAIN]);
+ }
+ cacheinfo_hygon_init_llc_id(tscan->c);
+ }
+ return true;
+}
+
+static void parse_fam10h_node_id(struct topo_scan *tscan)
+{
+ union {
+ struct {
+ u64 node_id : 3,
+ nodes_per_pkg : 3,
+ unused : 58;
+ };
+ u64 msr;
+ } nid;
+
+ if (!boot_cpu_has(X86_FEATURE_NODEID_MSR))
+ return;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, nid.msr);
+ store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id);
+ tscan->c->topo.llc_id = nid.node_id;
+}
+
+static void legacy_set_llc(struct topo_scan *tscan)
+{
+ unsigned int apicid = tscan->c->topo.initial_apicid;
+
+ /* If none of the parsers set LLC ID then use the die ID for it. */
+ if (tscan->c->topo.llc_id == BAD_APICID)
+ tscan->c->topo.llc_id = apicid >> tscan->dom_shifts[TOPO_CORE_DOMAIN];
+}
+
+static void topoext_fixup(struct topo_scan *tscan)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+ u64 msrval;
+
+ /* Try to re-enable TopologyExtensions if switched off by BIOS */
+ if (cpu_has(c, X86_FEATURE_TOPOEXT) || c->x86_vendor != X86_VENDOR_AMD ||
+ c->x86 != 0x15 || c->x86_model < 0x10 || c->x86_model > 0x6f)
+ return;
+
+ if (msr_set_bit(0xc0011005, 54) <= 0)
+ return;
+
+ rdmsrl(0xc0011005, msrval);
+ if (msrval & BIT_64(54)) {
+ set_cpu_cap(c, X86_FEATURE_TOPOEXT);
+ pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
+ }
+}
+
+static void parse_topology_amd(struct topo_scan *tscan)
+{
+ bool has_topoext = false;
+
+ /*
+ * If the topology extensions (X86_FEATURE_TOPOEXT, leaf
+ * 0x8000_001e) are available, try to get the SMT, CORE, TILE,
+ * and DIE shifts from extended CPUID leaf 0x8000_0026 first.
+ * If leaf 0x8000_0026 is not supported, try to get the SMT and
+ * CORE shifts from leaf 0xb, then fall back to the CORE shift
+ * from leaf 0x8000_0008.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_TOPOEXT))
+ has_topoext = cpu_parse_topology_ext(tscan);
+
+ if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES))
+ tscan->c->topo.cpu_type = cpuid_ebx(0x80000026);
+
+ if (!has_topoext && !parse_8000_0008(tscan))
+ return;
+
+ /* Prefer leaf 0x8000001e if available */
+ if (parse_8000_001e(tscan, has_topoext))
+ return;
+
+ /* Try the NODEID MSR */
+ parse_fam10h_node_id(tscan);
+}
+
+void cpu_parse_topology_amd(struct topo_scan *tscan)
+{
+ tscan->amd_nodes_per_pkg = 1;
+ topoext_fixup(tscan);
+ parse_topology_amd(tscan);
+ legacy_set_llc(tscan);
+
+ if (tscan->amd_nodes_per_pkg > 1)
+ set_cpu_cap(tscan->c, X86_FEATURE_AMD_DCM);
+}
+
+void cpu_topology_fixup_amd(struct topo_scan *tscan)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+
+ /*
+ * Adjust the core_id relative to the node when there is more than
+ * one node.
+ */
+ if (tscan->c->x86 < 0x17 && tscan->amd_nodes_per_pkg > 1)
+ c->topo.core_id %= tscan->dom_ncpus[TOPO_CORE_DOMAIN] / tscan->amd_nodes_per_pkg;
+}
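Editorial note: the parsers above only ever record a shift (APIC ID bits per domain) and a unit count. A stand-alone worked sketch of that math, using hypothetical CPUID values (16 threads per package from leaf 0x8000_0008, 2 threads per core from leaf 0x8000_001e):

#include <stdio.h>

/* Rough equivalent of the kernel's get_count_order(): smallest n with (1 << n) >= count */
static unsigned int count_order(unsigned int count)
{
	unsigned int order = 0;

	while ((1U << order) < count)
		order++;
	return order;
}

int main(void)
{
	unsigned int pkg_threads = 16;	/* ecx.cpu_nthreads + 1, hypothetical */
	unsigned int core_threads = 2;	/* leaf.core_nthreads + 1, hypothetical */

	printf("CORE shift %u (%u CPUs), SMT shift %u (%u CPUs)\n",
	       count_order(pkg_threads), pkg_threads,
	       count_order(core_threads), core_threads);
	return 0;
}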
diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c
new file mode 100644
index 000000000000..b5a5e1411469
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <xen/xen.h>
+
+#include <asm/intel-family.h>
+#include <asm/apic.h>
+#include <asm/processor.h>
+#include <asm/smp.h>
+
+#include "cpu.h"
+
+struct x86_topology_system x86_topo_system __ro_after_init;
+EXPORT_SYMBOL_GPL(x86_topo_system);
+
+unsigned int __amd_nodes_per_pkg __ro_after_init;
+EXPORT_SYMBOL_GPL(__amd_nodes_per_pkg);
+
+void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom,
+ unsigned int shift, unsigned int ncpus)
+{
+ topology_update_dom(tscan, dom, shift, ncpus);
+
+ /* Propagate to the upper levels */
+ for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+ tscan->dom_shifts[dom] = tscan->dom_shifts[dom - 1];
+ tscan->dom_ncpus[dom] = tscan->dom_ncpus[dom - 1];
+ }
+}
+
+enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor == X86_VENDOR_INTEL) {
+ switch (c->topo.intel_type) {
+ case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY;
+ case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE;
+ }
+ }
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ switch (c->topo.amd_type) {
+ case 0: return TOPO_CPU_TYPE_PERFORMANCE;
+ case 1: return TOPO_CPU_TYPE_EFFICIENCY;
+ }
+ }
+
+ return TOPO_CPU_TYPE_UNKNOWN;
+}
+
+const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c)
+{
+ switch (get_topology_cpu_type(c)) {
+ case TOPO_CPU_TYPE_PERFORMANCE:
+ return "performance";
+ case TOPO_CPU_TYPE_EFFICIENCY:
+ return "efficiency";
+ default:
+ return "unknown";
+ }
+}
+
+static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c)
+{
+ struct {
+ u32 cache_type : 5,
+ unused : 21,
+ ncores : 6;
+ } eax;
+
+ if (c->cpuid_level < 4)
+ return 1;
+
+ cpuid_subleaf_reg(4, 0, CPUID_EAX, &eax);
+ if (!eax.cache_type)
+ return 1;
+
+ return eax.ncores + 1;
+}
+
+static void parse_legacy(struct topo_scan *tscan)
+{
+ unsigned int cores, core_shift, smt_shift = 0;
+ struct cpuinfo_x86 *c = tscan->c;
+
+ cores = parse_num_cores_legacy(c);
+ core_shift = get_count_order(cores);
+
+ if (cpu_has(c, X86_FEATURE_HT)) {
+ if (!WARN_ON_ONCE(tscan->ebx1_nproc_shift < core_shift))
+ smt_shift = tscan->ebx1_nproc_shift - core_shift;
+ /*
+ * The parser expects leaf 0xb/0x1f format, which means
+ * the number of logical processors at the core level
+ * counts threads.
+ */
+ core_shift += smt_shift;
+ cores <<= smt_shift;
+ }
+
+ topology_set_dom(tscan, TOPO_SMT_DOMAIN, smt_shift, 1U << smt_shift);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, core_shift, cores);
+}
+
+static bool fake_topology(struct topo_scan *tscan)
+{
+ /*
+ * Preset the CORE level shift for CPUID-less systems and for
+ * XEN_PV, which has useless CPUID information.
+ */
+ topology_set_dom(tscan, TOPO_SMT_DOMAIN, 0, 1);
+ topology_set_dom(tscan, TOPO_CORE_DOMAIN, 0, 1);
+
+ return tscan->c->cpuid_level < 1;
+}
+
+static void parse_topology(struct topo_scan *tscan, bool early)
+{
+ const struct cpuinfo_topology topo_defaults = {
+ .cu_id = 0xff,
+ .llc_id = BAD_APICID,
+ .l2c_id = BAD_APICID,
+ .cpu_type = TOPO_CPU_TYPE_UNKNOWN,
+ };
+ struct cpuinfo_x86 *c = tscan->c;
+ struct {
+ u32 unused0 : 16,
+ nproc : 8,
+ apicid : 8;
+ } ebx;
+
+ c->topo = topo_defaults;
+
+ if (fake_topology(tscan))
+ return;
+
+ /* Preset Initial APIC ID from CPUID leaf 1 */
+ cpuid_leaf_reg(1, CPUID_EBX, &ebx);
+ c->topo.initial_apicid = ebx.apicid;
+
+ /*
+ * The initial invocation from early_identify_cpu() happens before
+ * the APIC is mapped or X2APIC enabled. For establishing the
+ * topology, that's not required. Use the initial APIC ID.
+ */
+ if (early)
+ c->topo.apicid = c->topo.initial_apicid;
+ else
+ c->topo.apicid = read_apic_id();
+
+ /* The above is sufficient for UP */
+ if (!IS_ENABLED(CONFIG_SMP))
+ return;
+
+ tscan->ebx1_nproc_shift = get_count_order(ebx.nproc);
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (IS_ENABLED(CONFIG_CPU_SUP_AMD))
+ cpu_parse_topology_amd(tscan);
+ break;
+ case X86_VENDOR_CENTAUR:
+ case X86_VENDOR_ZHAOXIN:
+ parse_legacy(tscan);
+ break;
+ case X86_VENDOR_INTEL:
+ if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan))
+ parse_legacy(tscan);
+ if (c->cpuid_level >= 0x1a)
+ c->topo.cpu_type = cpuid_eax(0x1a);
+ break;
+ case X86_VENDOR_HYGON:
+ if (IS_ENABLED(CONFIG_CPU_SUP_HYGON))
+ cpu_parse_topology_amd(tscan);
+ break;
+ }
+}
+
+static void topo_set_ids(struct topo_scan *tscan, bool early)
+{
+ struct cpuinfo_x86 *c = tscan->c;
+ u32 apicid = c->topo.apicid;
+
+ c->topo.pkg_id = topo_shift_apicid(apicid, TOPO_PKG_DOMAIN);
+ c->topo.die_id = topo_shift_apicid(apicid, TOPO_DIE_DOMAIN);
+
+ if (!early) {
+ c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
+ c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
+ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
+ }
+
+ /* Package relative core ID */
+ c->topo.core_id = (apicid & topo_domain_mask(TOPO_PKG_DOMAIN)) >>
+ x86_topo_system.dom_shifts[TOPO_SMT_DOMAIN];
+
+ c->topo.amd_node_id = tscan->amd_node_id;
+
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ cpu_topology_fixup_amd(tscan);
+}
+
+void cpu_parse_topology(struct cpuinfo_x86 *c)
+{
+ unsigned int dom, cpu = smp_processor_id();
+ struct topo_scan tscan = { .c = c, };
+
+ parse_topology(&tscan, false);
+
+ if (IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
+ if (c->topo.initial_apicid != c->topo.apicid) {
+ pr_err(FW_BUG "CPU%4u: APIC ID mismatch. CPUID: 0x%04x APIC: 0x%04x\n",
+ cpu, c->topo.initial_apicid, c->topo.apicid);
+ }
+
+ if (c->topo.apicid != cpuid_to_apicid[cpu]) {
+ pr_err(FW_BUG "CPU%4u: APIC ID mismatch. Firmware: 0x%04x APIC: 0x%04x\n",
+ cpu, cpuid_to_apicid[cpu], c->topo.apicid);
+ }
+ }
+
+ for (dom = TOPO_SMT_DOMAIN; dom < TOPO_MAX_DOMAIN; dom++) {
+ if (tscan.dom_shifts[dom] == x86_topo_system.dom_shifts[dom])
+ continue;
+ pr_err(FW_BUG "CPU%d: Topology domain %u shift %u != %u\n", cpu, dom,
+ tscan.dom_shifts[dom], x86_topo_system.dom_shifts[dom]);
+ }
+
+ topo_set_ids(&tscan, false);
+}
+
+void __init cpu_init_topology(struct cpuinfo_x86 *c)
+{
+ struct topo_scan tscan = { .c = c, };
+ unsigned int dom, sft;
+
+ parse_topology(&tscan, true);
+
+ /* Copy the shift values and calculate the unit sizes. */
+ memcpy(x86_topo_system.dom_shifts, tscan.dom_shifts, sizeof(x86_topo_system.dom_shifts));
+
+ dom = TOPO_SMT_DOMAIN;
+ x86_topo_system.dom_size[dom] = 1U << x86_topo_system.dom_shifts[dom];
+
+ for (dom++; dom < TOPO_MAX_DOMAIN; dom++) {
+ sft = x86_topo_system.dom_shifts[dom] - x86_topo_system.dom_shifts[dom - 1];
+ x86_topo_system.dom_size[dom] = 1U << sft;
+ }
+
+ topo_set_ids(&tscan, true);
+
+ /*
+ * AMD systems have a nodes-per-package count which cannot be
+ * derived from the APIC ID.
+ */
+ __amd_nodes_per_pkg = tscan.amd_nodes_per_pkg;
+}
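Editorial note: cpu_init_topology() stores cumulative shifts but derives per-domain unit sizes from the difference between adjacent shifts. A stand-alone sketch with hypothetical shifts (SMT=1, CORE=4, PKG=6), i.e. 2 threads per core, 8 cores at the next level, and 4 of those units per package:

#include <stdio.h>

enum { SMT, CORE, PKG, MAX_DOM };

int main(void)
{
	unsigned int shifts[MAX_DOM] = { 1, 4, 6 };	/* cumulative APIC ID bits */
	unsigned int size[MAX_DOM], dom;

	size[SMT] = 1U << shifts[SMT];
	for (dom = SMT + 1; dom < MAX_DOM; dom++)
		size[dom] = 1U << (shifts[dom] - shifts[dom - 1]);

	for (dom = 0; dom < MAX_DOM; dom++)
		printf("domain %u: %u units per parent domain\n", dom, size[dom]);
	return 0;
}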
diff --git a/arch/x86/kernel/cpu/topology_ext.c b/arch/x86/kernel/cpu/topology_ext.c
new file mode 100644
index 000000000000..467b0326bf1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/topology_ext.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cpu.h>
+
+#include <asm/apic.h>
+#include <asm/memtype.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+enum topo_types {
+ INVALID_TYPE = 0,
+ SMT_TYPE = 1,
+ CORE_TYPE = 2,
+ MAX_TYPE_0B = 3,
+ MODULE_TYPE = 3,
+ AMD_CCD_TYPE = 3,
+ TILE_TYPE = 4,
+ AMD_SOCKET_TYPE = 4,
+ MAX_TYPE_80000026 = 5,
+ DIE_TYPE = 5,
+ DIEGRP_TYPE = 6,
+ MAX_TYPE_1F = 7,
+};
+
+/*
+ * Use a lookup table to handle possible future types > 6 which describe
+ * an intermediate domain level that does not exist today.
+ */
+static const unsigned int topo_domain_map_0b_1f[MAX_TYPE_1F] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [MODULE_TYPE] = TOPO_MODULE_DOMAIN,
+ [TILE_TYPE] = TOPO_TILE_DOMAIN,
+ [DIE_TYPE] = TOPO_DIE_DOMAIN,
+ [DIEGRP_TYPE] = TOPO_DIEGRP_DOMAIN,
+};
+
+static const unsigned int topo_domain_map_80000026[MAX_TYPE_80000026] = {
+ [SMT_TYPE] = TOPO_SMT_DOMAIN,
+ [CORE_TYPE] = TOPO_CORE_DOMAIN,
+ [AMD_CCD_TYPE] = TOPO_TILE_DOMAIN,
+ [AMD_SOCKET_TYPE] = TOPO_DIE_DOMAIN,
+};
+
+static inline bool topo_subleaf(struct topo_scan *tscan, u32 leaf, u32 subleaf,
+ unsigned int *last_dom)
+{
+ unsigned int dom, maxtype;
+ const unsigned int *map;
+ struct {
+ // eax
+ u32 x2apic_shift : 5, // Number of bits to shift APIC ID right
+ // for the topology ID at the next level
+ : 27; // Reserved
+ // ebx
+ u32 num_processors : 16, // Number of processors at current level
+ : 16; // Reserved
+ // ecx
+ u32 level : 8, // Current topology level. Same as sub leaf number
+ type : 8, // Level type. If 0, invalid
+ : 16; // Reserved
+ // edx
+ u32 x2apic_id : 32; // X2APIC ID of the current logical processor
+ } sl;
+
+ switch (leaf) {
+ case 0x0b: maxtype = MAX_TYPE_0B; map = topo_domain_map_0b_1f; break;
+ case 0x1f: maxtype = MAX_TYPE_1F; map = topo_domain_map_0b_1f; break;
+ case 0x80000026: maxtype = MAX_TYPE_80000026; map = topo_domain_map_80000026; break;
+ default: return false;
+ }
+
+ cpuid_subleaf(leaf, subleaf, &sl);
+
+ if (!sl.num_processors || sl.type == INVALID_TYPE)
+ return false;
+
+ if (sl.type >= maxtype) {
+ pr_err_once("Topology: leaf 0x%x:%d Unknown domain type %u\n",
+ leaf, subleaf, sl.type);
+ /*
+ * It really would have been too obvious to make the domain
+ * type space sparse and leave a few reserved types between
+ * the points which might change instead of following the
+ * usual "this can be fixed in software" principle.
+ */
+ dom = *last_dom + 1;
+ } else {
+ dom = map[sl.type];
+ *last_dom = dom;
+ }
+
+ if (!dom) {
+ tscan->c->topo.initial_apicid = sl.x2apic_id;
+ } else if (tscan->c->topo.initial_apicid != sl.x2apic_id) {
+ pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf %d APIC ID mismatch %x != %x\n",
+ leaf, subleaf, tscan->c->topo.initial_apicid, sl.x2apic_id);
+ }
+
+ topology_set_dom(tscan, dom, sl.x2apic_shift, sl.num_processors);
+ return true;
+}
+
+static bool parse_topology_leaf(struct topo_scan *tscan, u32 leaf)
+{
+ unsigned int last_dom;
+ u32 subleaf;
+
+ /* Read all available subleaves and populate the levels */
+ for (subleaf = 0, last_dom = 0; topo_subleaf(tscan, leaf, subleaf, &last_dom); subleaf++);
+
+ /* If subleaf 0 failed to parse, give up */
+ if (!subleaf)
+ return false;
+
+ /*
+ * There are machines in the wild which have shift 0 in the subleaf
+ * 0, but advertise 2 logical processors at that level. They are
+ * truly SMT.
+ */
+ if (!tscan->dom_shifts[TOPO_SMT_DOMAIN] && tscan->dom_ncpus[TOPO_SMT_DOMAIN] > 1) {
+ unsigned int sft = get_count_order(tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+
+ pr_warn_once(FW_BUG "CPUID leaf 0x%x subleaf 0 has shift level 0 but %u CPUs. Fixing it up.\n",
+ leaf, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+ topology_update_dom(tscan, TOPO_SMT_DOMAIN, sft, tscan->dom_ncpus[TOPO_SMT_DOMAIN]);
+ }
+
+ set_cpu_cap(tscan->c, X86_FEATURE_XTOPOLOGY);
+ return true;
+}
+
+bool cpu_parse_topology_ext(struct topo_scan *tscan)
+{
+ /* Intel: Try leaf 0x1F first. */
+ if (tscan->c->cpuid_level >= 0x1f && parse_topology_leaf(tscan, 0x1f))
+ return true;
+
+ /* AMD: Try leaf 0x80000026 first. */
+ if (tscan->c->extended_cpuid_level >= 0x80000026 && parse_topology_leaf(tscan, 0x80000026))
+ return true;
+
+ /* Intel/AMD: Fall back to leaf 0xB if available */
+ return tscan->c->cpuid_level >= 0x0b && parse_topology_leaf(tscan, 0x0b);
+}
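Editorial note: the subleaf walk above can be reproduced from user space to see what the hardware enumerates. A sketch using the compiler-provided <cpuid.h> helper, restricted to leaf 0xb; the field extraction mirrors the bitfield struct in topo_subleaf():

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, subleaf;

	for (subleaf = 0; ; subleaf++) {
		if (!__get_cpuid_count(0x0b, subleaf, &eax, &ebx, &ecx, &edx))
			break;
		if (!(ebx & 0xffff) || !((ecx >> 8) & 0xff))
			break;	/* num_processors == 0 or INVALID_TYPE */
		printf("subleaf %u: type %u shift %u count %u x2apic_id 0x%x\n",
		       subleaf, (ecx >> 8) & 0xff, eax & 0x1f, ebx & 0xffff, edx);
	}
	return 0;
}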
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 11f83d07925e..cb3f900c46fc 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -26,6 +26,7 @@
#include <linux/export.h>
#include <linux/clocksource.h>
#include <linux/cpu.h>
+#include <linux/efi.h>
#include <linux/reboot.h>
#include <linux/static_call.h>
#include <asm/div64.h>
@@ -41,80 +42,97 @@
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define CPUID_VMWARE_FEATURES_LEAF 0x40000010
-#define CPUID_VMWARE_FEATURES_ECX_VMMCALL BIT(0)
-#define CPUID_VMWARE_FEATURES_ECX_VMCALL BIT(1)
-#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
-
-#define VMWARE_CMD_GETVERSION 10
-#define VMWARE_CMD_GETHZ 45
-#define VMWARE_CMD_GETVCPU_INFO 68
-#define VMWARE_CMD_LEGACY_X2APIC 3
-#define VMWARE_CMD_VCPU_RESERVED 31
-#define VMWARE_CMD_STEALCLOCK 91
+#define GETVCPU_INFO_LEGACY_X2APIC BIT(3)
+#define GETVCPU_INFO_VCPU_RESERVED BIT(31)
#define STEALCLOCK_NOT_AVAILABLE (-1)
#define STEALCLOCK_DISABLED 0
#define STEALCLOCK_ENABLED 1
-#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
- __asm__("inl (%%dx), %%eax" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "a"(VMWARE_HYPERVISOR_MAGIC), \
- "c"(VMWARE_CMD_##cmd), \
- "d"(VMWARE_HYPERVISOR_PORT), "b"(UINT_MAX) : \
- "memory")
-
-#define VMWARE_VMCALL(cmd, eax, ebx, ecx, edx) \
- __asm__("vmcall" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "a"(VMWARE_HYPERVISOR_MAGIC), \
- "c"(VMWARE_CMD_##cmd), \
- "d"(0), "b"(UINT_MAX) : \
- "memory")
-
-#define VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx) \
- __asm__("vmmcall" : \
- "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
- "a"(VMWARE_HYPERVISOR_MAGIC), \
- "c"(VMWARE_CMD_##cmd), \
- "d"(0), "b"(UINT_MAX) : \
- "memory")
-
-#define VMWARE_CMD(cmd, eax, ebx, ecx, edx) do { \
- switch (vmware_hypercall_mode) { \
- case CPUID_VMWARE_FEATURES_ECX_VMCALL: \
- VMWARE_VMCALL(cmd, eax, ebx, ecx, edx); \
- break; \
- case CPUID_VMWARE_FEATURES_ECX_VMMCALL: \
- VMWARE_VMMCALL(cmd, eax, ebx, ecx, edx); \
- break; \
- default: \
- VMWARE_PORT(cmd, eax, ebx, ecx, edx); \
- break; \
- } \
- } while (0)
-
struct vmware_steal_time {
union {
- uint64_t clock; /* stolen time counter in units of vtsc */
+ u64 clock; /* stolen time counter in units of vtsc */
struct {
/* only for little-endian */
- uint32_t clock_low;
- uint32_t clock_high;
+ u32 clock_low;
+ u32 clock_high;
};
};
- uint64_t reserved[7];
+ u64 reserved[7];
};
static unsigned long vmware_tsc_khz __ro_after_init;
static u8 vmware_hypercall_mode __ro_after_init;
+unsigned long vmware_hypercall_slow(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ unsigned long out0, rbx, rcx, rdx, rsi, rdi;
+
+ switch (vmware_hypercall_mode) {
+ case CPUID_VMWARE_FEATURES_ECX_VMCALL:
+ asm_inline volatile ("vmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ case CPUID_VMWARE_FEATURES_ECX_VMMCALL:
+ asm_inline volatile ("vmmcall"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ default:
+ asm_inline volatile ("movw %[port], %%dx; inl (%%dx), %%eax"
+ : "=a" (out0), "=b" (rbx), "=c" (rcx),
+ "=d" (rdx), "=S" (rsi), "=D" (rdi)
+ : [port] "i" (VMWARE_HYPERVISOR_PORT),
+ "a" (VMWARE_HYPERVISOR_MAGIC),
+ "b" (in1),
+ "c" (cmd),
+ "d" (in3),
+ "S" (in4),
+ "D" (in5)
+ : "cc", "memory");
+ break;
+ }
+
+ if (out1)
+ *out1 = rbx;
+ if (out2)
+ *out2 = rcx;
+ if (out3)
+ *out3 = rdx;
+ if (out4)
+ *out4 = rsi;
+ if (out5)
+ *out5 = rdi;
+
+ return out0;
+}
+
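Editorial note: the callers below use fixed-argument wrappers (vmware_hypercall1/3/5) rather than the slow path directly; their definitions live in a header that is not part of this hunk. As a hedged sketch only, a three-argument wrapper could be a thin shim over vmware_hypercall_slow():

/*
 * Editorial sketch only: the real vmware_hypercall3() in <asm/vmware.h> is
 * expected to also have a faster inline path; this just shows how its
 * arguments map onto the slow helper above.
 */
static inline unsigned long vmware_hypercall3_sketch(unsigned long cmd,
						     unsigned long in1,
						     u32 *out1, u32 *out2)
{
	return vmware_hypercall_slow(cmd, in1, 0, 0, 0,
				     out1, out2, NULL, NULL, NULL);
}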
static inline int __vmware_platform(void)
{
- uint32_t eax, ebx, ecx, edx;
- VMWARE_CMD(GETVERSION, eax, ebx, ecx, edx);
- return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
+ u32 eax, ebx, ecx;
+
+ eax = vmware_hypercall3(VMWARE_CMD_GETVERSION, 0, &ebx, &ecx);
+ return eax != UINT_MAX && ebx == VMWARE_HYPERVISOR_MAGIC;
}
static unsigned long vmware_get_tsc_khz(void)
@@ -166,21 +184,12 @@ static void __init vmware_cyc2ns_setup(void)
pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset);
}
-static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2)
+static int vmware_cmd_stealclock(u32 addr_hi, u32 addr_lo)
{
- uint32_t result, info;
-
- asm volatile (VMWARE_HYPERCALL :
- "=a"(result),
- "=c"(info) :
- "a"(VMWARE_HYPERVISOR_MAGIC),
- "b"(0),
- "c"(VMWARE_CMD_STEALCLOCK),
- "d"(0),
- "S"(arg1),
- "D"(arg2) :
- "memory");
- return result;
+ u32 info;
+
+ return vmware_hypercall5(VMWARE_CMD_STEALCLOCK, 0, 0, addr_hi, addr_lo,
+ &info);
}
static bool stealclock_enable(phys_addr_t pa)
@@ -215,15 +224,15 @@ static bool vmware_is_stealclock_available(void)
* Return:
* The steal clock reading in ns.
*/
-static uint64_t vmware_steal_clock(int cpu)
+static u64 vmware_steal_clock(int cpu)
{
struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu);
- uint64_t clock;
+ u64 clock;
if (IS_ENABLED(CONFIG_64BIT))
clock = READ_ONCE(steal->clock);
else {
- uint32_t initial_high, low, high;
+ u32 initial_high, low, high;
do {
initial_high = READ_ONCE(steal->clock_high);
@@ -235,7 +244,7 @@ static uint64_t vmware_steal_clock(int cpu)
high = READ_ONCE(steal->clock_high);
} while (initial_high != high);
- clock = ((uint64_t)high << 32) | low;
+ clock = ((u64)high << 32) | low;
}
return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul,
@@ -389,13 +398,13 @@ static void __init vmware_set_capabilities(void)
static void __init vmware_platform_setup(void)
{
- uint32_t eax, ebx, ecx, edx;
- uint64_t lpj, tsc_khz;
+ u32 eax, ebx, ecx;
+ u64 lpj, tsc_khz;
- VMWARE_CMD(GETHZ, eax, ebx, ecx, edx);
+ eax = vmware_hypercall3(VMWARE_CMD_GETHZ, UINT_MAX, &ebx, &ecx);
if (ebx != UINT_MAX) {
- lpj = tsc_khz = eax | (((uint64_t)ebx) << 32);
+ lpj = tsc_khz = eax | (((u64)ebx) << 32);
do_div(tsc_khz, 1000);
WARN_ON(tsc_khz >> 32);
pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
@@ -421,6 +430,9 @@ static void __init vmware_platform_setup(void)
pr_warn("Failed to get TSC freq from the hypervisor\n");
}
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !efi_enabled(EFI_BOOT))
+ x86_init.mpparse.find_mptable = mpparse_find_mptable;
+
vmware_paravirt_ops_setup();
#ifdef CONFIG_X86_IO_APIC
@@ -446,7 +458,7 @@ static u8 __init vmware_select_hypercall(void)
* If !boot_cpu_has(X86_FEATURE_HYPERVISOR), vmware_hypercall_mode
* intentionally defaults to 0.
*/
-static uint32_t __init vmware_platform(void)
+static u32 __init vmware_platform(void)
{
if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
unsigned int eax;
@@ -474,11 +486,64 @@ static uint32_t __init vmware_platform(void)
/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
static bool __init vmware_legacy_x2apic_available(void)
{
- uint32_t eax, ebx, ecx, edx;
- VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx);
- return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) &&
- (eax & BIT(VMWARE_CMD_LEGACY_X2APIC));
+ u32 eax;
+
+ eax = vmware_hypercall1(VMWARE_CMD_GETVCPU_INFO, 0);
+ return !(eax & GETVCPU_INFO_VCPU_RESERVED) &&
+ (eax & GETVCPU_INFO_LEGACY_X2APIC);
+}
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+/*
+ * TDCALL[TDG.VP.VMCALL] uses %rax (arg0) and %rcx (arg2). Therefore,
+ * we remap those registers to %r12 and %r13, respectively.
+ */
+unsigned long vmware_tdx_hypercall(unsigned long cmd,
+ unsigned long in1, unsigned long in3,
+ unsigned long in4, unsigned long in5,
+ u32 *out1, u32 *out2, u32 *out3,
+ u32 *out4, u32 *out5)
+{
+ struct tdx_module_args args = {};
+
+ if (!hypervisor_is_type(X86_HYPER_VMWARE)) {
+ pr_warn_once("Incorrect usage\n");
+ return ULONG_MAX;
+ }
+
+ if (cmd & ~VMWARE_CMD_MASK) {
+ pr_warn_once("Out of range command %lx\n", cmd);
+ return ULONG_MAX;
+ }
+
+ args.rbx = in1;
+ args.rdx = in3;
+ args.rsi = in4;
+ args.rdi = in5;
+ args.r10 = VMWARE_TDX_VENDOR_LEAF;
+ args.r11 = VMWARE_TDX_HCALL_FUNC;
+ args.r12 = VMWARE_HYPERVISOR_MAGIC;
+ args.r13 = cmd;
+ /* CPL */
+ args.r15 = 0;
+
+ __tdx_hypercall(&args);
+
+ if (out1)
+ *out1 = args.rbx;
+ if (out2)
+ *out2 = args.r13;
+ if (out3)
+ *out3 = args.rdx;
+ if (out4)
+ *out4 = args.rsi;
+ if (out5)
+ *out5 = args.rdi;
+
+ return args.r12;
}
+EXPORT_SYMBOL_GPL(vmware_tdx_hypercall);
+#endif
#ifdef CONFIG_AMD_MEM_ENCRYPT
static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb,
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index 415564a6523b..90eba7eb5335 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -71,10 +71,6 @@ static void init_zhaoxin(struct cpuinfo_x86 *c)
{
early_init_zhaoxin(c);
init_intel_cacheinfo(c);
- detect_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
if (c->cpuid_level > 9) {
unsigned int eax = cpuid_eax(10);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c92d88680dbf..340af8155658 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -26,6 +26,7 @@
#include <linux/vmalloc.h>
#include <linux/memblock.h>
+#include <asm/bootparam.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
#include <asm/nmi.h>
@@ -40,6 +41,7 @@
#include <asm/intel_pt.h>
#include <asm/crash.h>
#include <asm/cmdline.h>
+#include <asm/sev.h>
/* Used while preparing memory map entries for second kernel */
struct crash_memmap_data {
@@ -59,6 +61,8 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
*/
cpu_emergency_stop_pt();
+ kdump_sev_callback();
+
disable_local_APIC();
}
@@ -124,6 +128,18 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
+
+ /*
+ * Non-crash kexec calls enc_kexec_begin() while scheduling is still
+ * active. This allows the callback to wait until all in-flight
+ * shared<->private conversions are complete. In a crash scenario,
+ * enc_kexec_begin() gets called after all but one CPU have been shut
+ * down and interrupts have been disabled. This allows the callback to
+ * detect a race with the conversion and report it.
+ */
+ x86_platform.guest.enc_kexec_begin();
+ x86_platform.guest.enc_kexec_finish();
+
crash_save_cpu(regs, safe_smp_processor_id());
}
@@ -170,7 +186,7 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem)
int ret = 0;
/* Exclude the low 1M because it is always reserved */
- ret = crash_exclude_mem_range(cmem, 0, (1<<20)-1);
+ ret = crash_exclude_mem_range(cmem, 0, SZ_1M - 1);
if (ret)
return ret;
@@ -198,8 +214,8 @@ static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
}
/* Prepare elf headers. Return addr and size */
-static int prepare_elf_headers(struct kimage *image, void **addr,
- unsigned long *sz, unsigned long *nr_mem_ranges)
+static int prepare_elf_headers(void **addr, unsigned long *sz,
+ unsigned long *nr_mem_ranges)
{
struct crash_mem *cmem;
int ret;
@@ -221,7 +237,7 @@ static int prepare_elf_headers(struct kimage *image, void **addr,
*nr_mem_ranges = cmem->nr_ranges;
/* By default prepare 64bit headers */
- ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
+ ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz);
out:
vfree(cmem);
@@ -349,7 +365,7 @@ int crash_load_segments(struct kimage *image)
.buf_max = ULONG_MAX, .top_down = false };
/* Prepare elf headers and add a segment */
- ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz, &pnum);
+ ret = prepare_elf_headers(&kbuf.buffer, &kbuf.bufsz, &pnum);
if (ret)
return ret;
@@ -386,8 +402,8 @@ int crash_load_segments(struct kimage *image)
if (ret)
return ret;
image->elf_load_addr = kbuf.mem;
- pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
+ kexec_dprintk("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
return ret;
}
@@ -398,20 +414,26 @@ int crash_load_segments(struct kimage *image)
#undef pr_fmt
#define pr_fmt(fmt) "crash hp: " fmt
-/* These functions provide the value for the sysfs crash_hotplug nodes */
-#ifdef CONFIG_HOTPLUG_CPU
-int arch_crash_hotplug_cpu_support(void)
+int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags)
{
- return crash_check_update_elfcorehdr();
-}
-#endif
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_crash_hotplug_memory_support(void)
-{
- return crash_check_update_elfcorehdr();
-}
+#ifdef CONFIG_KEXEC_FILE
+ if (image->file_mode)
+ return 1;
#endif
+ /*
+ * Initially, crash hotplug support for kexec_load was added
+ * with the KEXEC_UPDATE_ELFCOREHDR flag. Later, this
+ * functionality was expanded to accommodate multiple kexec
+ * segment updates, leading to the introduction of the
+ * KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag bit. Consequently,
+ * when the kexec tool sends either of these flags, it indicates
+ * that the required kexec segment (elfcorehdr) is excluded from
+ * the SHA calculation.
+ */
+ return (kexec_flags & KEXEC_UPDATE_ELFCOREHDR ||
+ kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT);
+}
unsigned int arch_crash_get_elfcorehdr_size(void)
{
@@ -428,10 +450,12 @@ unsigned int arch_crash_get_elfcorehdr_size(void)
/**
* arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
* @image: a pointer to kexec_crash_image
+ * @arg: pointer to struct memory_notify for the memory hotplug case,
+ * NULL for the CPU hotplug case.
*
* Prepare the new elfcorehdr and replace the existing elfcorehdr.
*/
-void arch_crash_handle_hotplug_event(struct kimage *image)
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg)
{
void *elfbuf = NULL, *old_elfcorehdr;
unsigned long nr_mem_ranges;
@@ -452,7 +476,7 @@ void arch_crash_handle_hotplug_event(struct kimage *image)
* Create the new elfcorehdr reflecting the changes to CPU and/or
* memory resources.
*/
- if (prepare_elf_headers(image, &elfbuf, &elfsz, &nr_mem_ranges)) {
+ if (prepare_elf_headers(&elfbuf, &elfsz, &nr_mem_ranges)) {
pr_err("unable to create new elfcorehdr");
goto out;
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index afd09924094e..dd8748c45529 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -2,6 +2,7 @@
/*
* Architecture specific OF callbacks.
*/
+#include <linux/acpi.h>
#include <linux/export.h>
#include <linux/io.h>
#include <linux/interrupt.h>
@@ -24,6 +25,7 @@
#include <asm/pci_x86.h>
#include <asm/setup.h>
#include <asm/i8259.h>
+#include <asm/numa.h>
#include <asm/prom.h>
__initdata u64 initial_dtb;
@@ -82,7 +84,7 @@ static int x86_of_pci_irq_enable(struct pci_dev *dev)
ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (ret)
- return ret;
+ return pcibios_err_to_errno(ret);
if (!pin)
return 0;
@@ -136,7 +138,8 @@ static void __init dtb_cpu_setup(void)
pr_warn("%pOF: missing local APIC ID\n", dn);
continue;
}
- generic_processor_info(apic_id);
+ topology_register_apic(apic_id, CPU_ACPIID_INVALID, true);
+ set_apicid_to_node(apic_id, of_node_to_nid(dn));
}
}
@@ -277,36 +280,40 @@ static void __init dtb_apic_setup(void)
dtb_ioapic_setup();
}
-#ifdef CONFIG_OF_EARLY_FLATTREE
+static void __init x86_dtb_parse_smp_config(void)
+{
+ if (!of_have_populated_dt())
+ return;
+
+ dtb_setup_hpet();
+ dtb_apic_setup();
+}
+
void __init x86_flattree_get_config(void)
{
+#ifdef CONFIG_OF_EARLY_FLATTREE
u32 size, map_len;
void *dt;
- if (!initial_dtb)
- return;
+ if (initial_dtb) {
+ map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
- map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
+ dt = early_memremap(initial_dtb, map_len);
+ size = fdt_totalsize(dt);
+ if (map_len < size) {
+ early_memunmap(dt, map_len);
+ dt = early_memremap(initial_dtb, size);
+ map_len = size;
+ }
- dt = early_memremap(initial_dtb, map_len);
- size = fdt_totalsize(dt);
- if (map_len < size) {
- early_memunmap(dt, map_len);
- dt = early_memremap(initial_dtb, size);
- map_len = size;
+ early_init_dt_verify(dt, __pa(dt));
}
- early_init_dt_verify(dt);
unflatten_and_copy_device_tree();
- early_memunmap(dt, map_len);
-}
-#endif
-void __init x86_dtb_init(void)
-{
- if (!of_have_populated_dt())
- return;
-
- dtb_setup_hpet();
- dtb_apic_setup();
+ if (initial_dtb)
+ early_memunmap(dt, map_len);
+#endif
+ if (acpi_disabled && of_have_populated_dt())
+ x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config;
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index f18ca44c904b..a7d562697e50 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -405,12 +405,12 @@ static void __die_header(const char *str, struct pt_regs *regs, long err)
pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
printk(KERN_DEFAULT
- "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
- pr,
+ "Oops: %s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff,
+ ++die_counter, pr,
IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
- IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
+ IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION) ?
(boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
}
NOKPROBE_SYMBOL(__die_header);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index fb8cf953380d..82b96ed9890a 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -532,9 +532,10 @@ u64 __init e820__range_update(u64 start, u64 size, enum e820_type old_type, enum
return __e820__range_update(e820_table, start, size, old_type, new_type);
}
-static u64 __init e820__range_update_kexec(u64 start, u64 size, enum e820_type old_type, enum e820_type new_type)
+u64 __init e820__range_update_table(struct e820_table *t, u64 start, u64 size,
+ enum e820_type old_type, enum e820_type new_type)
{
- return __e820__range_update(e820_table_kexec, start, size, old_type, new_type);
+ return __e820__range_update(t, start, size, old_type, new_type);
}
/* Remove a range of memory from the E820 table: */
@@ -806,7 +807,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
addr = memblock_phys_alloc(size, align);
if (addr) {
- e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
+ e820__range_update_table(e820_table_kexec, addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
e820__update_table_kexec();
}
@@ -827,7 +828,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
/*
* Find the highest page frame number we have available
*/
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type type)
+static unsigned long __init e820__end_ram_pfn(unsigned long limit_pfn)
{
int i;
unsigned long last_pfn = 0;
@@ -838,7 +839,8 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type
unsigned long start_pfn;
unsigned long end_pfn;
- if (entry->type != type)
+ if (entry->type != E820_TYPE_RAM &&
+ entry->type != E820_TYPE_ACPI)
continue;
start_pfn = entry->addr >> PAGE_SHIFT;
@@ -864,12 +866,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, enum e820_type
unsigned long __init e820__end_of_ram_pfn(void)
{
- return e820_end_pfn(MAX_ARCH_PFN, E820_TYPE_RAM);
+ return e820__end_ram_pfn(MAX_ARCH_PFN);
}
unsigned long __init e820__end_of_low_ram_pfn(void)
{
- return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_TYPE_RAM);
+ return e820__end_ram_pfn(1UL << (32 - PAGE_SHIFT));
}
static void __init early_panic(char *msg)
@@ -1016,15 +1018,6 @@ void __init e820__reserve_setup_data(void)
e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
- /*
- * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need
- * to be reserved.
- */
- if (data->type != SETUP_EFI && data->type != SETUP_IMA)
- e820__range_update_kexec(pa_data,
- sizeof(*data) + data->len,
- E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
-
if (data->type == SETUP_INDIRECT) {
len += data->len;
early_memunmap(data, sizeof(*data));
@@ -1036,12 +1029,9 @@ void __init e820__reserve_setup_data(void)
indirect = (struct setup_indirect *)data->data;
- if (indirect->type != SETUP_INDIRECT) {
+ if (indirect->type != SETUP_INDIRECT)
e820__range_update(indirect->addr, indirect->len,
E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
- e820__range_update_kexec(indirect->addr, indirect->len,
- E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
- }
}
pa_data = pa_next;
@@ -1049,7 +1039,6 @@ void __init e820__reserve_setup_data(void)
}
e820__update_table(e820_table);
- e820__update_table(e820_table_kexec);
pr_info("extended physical RAM map:\n");
e820__print_table("reserve setup_data");
@@ -1157,11 +1146,8 @@ void __init e820__reserve_resources(void)
struct resource *res;
u64 end;
- res = memblock_alloc(sizeof(*res) * e820_table->nr_entries,
+ res = memblock_alloc_or_panic(sizeof(*res) * e820_table->nr_entries,
SMP_CACHE_BYTES);
- if (!res)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- sizeof(*res) * e820_table->nr_entries);
e820_res = res;
for (i = 0; i < e820_table->nr_entries; i++) {
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a6c1867fc7aa..6b6f32f40cbe 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -17,8 +17,8 @@
#include <linux/bcma/bcma.h>
#include <linux/bcma/bcma_regs.h>
#include <linux/platform_data/x86/apple.h>
-#include <drm/i915_drm.h>
-#include <drm/i915_pciids.h>
+#include <drm/intel/i915_drm.h>
+#include <drm/intel/pciids.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
#include <asm/io_apic.h>
@@ -518,47 +518,46 @@ static const struct intel_early_ops gen11_early_ops __initconst = {
/* Intel integrated GPUs for which we need to reserve "stolen memory" */
static const struct pci_device_id intel_early_ids[] __initconst = {
- INTEL_I830_IDS(&i830_early_ops),
- INTEL_I845G_IDS(&i845_early_ops),
- INTEL_I85X_IDS(&i85x_early_ops),
- INTEL_I865G_IDS(&i865_early_ops),
- INTEL_I915G_IDS(&gen3_early_ops),
- INTEL_I915GM_IDS(&gen3_early_ops),
- INTEL_I945G_IDS(&gen3_early_ops),
- INTEL_I945GM_IDS(&gen3_early_ops),
- INTEL_VLV_IDS(&gen6_early_ops),
- INTEL_PINEVIEW_G_IDS(&gen3_early_ops),
- INTEL_PINEVIEW_M_IDS(&gen3_early_ops),
- INTEL_I965G_IDS(&gen3_early_ops),
- INTEL_G33_IDS(&gen3_early_ops),
- INTEL_I965GM_IDS(&gen3_early_ops),
- INTEL_GM45_IDS(&gen3_early_ops),
- INTEL_G45_IDS(&gen3_early_ops),
- INTEL_IRONLAKE_D_IDS(&gen3_early_ops),
- INTEL_IRONLAKE_M_IDS(&gen3_early_ops),
- INTEL_SNB_D_IDS(&gen6_early_ops),
- INTEL_SNB_M_IDS(&gen6_early_ops),
- INTEL_IVB_M_IDS(&gen6_early_ops),
- INTEL_IVB_D_IDS(&gen6_early_ops),
- INTEL_HSW_IDS(&gen6_early_ops),
- INTEL_BDW_IDS(&gen8_early_ops),
- INTEL_CHV_IDS(&chv_early_ops),
- INTEL_SKL_IDS(&gen9_early_ops),
- INTEL_BXT_IDS(&gen9_early_ops),
- INTEL_KBL_IDS(&gen9_early_ops),
- INTEL_CFL_IDS(&gen9_early_ops),
- INTEL_GLK_IDS(&gen9_early_ops),
- INTEL_CNL_IDS(&gen9_early_ops),
- INTEL_ICL_11_IDS(&gen11_early_ops),
- INTEL_EHL_IDS(&gen11_early_ops),
- INTEL_JSL_IDS(&gen11_early_ops),
- INTEL_TGL_12_IDS(&gen11_early_ops),
- INTEL_RKL_IDS(&gen11_early_ops),
- INTEL_ADLS_IDS(&gen11_early_ops),
- INTEL_ADLP_IDS(&gen11_early_ops),
- INTEL_ADLN_IDS(&gen11_early_ops),
- INTEL_RPLS_IDS(&gen11_early_ops),
- INTEL_RPLP_IDS(&gen11_early_ops),
+ INTEL_I830_IDS(INTEL_VGA_DEVICE, &i830_early_ops),
+ INTEL_I845G_IDS(INTEL_VGA_DEVICE, &i845_early_ops),
+ INTEL_I85X_IDS(INTEL_VGA_DEVICE, &i85x_early_ops),
+ INTEL_I865G_IDS(INTEL_VGA_DEVICE, &i865_early_ops),
+ INTEL_I915G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I915GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I945G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I945GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_VLV_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_PNV_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I965G_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_G33_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_I965GM_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_GM45_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_G45_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_ILK_IDS(INTEL_VGA_DEVICE, &gen3_early_ops),
+ INTEL_SNB_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_IVB_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_HSW_IDS(INTEL_VGA_DEVICE, &gen6_early_ops),
+ INTEL_BDW_IDS(INTEL_VGA_DEVICE, &gen8_early_ops),
+ INTEL_CHV_IDS(INTEL_VGA_DEVICE, &chv_early_ops),
+ INTEL_SKL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_BXT_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_KBL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_CFL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_WHL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_CML_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_GLK_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_CNL_IDS(INTEL_VGA_DEVICE, &gen9_early_ops),
+ INTEL_ICL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_EHL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_JSL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_TGL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RKL_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_ADLS_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_ADLP_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_ADLN_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RPLS_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RPLU_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
+ INTEL_RPLP_IDS(INTEL_VGA_DEVICE, &gen11_early_ops),
};
struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
@@ -779,13 +778,13 @@ static int __init check_dev_quirk(int num, int slot, int func)
type = read_pci_config_byte(num, slot, func,
PCI_HEADER_TYPE);
- if ((type & 0x7f) == PCI_HEADER_TYPE_BRIDGE) {
+ if ((type & PCI_HEADER_TYPE_MASK) == PCI_HEADER_TYPE_BRIDGE) {
sec = read_pci_config_byte(num, slot, func, PCI_SECONDARY_BUS);
if (sec > num)
early_pci_scan_bus(sec);
}
- if (!(type & 0x80))
+ if (!(type & PCI_HEADER_TYPE_MFD))
return -1;
return 0;
diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c
index e963344b0449..9535a6507db7 100644
--- a/arch/x86/kernel/eisa.c
+++ b/arch/x86/kernel/eisa.c
@@ -2,6 +2,7 @@
/*
* EISA specific code
*/
+#include <linux/cc_platform.h>
#include <linux/ioport.h>
#include <linux/eisa.h>
#include <linux/io.h>
@@ -10,15 +11,15 @@
static __init int eisa_bus_probe(void)
{
- void __iomem *p;
+ u32 *p;
- if (xen_pv_domain() && !xen_initial_domain())
+ if ((xen_pv_domain() && !xen_initial_domain()) || cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
return 0;
- p = ioremap(0x0FFFD9, 4);
- if (p && readl(p) == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24))
+ p = memremap(0x0FFFD9, 4, MEMREMAP_WB);
+ if (p && *p == 'E' + ('I' << 8) + ('S' << 16) + ('A' << 24))
EISA_bus = 1;
- iounmap(p);
+ memunmap(p);
return 0;
}
subsys_initcall(eisa_bus_probe);
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 16f9814c9be0..6726e0473d0b 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -106,6 +106,10 @@ void __init init_espfix_bsp(void)
pgd_t *pgd;
p4d_t *p4d;
+ /* FRED systems always restore the full value of %rsp */
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ return;
+
/* Install the espfix pud into the kernel page directory */
pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
@@ -129,6 +133,10 @@ void init_espfix_ap(int cpu)
void *stack_page;
pteval_t ptemask;
+ /* FRED systems always restore the full value of %rsp */
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ return;
+
/* We only have to do this once... */
if (likely(per_cpu(espfix_stack, cpu)))
return; /* Already initialized */
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
index 794e70151203..edbafc5940e3 100644
--- a/arch/x86/kernel/fpu/bugs.c
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -2,6 +2,9 @@
/*
* x86 FPU bug checks:
*/
+#include <linux/printk.h>
+
+#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
/*
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index a21a4d0ecc34..1209c7aebb21 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -145,8 +145,8 @@ void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask)
asm volatile(
"fnclex\n\t"
"emms\n\t"
- "fildl %P[addr]" /* set F?P to defined value */
- : : [addr] "m" (fpstate));
+ "fildl %[addr]" /* set F?P to defined value */
+ : : [addr] "m" (*fpstate));
}
if (use_xsave()) {
@@ -308,7 +308,7 @@ EXPORT_SYMBOL_GPL(fpu_update_guest_xfd);
* Must be invoked from KVM after a VMEXIT before enabling interrupts when
* XFD write emulation is disabled. This is required because the guest can
* freely modify XFD and the state at VMEXIT is not guaranteed to be the
- * same as the state on VMENTER. So software state has to be udpated before
+ * same as the state on VMENTER. So software state has to be updated before
* any operation which depends on it can take place.
*
* Note: It can be invoked unconditionally even when write emulation is
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 6bc1eb2a21bd..887b0b8e21e3 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -190,7 +190,8 @@ int ssp_get(struct task_struct *target, const struct user_regset *regset,
struct fpu *fpu = &target->thread.fpu;
struct cet_user_state *cetregs;
- if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
+ !ssp_active(target, regset))
return -ENODEV;
sync_fpstate(fpu);
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 558076dbde5b..8f62e0666dea 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -156,10 +156,11 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
return !err;
}
-static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
+static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru)
{
if (use_xsave())
- return xsave_to_user_sigframe(buf);
+ return xsave_to_user_sigframe(buf, pkru);
+
if (use_fxsr())
return fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
else
@@ -185,7 +186,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
* For [f]xsave state, update the SW reserved fields in the [f]xsave frame
* indicating the absence/presence of the extended state to the user.
*/
-bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
+bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru)
{
struct task_struct *tsk = current;
struct fpstate *fpstate = tsk->thread.fpu.fpstate;
@@ -228,7 +229,7 @@ retry:
fpregs_restore_userregs();
pagefault_disable();
- ret = copy_fpregs_to_sigframe(buf_fx);
+ ret = copy_fpregs_to_sigframe(buf_fx, pkru);
pagefault_enable();
fpregs_unlock();
@@ -274,12 +275,13 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures,
* Attempt to restore the FPU registers directly from user memory.
* Pagefaults are handled and any errors returned are fatal.
*/
-static bool restore_fpregs_from_user(void __user *buf, u64 xrestore,
- bool fx_only, unsigned int size)
+static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only)
{
struct fpu *fpu = &current->thread.fpu;
int ret;
+ /* Restore enabled features only. */
+ xrestore &= fpu->fpstate->user_xfeatures;
retry:
fpregs_lock();
/* Ensure that XFD is up to date */
@@ -309,7 +311,7 @@ retry:
if (ret != X86_TRAP_PF)
return false;
- if (!fault_in_readable(buf, size))
+ if (!fault_in_readable(buf, fpu->fpstate->user_size))
goto retry;
return false;
}
@@ -339,7 +341,6 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
struct user_i387_ia32_struct env;
bool success, fx_only = false;
union fpregs_state *fpregs;
- unsigned int state_size;
u64 user_xfeatures = 0;
if (use_xsave()) {
@@ -349,17 +350,14 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx,
return false;
fx_only = !fx_sw_user.magic1;
- state_size = fx_sw_user.xstate_size;
user_xfeatures = fx_sw_user.xfeatures;
} else {
user_xfeatures = XFEATURE_MASK_FPSSE;
- state_size = fpu->fpstate->user_size;
}
if (likely(!ia32_fxstate)) {
/* Restore the FPU registers directly from user memory. */
- return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only,
- state_size);
+ return restore_fpregs_from_user(buf_fx, user_xfeatures, fx_only);
}
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 117e74c44e75..27417b685c1d 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -13,16 +13,20 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
+#include <linux/coredump.h>
#include <asm/fpu/api.h>
#include <asm/fpu/regset.h>
#include <asm/fpu/signal.h>
#include <asm/fpu/xcr.h>
+#include <asm/cpuid.h>
#include <asm/tlbflush.h>
#include <asm/prctl.h>
#include <asm/elf.h>
+#include <uapi/asm/elf.h>
+
#include "context.h"
#include "internal.h"
#include "legacy.h"
@@ -178,10 +182,11 @@ void fpu__init_cpu_xstate(void)
* Must happen after CR4 setup and before xsetbv() to allow KVM
* lazy passthrough. Write independent of the dynamic state static
* key as that does not work on the boot CPU. This also ensures
- * that any stale state is wiped out from XFD.
+ * that any stale state is wiped out from XFD. Reset the per CPU
+ * xfd cache too.
*/
if (cpu_feature_enabled(X86_FEATURE_XFD))
- wrmsrl(MSR_IA32_XFD, init_fpstate.xfd);
+ xfd_set_state(init_fpstate.xfd);
/*
* XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
@@ -228,7 +233,7 @@ static void __init setup_xstate_cache(void)
xmm_space);
for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
- cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+ cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
xstate_sizes[i] = eax;
xstate_flags[i] = ecx;
@@ -394,7 +399,7 @@ int xfeature_size(int xfeature_nr)
u32 eax, ebx, ecx, edx;
CHECK_XFEATURE(xfeature_nr);
- cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
+ cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx);
return eax;
}
@@ -437,9 +442,9 @@ static void __init __xstate_dump_leaves(void)
* just in case there are some goodies up there
*/
for (i = 0; i < XFEATURE_MAX + 10; i++) {
- cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+ cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
- XSTATE_CPUID, i, eax, ebx, ecx, edx);
+ CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx);
}
}
@@ -480,7 +485,7 @@ static int __init check_xtile_data_against_struct(int size)
* Check the maximum palette id:
* eax: the highest numbered palette subleaf.
*/
- cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
+ cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx);
/*
* Cross-check each tile size and find the maximum number of
@@ -494,7 +499,7 @@ static int __init check_xtile_data_against_struct(int size)
* eax[31:16]: bytes per tile
* ebx[31:16]: the max names (or max number of tiles)
*/
- cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx);
+ cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx);
tile_size = eax >> 16;
max = ebx >> 16;
@@ -629,7 +634,7 @@ static unsigned int __init get_compacted_size(void)
* are no supervisor states, but XSAVEC still uses compacted
* format.
*/
- cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
return ebx;
}
@@ -670,7 +675,7 @@ static unsigned int __init get_xsave_size_user(void)
* containing all the *user* state components
* corresponding to bits currently set in XCR0.
*/
- cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
return ebx;
}
@@ -759,21 +764,16 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
return;
}
- if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
- WARN_ON_FPU(1);
- return;
- }
-
/*
* Find user xstates supported by the processor.
*/
- cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);
/*
* Find supervisor xstates supported by the processor.
*/
- cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);
if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
@@ -787,6 +787,9 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
goto out_disable;
}
+ fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features &
+ XFEATURE_MASK_INDEPENDENT;
+
/*
* Clear XSAVE features that are disabled in the normal CPUID.
*/
@@ -990,6 +993,20 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
return __raw_xsave_addr(xsave, xfeature_nr);
}
+EXPORT_SYMBOL_GPL(get_xsave_addr);
+
+/*
+ * Given an xstate feature nr, calculate where in the xsave buffer the state is.
+ * The xsave buffer must be in standard (non-compacted) format, as is the case
+ * for user mode signal frames.
+ */
+void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
+{
+ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
+ return NULL;
+
+ return (void __user *)xsave + xstate_offsets[xfeature_nr];
+}
#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -1433,8 +1450,8 @@ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor)
return rstor;
/*
- * XSAVE(S): clone(), fpu_swap_kvm_fpu()
- * XRSTORS(S): fpu_swap_kvm_fpu()
+ * XSAVE(S): clone(), fpu_swap_kvm_fpstate()
+ * XRSTORS(S): fpu_swap_kvm_fpstate()
*/
/*
@@ -1836,3 +1853,89 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
+
+#ifdef CONFIG_COREDUMP
+static const char owner_name[] = "LINUX";
+
+/*
+ * Dump type, size, offset and flag values for every xfeature that is present.
+ */
+static int dump_xsave_layout_desc(struct coredump_params *cprm)
+{
+ int num_records = 0;
+ int i;
+
+ for_each_extended_xfeature(i, fpu_user_cfg.max_features) {
+ struct x86_xfeat_component xc = {
+ .type = i,
+ .size = xstate_sizes[i],
+ .offset = xstate_offsets[i],
+ /* reserved for future use */
+ .flags = 0,
+ };
+
+ if (!dump_emit(cprm, &xc, sizeof(xc)))
+ return 0;
+
+ num_records++;
+ }
+ return num_records;
+}
+
+static u32 get_xsave_desc_size(void)
+{
+ u32 cnt = 0;
+ u32 i;
+
+ for_each_extended_xfeature(i, fpu_user_cfg.max_features)
+ cnt++;
+
+ return cnt * (sizeof(struct x86_xfeat_component));
+}
+
+int elf_coredump_extra_notes_write(struct coredump_params *cprm)
+{
+ int num_records = 0;
+ struct elf_note en;
+
+ if (!fpu_user_cfg.max_features)
+ return 0;
+
+ en.n_namesz = sizeof(owner_name);
+ en.n_descsz = get_xsave_desc_size();
+ en.n_type = NT_X86_XSAVE_LAYOUT;
+
+ if (!dump_emit(cprm, &en, sizeof(en)))
+ return 1;
+ if (!dump_emit(cprm, owner_name, en.n_namesz))
+ return 1;
+ if (!dump_align(cprm, 4))
+ return 1;
+
+ num_records = dump_xsave_layout_desc(cprm);
+ if (!num_records)
+ return 1;
+
+ /* Total size must match the number of records times the record size */
+ if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz)
+ return 1;
+
+ return 0;
+}
+
+int elf_coredump_extra_notes_size(void)
+{
+ int size;
+
+ if (!fpu_user_cfg.max_features)
+ return 0;
+
+ /* .note header */
+ size = sizeof(struct elf_note);
+ /* Name plus alignment to 4 bytes */
+ size += roundup(sizeof(owner_name), 4);
+ size += get_xsave_desc_size();
+
+ return size;
+}
+#endif /* CONFIG_COREDUMP */
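Editorial note: the note payload written above is just an array of fixed-size component records. A user-space sketch of how a core-dump consumer could walk that payload; the struct layout is assumed to mirror the type/size/offset/flags fields dumped by dump_xsave_layout_desc():

#include <stdint.h>
#include <stdio.h>

struct xfeat_component {	/* assumed mirror of struct x86_xfeat_component */
	uint32_t type, size, offset, flags;
};

static void walk_xsave_layout(const void *desc, uint32_t descsz)
{
	const struct xfeat_component *xc = desc;
	uint32_t i, n = descsz / sizeof(*xc);

	for (i = 0; i < n; i++)
		printf("xfeature %u: %u bytes at offset %u\n",
		       xc[i].type, xc[i].size, xc[i].offset);
}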
diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h
index 3518fb26d06b..aa16f1a1bbcf 100644
--- a/arch/x86/kernel/fpu/xstate.h
+++ b/arch/x86/kernel/fpu/xstate.h
@@ -54,7 +54,7 @@ extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void
extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system_xstate(unsigned int legacy_size);
-extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
+extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr);
static inline u64 xfeatures_mask_supervisor(void)
{
@@ -64,9 +64,31 @@ static inline u64 xfeatures_mask_supervisor(void)
static inline u64 xfeatures_mask_independent(void)
{
if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR))
- return XFEATURE_MASK_INDEPENDENT & ~XFEATURE_MASK_LBR;
+ return fpu_kernel_cfg.independent_features & ~XFEATURE_MASK_LBR;
- return XFEATURE_MASK_INDEPENDENT;
+ return fpu_kernel_cfg.independent_features;
+}
+
+/*
+ * Update the value of PKRU register that was already pushed onto the signal frame.
+ */
+static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 mask, u32 pkru)
+{
+ u64 xstate_bv;
+ int err;
+
+ if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE)))
+ return 0;
+
+ /* Mark PKRU as in-use so that it is restored correctly. */
+ xstate_bv = (mask & xfeatures_in_use()) | XFEATURE_MASK_PKRU;
+
+ err = __put_user(xstate_bv, &buf->header.xfeatures);
+ if (err)
+ return err;
+
+ /* Update PKRU value in the userspace xsave buffer. */
+ return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU));
}
/* XSAVE/XRSTOR wrapper functions */
@@ -108,21 +130,17 @@ static inline u64 xfeatures_mask_independent(void)
* Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT
* supports modified optimization which is not supported by XSAVE.
*
- * We use XSAVE as a fallback.
- *
- * The 661 label is defined in the ALTERNATIVE* macros as the address of the
- * original instruction which gets replaced. We need to use it here as the
- * address of the instruction where we might get an exception at.
+ * Use XSAVE as a fallback.
*/
#define XSTATE_XSAVE(st, lmask, hmask, err) \
- asm volatile(ALTERNATIVE_3(XSAVE, \
+ asm volatile("1: " ALTERNATIVE_3(XSAVE, \
XSAVEOPT, X86_FEATURE_XSAVEOPT, \
XSAVEC, X86_FEATURE_XSAVEC, \
XSAVES, X86_FEATURE_XSAVES) \
"\n" \
"xor %[err], %[err]\n" \
"3:\n" \
- _ASM_EXTABLE_TYPE_REG(661b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
+ _ASM_EXTABLE_TYPE_REG(1b, 3b, EX_TYPE_EFAULT_REG, %[err]) \
: [err] "=r" (err) \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
@@ -132,11 +150,11 @@ static inline u64 xfeatures_mask_independent(void)
* XSAVE area format.
*/
#define XSTATE_XRESTORE(st, lmask, hmask) \
- asm volatile(ALTERNATIVE(XRSTOR, \
+ asm volatile("1: " ALTERNATIVE(XRSTOR, \
XRSTORS, X86_FEATURE_XSAVES) \
"\n" \
"3:\n" \
- _ASM_EXTABLE_TYPE(661b, 3b, EX_TYPE_FPU_RESTORE) \
+ _ASM_EXTABLE_TYPE(1b, 3b, EX_TYPE_FPU_RESTORE) \
: \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
@@ -148,20 +166,26 @@ static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rs
#endif
#ifdef CONFIG_X86_64
+static inline void xfd_set_state(u64 xfd)
+{
+ wrmsrl(MSR_IA32_XFD, xfd);
+ __this_cpu_write(xfd_state, xfd);
+}
+
static inline void xfd_update_state(struct fpstate *fpstate)
{
if (fpu_state_size_dynamic()) {
u64 xfd = fpstate->xfd;
- if (__this_cpu_read(xfd_state) != xfd) {
- wrmsrl(MSR_IA32_XFD, xfd);
- __this_cpu_write(xfd_state, xfd);
- }
+ if (__this_cpu_read(xfd_state) != xfd)
+ xfd_set_state(xfd);
}
}
extern int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu);
#else
+static inline void xfd_set_state(u64 xfd) { }
+
static inline void xfd_update_state(struct fpstate *fpstate) { }
static inline int __xfd_enable_feature(u64 which, struct fpu_guest *guest_fpu) {
@@ -254,7 +278,7 @@ static inline u64 xfeatures_need_sigframe_write(void)
* The caller has to zero buf::header before calling this because XSAVE*
* does not touch the reserved fields in the header.
*/
-static inline int xsave_to_user_sigframe(struct xregs_state __user *buf)
+static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkru)
{
/*
* Include the features which are not xsaved/rstored by the kernel
@@ -279,6 +303,9 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf)
XSTATE_OP(XSAVE, buf, lmask, hmask, err);
clac();
+ if (!err)
+ err = update_pkru_in_sigframe(buf, mask, pkru);
+
return err;
}
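
The xfd_set_state()/xfd_update_state() split above boils down to a shadowed, write-avoiding register update. A hedged userspace analogue, with write_xfd_msr() standing in for the privileged wrmsrl(MSR_IA32_XFD, ...):

/*
 * Hedged analogue of the xfd_set_state()/xfd_update_state() pattern: keep a
 * shadow copy of the last value written to an expensive "register" and only
 * perform the write when the value actually changes. write_xfd_msr() is a
 * stand-in, not a real API.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t xfd_shadow;		/* analogue of the per-CPU xfd_state */

static void write_xfd_msr(uint64_t val)	/* stand-in for the MSR write */
{
	printf("expensive write: %#llx\n", (unsigned long long)val);
}

static void xfd_set(uint64_t xfd)
{
	write_xfd_msr(xfd);
	xfd_shadow = xfd;
}

static void xfd_update(uint64_t xfd)
{
	if (xfd_shadow != xfd)		/* skip the write when nothing changed */
		xfd_set(xfd);
}

int main(void)
{
	xfd_update(0x4);	/* first write goes through */
	xfd_update(0x4);	/* cached: no write */
	xfd_update(0x0);	/* changed: write again */
	return 0;
}
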
diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c
new file mode 100644
index 000000000000..5e2cd1004980
--- /dev/null
+++ b/arch/x86/kernel/fred.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+
+#include <asm/desc.h>
+#include <asm/fred.h>
+#include <asm/tlbflush.h>
+#include <asm/traps.h>
+
+/* #DB in the kernel would imply the use of a kernel debugger. */
+#define FRED_DB_STACK_LEVEL 1UL
+#define FRED_NMI_STACK_LEVEL 2UL
+#define FRED_MC_STACK_LEVEL 2UL
+/*
+ * #DF is the highest level because a #DF means "something went wrong
+ * *while delivering an exception*." The number of cases for which that
+ * can happen with FRED is drastically reduced and basically amounts to
+ * "the stack you pointed me to is broken." Thus, always change stacks
+ * on #DF, which means it should be at the highest level.
+ */
+#define FRED_DF_STACK_LEVEL 3UL
+
+#define FRED_STKLVL(vector, lvl) ((lvl) << (2 * (vector)))
+
+DEFINE_PER_CPU(unsigned long, fred_rsp0);
+EXPORT_PER_CPU_SYMBOL(fred_rsp0);
+
+void cpu_init_fred_exceptions(void)
+{
+ /* When FRED is enabled by default, remove this log message */
+ pr_info("Initialize FRED on CPU%d\n", smp_processor_id());
+
+ /*
+ * If a kernel event is delivered before a CPU goes to user level for
+ * the first time, its SS is NULL thus NULL is pushed into the SS field
+ * of the FRED stack frame. But before ERETS is executed, the CPU may
+ * context switch to another task and go to user level. Then when the
+ * CPU comes back to kernel mode, SS is changed to __KERNEL_DS. Later
+ * when ERETS is executed to return from the kernel event handler, a #GP
+ * fault is generated because SS doesn't match the SS saved in the FRED
+ * stack frame.
+ *
+ * Initialize SS to __KERNEL_DS when enabling FRED to avoid such #GPs.
+ */
+ loadsegment(ss, __KERNEL_DS);
+
+ wrmsrl(MSR_IA32_FRED_CONFIG,
+ /* Reserve for CALL emulation */
+ FRED_CONFIG_REDZONE |
+ FRED_CONFIG_INT_STKLVL(0) |
+ FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user));
+
+ wrmsrl(MSR_IA32_FRED_STKLVLS, 0);
+
+ /*
+ * After a CPU offline/online cycle, the FRED RSP0 MSR should be
+ * resynchronized with its per-CPU cache.
+ */
+ wrmsrl(MSR_IA32_FRED_RSP0, __this_cpu_read(fred_rsp0));
+
+ wrmsrl(MSR_IA32_FRED_RSP1, 0);
+ wrmsrl(MSR_IA32_FRED_RSP2, 0);
+ wrmsrl(MSR_IA32_FRED_RSP3, 0);
+
+ /* Enable FRED */
+ cr4_set_bits(X86_CR4_FRED);
+ /* Any further IDT use is a bug */
+ idt_invalidate();
+
+ /* Use int $0x80 for 32-bit system calls in FRED mode */
+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+}
+
+/* Must be called after setup_cpu_entry_areas() */
+void cpu_init_fred_rsps(void)
+{
+ /*
+ * The purpose of separate stacks for NMI, #DB and #MC *in the kernel*
+ * (remember that user space faults are always taken on stack level 0)
+ * is to avoid overflowing the kernel stack.
+ */
+ wrmsrl(MSR_IA32_FRED_STKLVLS,
+ FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) |
+ FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) |
+ FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) |
+ FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL));
+
+ /* The FRED equivalents to IST stacks... */
+ wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB));
+ wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI));
+ wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF));
+}
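
The MSR_IA32_FRED_STKLVLS value written in cpu_init_fred_rsps() packs a 2-bit stack level per exception vector, via FRED_STKLVL(vector, lvl) = lvl << (2 * vector). A small stand-alone sketch of that packing, assuming the standard x86 vector numbers (#DB=1, NMI=2, #DF=8, #MC=18):

/*
 * Hedged sketch of the MSR_IA32_FRED_STKLVLS packing used above: two bits
 * of stack level per exception vector. Vector numbers are the standard x86
 * values; the levels mirror the FRED_*_STACK_LEVEL constants in fred.c.
 */
#include <stdint.h>
#include <stdio.h>

#define STKLVL(vector, lvl)	((uint64_t)(lvl) << (2 * (vector)))

int main(void)
{
	uint64_t stklvls = STKLVL(1,  1) |	/* #DB -> level 1 */
			   STKLVL(2,  2) |	/* NMI -> level 2 */
			   STKLVL(8,  3) |	/* #DF -> level 3 */
			   STKLVL(18, 2);	/* #MC -> level 2 */

	printf("FRED_STKLVLS = %#llx\n", (unsigned long long)stklvls);
	return 0;
}
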
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 12df54ff0e81..166bc0ea3bdf 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -25,6 +25,7 @@
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
+#include <linux/execmem.h>
#include <trace/syscall.h>
@@ -117,10 +118,13 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code,
return ret;
/* replace the text with the new text */
- if (ftrace_poke_late)
+ if (ftrace_poke_late) {
text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
- else
- text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
+ } else {
+ mutex_lock(&text_mutex);
+ text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE);
+ mutex_unlock(&text_mutex);
+ }
return 0;
}
@@ -260,25 +264,14 @@ void arch_ftrace_update_code(int command)
/* Currently only x86_64 supports dynamic trampolines */
#ifdef CONFIG_X86_64
-#ifdef CONFIG_MODULES
-#include <linux/moduleloader.h>
-/* Module allocation simplifies allocating memory for code */
static inline void *alloc_tramp(unsigned long size)
{
- return module_alloc(size);
+ return execmem_alloc(EXECMEM_FTRACE, size);
}
static inline void tramp_free(void *tramp)
{
- module_memfree(tramp);
+ execmem_free(tramp);
}
-#else
-/* Trampolines can only be created if modules are supported */
-static inline void *alloc_tramp(unsigned long size)
-{
- return NULL;
-}
-static inline void tramp_free(void *tramp) { }
-#endif
/* Defined as markers to the end of the ftrace default trampolines */
extern void ftrace_regs_caller_end(void);
@@ -307,7 +300,8 @@ union ftrace_op_code_union {
} __attribute__((packed));
};
-#define RET_SIZE (IS_ENABLED(CONFIG_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_SLS))
+#define RET_SIZE \
+ (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) ? 5 : 1 + IS_ENABLED(CONFIG_MITIGATION_SLS))
static unsigned long
create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
@@ -327,7 +321,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 };
unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE };
union ftrace_op_code_union op_ptr;
- int ret;
+ void *ret;
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
start_offset = (unsigned long)ftrace_regs_caller;
@@ -358,15 +352,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE);
/* Copy ftrace_caller onto the trampoline memory */
- ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size);
- if (WARN_ON(ret < 0))
+ ret = text_poke_copy(trampoline, (void *)start_offset, size);
+ if (WARN_ON(!ret))
goto fail;
ip = trampoline + size;
if (cpu_feature_enabled(X86_FEATURE_RETHUNK))
__text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE);
else
- memcpy(ip, retq, sizeof(retq));
+ text_poke_copy(ip, retq, sizeof(retq));
/* No need to test direct calls on created trampolines */
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
@@ -374,8 +368,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
ip = trampoline + (jmp_offset - start_offset);
if (WARN_ON(*(char *)ip != 0x75))
goto fail;
- ret = copy_from_kernel_nofault(ip, x86_nops[2], 2);
- if (ret < 0)
+ if (!text_poke_copy(ip, x86_nops[2], 2))
goto fail;
}
@@ -388,7 +381,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
*/
ptr = (unsigned long *)(trampoline + size + RET_SIZE);
- *ptr = (unsigned long)ops;
+ text_poke_copy(ptr, &ops, sizeof(unsigned long));
op_offset -= start_offset;
memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE);
@@ -404,7 +397,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
op_ptr.offset = offset;
/* put in the new offset to the ftrace_ops */
- memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+ text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
/* put in the call to the function */
mutex_lock(&text_mutex);
@@ -414,9 +407,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
* the depth accounting before the call already.
*/
dest = ftrace_ops_get_func(ops);
- memcpy(trampoline + call_offset,
- text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
- CALL_INSN_SIZE);
+ text_poke_copy_locked(trampoline + call_offset,
+ text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest),
+ CALL_INSN_SIZE, false);
mutex_unlock(&text_mutex);
/* ALLOC_TRAMP flags lets us know we created it */
@@ -614,16 +607,8 @@ int ftrace_disable_ftrace_graph_caller(void)
}
#endif /* CONFIG_DYNAMIC_FTRACE && !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
-/*
- * Hook the return address and push it in the stack of return addrs
- * in current thread info.
- */
-void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
- unsigned long frame_pointer)
+static inline bool skip_ftrace_return(void)
{
- unsigned long return_hooker = (unsigned long)&return_to_handler;
- int bit;
-
/*
* When resuming from suspend-to-ram, this function can be indirectly
* called from early CPU startup code while the CPU is in real mode,
@@ -633,33 +618,48 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
* This check isn't as accurate as virt_addr_valid(), but it should be
* good enough for this purpose, and it's fast.
*/
- if (unlikely((long)__builtin_frame_address(0) >= 0))
- return;
+ if ((long)__builtin_frame_address(0) >= 0)
+ return true;
- if (unlikely(ftrace_graph_is_dead()))
- return;
+ if (ftrace_graph_is_dead())
+ return true;
- if (unlikely(atomic_read(&current->tracing_graph_pause)))
- return;
+ if (atomic_read(&current->tracing_graph_pause))
+ return true;
+ return false;
+}
- bit = ftrace_test_recursion_trylock(ip, *parent);
- if (bit < 0)
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ */
+void prepare_ftrace_return(unsigned long ip, unsigned long *parent,
+ unsigned long frame_pointer)
+{
+ unsigned long return_hooker = (unsigned long)&return_to_handler;
+
+ if (unlikely(skip_ftrace_return()))
return;
if (!function_graph_enter(*parent, ip, frame_pointer, parent))
*parent = return_hooker;
-
- ftrace_test_recursion_unlock(bit);
}
#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct ftrace_regs *fregs)
{
- struct pt_regs *regs = &fregs->regs;
+ struct pt_regs *regs = &arch_ftrace_regs(fregs)->regs;
unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs);
+ unsigned long return_hooker = (unsigned long)&return_to_handler;
+ unsigned long *parent = (unsigned long *)stack;
- prepare_ftrace_return(ip, (unsigned long *)stack, 0);
+ if (unlikely(skip_ftrace_return()))
+ return;
+
+ if (!function_graph_enter_regs(*parent, ip, 0, parent, fregs))
+ *parent = return_hooker;
}
#endif
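
The switch from module_alloc()/memcpy() to execmem_alloc()/text_poke_copy() above reflects that trampoline memory may already be mapped read-only and executable, so all writes have to go through the poking machinery. A hedged userspace analogue of the underlying W^X discipline, using mmap()/mprotect() and an x86-64 stub ("mov eax, 42; ret"); this illustrates the idea on Linux/x86-64, it is not the kernel path:

/*
 * Hedged analogue: populate trampoline memory while it is writable, then
 * flip the mapping to read+execute before calling into it. The stub bytes
 * assume x86-64; the function-pointer cast is the usual POSIX idiom.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	static const unsigned char stub[] = {
		0xb8, 0x2a, 0x00, 0x00, 0x00,	/* mov eax, 42 */
		0xc3				/* ret */
	};
	void *tramp = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (tramp == MAP_FAILED)
		return 1;

	memcpy(tramp, stub, sizeof(stub));	/* write while writable */

	if (mprotect(tramp, 4096, PROT_READ | PROT_EXEC))	/* then flip to RX */
		return 1;

	int (*fn)(void) = (int (*)(void))tramp;
	printf("trampoline returned %d\n", fn());

	munmap(tramp, 4096);
	return 0;
}
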
diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S
index 58d9ed50fe61..f4e0c3361234 100644
--- a/arch/x86/kernel/ftrace_32.S
+++ b/arch/x86/kernel/ftrace_32.S
@@ -187,14 +187,15 @@ SYM_CODE_END(ftrace_graph_caller)
.globl return_to_handler
return_to_handler:
- pushl $0
- pushl %edx
- pushl %eax
+ subl $(PTREGS_SIZE), %esp
+ movl $0, PT_EBP(%esp)
+ movl %edx, PT_EDX(%esp)
+ movl %eax, PT_EAX(%esp)
movl %esp, %eax
call ftrace_return_to_handler
movl %eax, %ecx
- popl %eax
- popl %edx
- addl $4, %esp # skip ebp
+ movl PT_EAX(%esp), %eax
+ movl PT_EDX(%esp), %edx
+ addl $(PTREGS_SIZE), %esp
JMP_NOSPEC ecx
#endif
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 214f30e9f0c0..d51647228596 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -348,21 +348,22 @@ STACK_FRAME_NON_STANDARD_FP(__fentry__)
SYM_CODE_START(return_to_handler)
UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
- subq $24, %rsp
- /* Save the return values */
- movq %rax, (%rsp)
- movq %rdx, 8(%rsp)
- movq %rbp, 16(%rsp)
+ /* Save ftrace_regs for function exit context */
+ subq $(FRAME_SIZE), %rsp
+
+ movq %rax, RAX(%rsp)
+ movq %rdx, RDX(%rsp)
+ movq %rbp, RBP(%rsp)
movq %rsp, %rdi
call ftrace_return_to_handler
movq %rax, %rdi
- movq 8(%rsp), %rdx
- movq (%rsp), %rax
+ movq RDX(%rsp), %rdx
+ movq RAX(%rsp), %rax
- addq $24, %rsp
+ addq $(FRAME_SIZE), %rsp
/*
* Jump back to the old return address. This cannot be JMP_NOSPEC rdi
* since IBT would demand that contain ENDBR, which simply isn't so for
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 05a110c97111..22c9ba305ac1 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -22,6 +22,8 @@
#include <linux/cc_platform.h>
#include <linux/pgtable.h>
+#include <asm/asm.h>
+#include <asm/page_64.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/smp.h>
@@ -67,42 +69,11 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
EXPORT_SYMBOL(vmemmap_base);
#endif
-/*
- * GDT used on the boot CPU before switching to virtual addresses.
- */
-static struct desc_struct startup_gdt[GDT_ENTRIES] __initdata = {
- [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
- [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
-};
-
-/*
- * Address needs to be set at runtime because it references the startup_gdt
- * while the kernel still uses a direct mapping.
- */
-static struct desc_ptr startup_gdt_descr __initdata = {
- .size = sizeof(startup_gdt)-1,
- .address = 0,
-};
-
-static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
+static inline bool check_la57_support(void)
{
- return ptr - (void *)_text + (void *)physaddr;
-}
-
-static unsigned long __head *fixup_long(void *ptr, unsigned long physaddr)
-{
- return fixup_pointer(ptr, physaddr);
-}
-
-#ifdef CONFIG_X86_5LEVEL
-static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
-{
- return fixup_pointer(ptr, physaddr);
-}
+ if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+ return false;
-static bool __head check_la57_support(unsigned long physaddr)
-{
/*
* 5-level paging is detected and enabled at kernel decompression
* stage. Only check if it has been enabled there.
@@ -110,25 +81,21 @@ static bool __head check_la57_support(unsigned long physaddr)
if (!(native_read_cr4() & X86_CR4_LA57))
return false;
- *fixup_int(&__pgtable_l5_enabled, physaddr) = 1;
- *fixup_int(&pgdir_shift, physaddr) = 48;
- *fixup_int(&ptrs_per_p4d, physaddr) = 512;
- *fixup_long(&page_offset_base, physaddr) = __PAGE_OFFSET_BASE_L5;
- *fixup_long(&vmalloc_base, physaddr) = __VMALLOC_BASE_L5;
- *fixup_long(&vmemmap_base, physaddr) = __VMEMMAP_BASE_L5;
+ RIP_REL_REF(__pgtable_l5_enabled) = 1;
+ RIP_REL_REF(pgdir_shift) = 48;
+ RIP_REL_REF(ptrs_per_p4d) = 512;
+ RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5;
+ RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5;
+ RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5;
return true;
}
-#else
-static bool __head check_la57_support(unsigned long physaddr)
-{
- return false;
-}
-#endif
-static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
+static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
+ pmdval_t *pmd,
+ unsigned long p2v_offset)
{
- unsigned long vaddr, vaddr_end;
+ unsigned long paddr, paddr_end;
int i;
/* Encrypt the kernel and related (if SME is active) */
@@ -141,10 +108,10 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* attribute.
*/
if (sme_get_me_mask()) {
- vaddr = (unsigned long)__start_bss_decrypted;
- vaddr_end = (unsigned long)__end_bss_decrypted;
+ paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted);
+ paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted);
- for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ for (; paddr < paddr_end; paddr += PMD_SIZE) {
/*
* On SNP, transition the page to shared in the RMP table so that
* it is consistent with the page table attribute change.
@@ -153,11 +120,11 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* mapping (kernel .text). PVALIDATE, by way of
* early_snp_set_memory_shared(), requires a valid virtual
* address but the kernel is currently running off of the identity
- * mapping so use __pa() to get a *currently* valid virtual address.
+ * mapping so use the PA to get a *currently* valid virtual address.
*/
- early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD);
+ early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
- i = pmd_index(vaddr);
+ i = pmd_index(paddr - p2v_offset);
pmd[i] -= sme_get_me_mask();
}
}
@@ -173,23 +140,25 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* doesn't have to generate PC-relative relocations when accessing globals from
* that function. Clang actually does not generate them, which leads to
* boot-time crashes. To work around this problem, every global pointer must
- * be adjusted using fixup_pointer().
+ * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined
+ * by subtracting p2v_offset from the RIP-relative address.
*/
-unsigned long __head __startup_64(unsigned long physaddr,
+unsigned long __head __startup_64(unsigned long p2v_offset,
struct boot_params *bp)
{
- unsigned long load_delta, *p;
+ pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
+ unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text);
+ unsigned long va_text, va_end;
unsigned long pgtable_flags;
+ unsigned long load_delta;
pgdval_t *pgd;
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
- pteval_t *mask_ptr;
bool la57;
int i;
- unsigned int *next_pgt_ptr;
- la57 = check_la57_support(physaddr);
+ la57 = check_la57_support();
/* Is the address too large? */
if (physaddr >> MAX_PHYSMEM_BITS)
@@ -199,37 +168,36 @@ unsigned long __head __startup_64(unsigned long physaddr,
* Compute the delta between the address I am compiled to run at
* and the address I am actually running at.
*/
- load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+ load_delta = __START_KERNEL_map + p2v_offset;
+ RIP_REL_REF(phys_base) = load_delta;
/* Is the address not 2M aligned? */
if (load_delta & ~PMD_MASK)
for (;;);
+ va_text = physaddr - p2v_offset;
+ va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset;
+
/* Include the SME encryption mask in the fixup value */
load_delta += sme_get_me_mask();
/* Fixup the physical addresses in the page table */
- pgd = fixup_pointer(early_top_pgt, physaddr);
- p = pgd + pgd_index(__START_KERNEL_map);
- if (la57)
- *p = (unsigned long)level4_kernel_pgt;
- else
- *p = (unsigned long)level3_kernel_pgt;
- *p += _PAGE_TABLE_NOENC - __START_KERNEL_map + load_delta;
+ pgd = &RIP_REL_REF(early_top_pgt)->pgd;
+ pgd[pgd_index(__START_KERNEL_map)] += load_delta;
- if (la57) {
- p4d = fixup_pointer(level4_kernel_pgt, physaddr);
- p4d[511] += load_delta;
+ if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
+ p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
+ p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
+
+ pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE;
}
- pud = fixup_pointer(level3_kernel_pgt, physaddr);
- pud[510] += load_delta;
- pud[511] += load_delta;
+ RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta;
+ RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta;
- pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
- pmd[i] += load_delta;
+ RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta;
/*
* Set up the identity mapping for the switchover. These
@@ -238,15 +206,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
* it avoids problems around wraparound.
*/
- next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr);
- pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
- pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr);
+ pud = &early_pgts[0]->pmd;
+ pmd = &early_pgts[1]->pmd;
+ RIP_REL_REF(next_early_pgt) = 2;
pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask();
if (la57) {
- p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++],
- physaddr);
+ p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd;
i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
pgd[i + 0] = (pgdval_t)p4d + pgtable_flags;
@@ -267,12 +234,11 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
/* Filter out unsupported __PAGE_KERNEL_* bits: */
- mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
- pmd_entry &= *mask_ptr;
+ pmd_entry &= RIP_REL_REF(__supported_pte_mask);
pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;
- for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
+ for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
int idx = i + (physaddr >> PMD_SHIFT);
pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
@@ -294,14 +260,14 @@ unsigned long __head __startup_64(unsigned long physaddr,
* error, causing the BIOS to halt the system.
*/
- pmd = fixup_pointer(level2_kernel_pgt, physaddr);
+ pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd;
/* invalidate pages before the kernel image */
- for (i = 0; i < pmd_index((unsigned long)_text); i++)
+ for (i = 0; i < pmd_index(va_text); i++)
pmd[i] &= ~_PAGE_PRESENT;
/* fixup pages that are part of the kernel image */
- for (; i <= pmd_index((unsigned long)_end); i++)
+ for (; i <= pmd_index(va_end); i++)
if (pmd[i] & _PAGE_PRESENT)
pmd[i] += load_delta;
@@ -309,13 +275,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
for (; i < PTRS_PER_PMD; i++)
pmd[i] &= ~_PAGE_PRESENT;
- /*
- * Fixup phys_base - remove the memory encryption mask to obtain
- * the true physical address.
- */
- *fixup_long(&phys_base, physaddr) += load_delta - sme_get_me_mask();
-
- return sme_postprocess_startup(bp, pmd);
+ return sme_postprocess_startup(bp, pmd, p2v_offset);
}
/* Wipe all early page tables except for the kernel symbol map */
@@ -569,62 +529,53 @@ void __init __noreturn x86_64_start_reservations(char *real_mode_data)
*/
static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data;
-static struct desc_ptr bringup_idt_descr = {
- .size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1,
- .address = 0, /* Set at runtime */
-};
-
-static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler)
+/* This may run while still in the direct mapping */
+static void __head startup_64_load_idt(void *vc_handler)
{
-#ifdef CONFIG_AMD_MEM_ENCRYPT
+ struct desc_ptr desc = {
+ .address = (unsigned long)&RIP_REL_REF(bringup_idt_table),
+ .size = sizeof(bringup_idt_table) - 1,
+ };
struct idt_data data;
- gate_desc desc;
-
- init_idt_data(&data, n, handler);
- idt_init_desc(&desc, &data);
- native_write_idt_entry(idt, n, &desc);
-#endif
-}
+ gate_desc idt_desc;
-/* This runs while still in the direct mapping */
-static void __head startup_64_load_idt(unsigned long physbase)
-{
- struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase);
- gate_desc *idt = fixup_pointer(bringup_idt_table, physbase);
-
-
- if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
- void *handler;
-
- /* VMM Communication Exception */
- handler = fixup_pointer(vc_no_ghcb, physbase);
- set_bringup_idt_handler(idt, X86_TRAP_VC, handler);
+ /* @vc_handler is set only for a VMM Communication Exception */
+ if (vc_handler) {
+ init_idt_data(&data, X86_TRAP_VC, vc_handler);
+ idt_init_desc(&idt_desc, &data);
+ native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc);
}
- desc->address = (unsigned long)idt;
- native_load_idt(desc);
+ native_load_idt(&desc);
}
/* This is used when running on kernel addresses */
void early_setup_idt(void)
{
- /* VMM Communication Exception */
+ void *handler = NULL;
+
if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
setup_ghcb();
- set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb);
+ handler = vc_boot_ghcb;
}
- bringup_idt_descr.address = (unsigned long)bringup_idt_table;
- native_load_idt(&bringup_idt_descr);
+ startup_64_load_idt(handler);
}
/*
* Setup boot CPU state needed before kernel switches to virtual addresses.
*/
-void __head startup_64_setup_env(unsigned long physbase)
+void __head startup_64_setup_gdt_idt(void)
{
+ struct desc_struct *gdt = (void *)(__force unsigned long)init_per_cpu_var(gdt_page.gdt);
+ void *handler = NULL;
+
+ struct desc_ptr startup_gdt_descr = {
+ .address = (unsigned long)&RIP_REL_REF(*gdt),
+ .size = GDT_SIZE - 1,
+ };
+
/* Load GDT */
- startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase);
native_load_gdt(&startup_gdt_descr);
/* New GDT is live - reload data segment registers */
@@ -632,5 +583,8 @@ void __head startup_64_setup_env(unsigned long physbase)
"movl %%eax, %%ss\n"
"movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory");
- startup_64_load_idt(physbase);
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+ handler = &RIP_REL_REF(vc_no_ghcb);
+
+ startup_64_load_idt(handler);
}
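
The new p2v_offset parameter to __startup_64() above is just the physical-minus-virtual delta of the running image: a virtual address is recovered as pa - p2v_offset, and load_delta becomes __START_KERNEL_map + p2v_offset. A tiny sketch of that arithmetic with illustrative numbers, relying on unsigned wraparound; 0xffffffff80000000 is the usual x86-64 __START_KERNEL_map:

/*
 * Hedged sketch of the p2v_offset arithmetic in __startup_64(). The linked
 * VA and load PA below are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define START_KERNEL_MAP	0xffffffff80000000ULL

int main(void)
{
	uint64_t va_linked  = START_KERNEL_MAP + 0x1000000;	/* VA the kernel was linked at */
	uint64_t pa_loaded  = 0x1000000;			/* PA it actually runs at */

	uint64_t p2v_offset = pa_loaded - va_linked;		/* PA-to-VA delta (wraps) */
	uint64_t va_text    = pa_loaded - p2v_offset;		/* recovers the linked VA */
	uint64_t load_delta = START_KERNEL_MAP + p2v_offset;	/* added to the kernel PTEs */

	printf("va_text    = %#llx\n", (unsigned long long)va_text);
	printf("load_delta = %#llx\n", (unsigned long long)load_delta);
	return 0;
}
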
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 487ac57e2c81..2e42056d2306 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -44,9 +44,6 @@
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
-
-#define SIZEOF_PTREGS 17*4
-
/*
* Worst-case size of the kernel mapping we need to make:
* a relocatable kernel can live anywhere in lowmem, so we need to be able
@@ -414,7 +411,7 @@ __REFDATA
.align 4
SYM_DATA(initial_code, .long i386_start_kernel)
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
#define PGD_ALIGN (2 * PAGE_SIZE)
#define PTI_USER_PGD_FILL 1024
#else
@@ -474,7 +471,7 @@ SYM_DATA_START(initial_page_table)
# endif
.align PAGE_SIZE /* needs to be page-sized too */
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
* PTI needs another page so sync_initial_pagetable() works correctly
* and does not scribble over the data which is placed behind the
@@ -488,19 +485,13 @@ SYM_DATA_END(initial_page_table)
.data
.balign 4
-/*
- * The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
- * reliably detect the end of the stack.
- */
-SYM_DATA(initial_stack,
- .long init_thread_union + THREAD_SIZE -
- SIZEOF_PTREGS - TOP_OF_KERNEL_STACK_PADDING)
+SYM_DATA(initial_stack, .long __top_init_kernel_stack)
__INITRODATA
int_msg:
.asciz "Unknown interrupt or fault at: %p %p %p\n"
-#include "../../x86/xen/xen-head.S"
+#include "../xen/xen-head.S"
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0f8103240fda..31345e0ba006 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -26,20 +26,13 @@
#include <asm/apicdef.h>
#include <asm/fixmap.h>
#include <asm/smp.h>
+#include <asm/thread_info.h>
/*
* We are not able to switch in one step to the final KERNEL ADDRESS SPACE
* because we need identity-mapped pages.
*/
-#define l4_index(x) (((x) >> 39) & 511)
-#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
-L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
-L4_START_KERNEL = l4_index(__START_KERNEL_map)
-
-L3_START_KERNEL = pud_index(__START_KERNEL_map)
-
- .text
__HEAD
.code64
SYM_CODE_START_NOALIGN(startup_64)
@@ -66,9 +59,7 @@ SYM_CODE_START_NOALIGN(startup_64)
mov %rsi, %r15
/* Set up the stack for verify_cpu() */
- leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp
-
- leaq _text(%rip), %rdi
+ leaq __top_init_kernel_stack(%rip), %rsp
/* Setup GSBASE to allow stack canary access for C code */
movl $MSR_GS_BASE, %ecx
@@ -77,7 +68,7 @@ SYM_CODE_START_NOALIGN(startup_64)
shrq $32, %rdx
wrmsr
- call startup_64_setup_env
+ call startup_64_setup_gdt_idt
/* Now switch to __KERNEL_CS so IRET works reliably */
pushq $__KERNEL_CS
@@ -86,6 +77,7 @@ SYM_CODE_START_NOALIGN(startup_64)
lretq
.Lon_kernel_cs:
+ ANNOTATE_NOENDBR
UNWIND_HINT_END_OF_STACK
#ifdef CONFIG_AMD_MEM_ENCRYPT
@@ -103,20 +95,52 @@ SYM_CODE_START_NOALIGN(startup_64)
call verify_cpu
/*
+ * Derive the kernel's physical-to-virtual offset from the physical and
+ * virtual addresses of common_startup_64().
+ */
+ leaq common_startup_64(%rip), %rdi
+ subq .Lcommon_startup_64(%rip), %rdi
+
+ /*
* Perform pagetable fixups. Additionally, if SME is active, encrypt
* the kernel and retrieve the modifier (SME encryption mask if SME
* is active) to be added to the initial pgdir entry that will be
* programmed into CR3.
*/
- leaq _text(%rip), %rdi
movq %r15, %rsi
call __startup_64
/* Form the CR3 value being sure to include the CR3 modifier */
- addq $(early_top_pgt - __START_KERNEL_map), %rax
- jmp 1f
+ leaq early_top_pgt(%rip), %rcx
+ addq %rcx, %rax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ mov %rax, %rdi
+
+ /*
+ * For SEV guests: Verify that the C-bit is correct. A malicious
+ * hypervisor could lie about the C-bit position to perform a ROP
+ * attack on the guest by writing to the unencrypted stack and wait for
+ * the next RET instruction.
+ */
+ call sev_verify_cbit
+#endif
+
+ /*
+ * Switch to early_top_pgt which still has the identity mappings
+ * present.
+ */
+ movq %rax, %cr3
+
+ /* Branch to the common startup code at its kernel virtual address */
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *.Lcommon_startup_64(%rip)
SYM_CODE_END(startup_64)
+ __INITRODATA
+SYM_DATA_LOCAL(.Lcommon_startup_64, .quad common_startup_64)
+
+ .text
SYM_CODE_START(secondary_startup_64)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
@@ -149,22 +173,39 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Clear %R15 which holds the boot_params pointer on the boot CPU */
- xorq %r15, %r15
+ xorl %r15d, %r15d
+
+ /* Derive the runtime physical address of init_top_pgt[] */
+ movq phys_base(%rip), %rax
+ addq $(init_top_pgt - __START_KERNEL_map), %rax
/*
* Retrieve the modifier (SME encryption mask if SME is active) to be
* added to the initial pgdir entry that will be programmed into CR3.
*/
#ifdef CONFIG_AMD_MEM_ENCRYPT
- movq sme_me_mask, %rax
-#else
- xorq %rax, %rax
+ addq sme_me_mask(%rip), %rax
#endif
+ /*
+ * Switch to the init_top_pgt here, away from the trampoline_pgd and
+ * unmap the identity mapped ranges.
+ */
+ movq %rax, %cr3
- /* Form the CR3 value being sure to include the CR3 modifier */
- addq $(init_top_pgt - __START_KERNEL_map), %rax
-1:
+SYM_INNER_LABEL(common_startup_64, SYM_L_LOCAL)
+ UNWIND_HINT_END_OF_STACK
+ ANNOTATE_NOENDBR
+ /*
+ * Create a mask of CR4 bits to preserve. Omit PGE in order to flush
+ * global 1:1 translations from the TLBs.
+ *
+ * From the SDM:
+ * "If CR4.PGE is changing from 0 to 1, there were no global TLB
+ * entries before the execution; if CR4.PGE is changing from 1 to 0,
+ * there will be no global TLB entries after the execution."
+ */
+ movl $(X86_CR4_PAE | X86_CR4_LA57), %edx
#ifdef CONFIG_X86_MCE
/*
* Preserve CR4.MCE if the kernel will enable #MC support.
@@ -173,61 +214,20 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
* configured will crash the system regardless of the CR4.MCE value set
* here.
*/
- movq %cr4, %rcx
- andl $X86_CR4_MCE, %ecx
-#else
- movl $0, %ecx
+ orl $X86_CR4_MCE, %edx
#endif
+ movq %cr4, %rcx
+ andl %edx, %ecx
- /* Enable PAE mode, PSE, PGE and LA57 */
- orl $(X86_CR4_PAE | X86_CR4_PSE | X86_CR4_PGE), %ecx
-#ifdef CONFIG_X86_5LEVEL
- testl $1, __pgtable_l5_enabled(%rip)
- jz 1f
- orl $X86_CR4_LA57, %ecx
-1:
-#endif
+ /* Even if ignored in long mode, set PSE uniformly on all logical CPUs. */
+ btsl $X86_CR4_PSE_BIT, %ecx
movq %rcx, %cr4
- /* Setup early boot stage 4-/5-level pagetables. */
- addq phys_base(%rip), %rax
-
- /*
- * For SEV guests: Verify that the C-bit is correct. A malicious
- * hypervisor could lie about the C-bit position to perform a ROP
- * attack on the guest by writing to the unencrypted stack and wait for
- * the next RET instruction.
- */
- movq %rax, %rdi
- call sev_verify_cbit
-
- /*
- * Switch to new page-table
- *
- * For the boot CPU this switches to early_top_pgt which still has the
- * indentity mappings present. The secondary CPUs will switch to the
- * init_top_pgt here, away from the trampoline_pgd and unmap the
- * indentity mapped ranges.
- */
- movq %rax, %cr3
-
/*
- * Do a global TLB flush after the CR3 switch to make sure the TLB
- * entries from the identity mapping are flushed.
+ * Set CR4.PGE to re-enable global translations.
*/
- movq %cr4, %rcx
- movq %rcx, %rax
- xorq $X86_CR4_PGE, %rcx
+ btsl $X86_CR4_PGE_BIT, %ecx
movq %rcx, %cr4
- movq %rax, %cr4
-
- /* Ensure I am executing from virtual addresses */
- movq $1f, %rax
- ANNOTATE_RETPOLINE_SAFE
- jmp *%rax
-1:
- UNWIND_HINT_END_OF_STACK
- ANNOTATE_NOENDBR // above
#ifdef CONFIG_SMP
/*
@@ -284,7 +284,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
.Llookup_AP:
/* EAX contains the APIC ID of the current CPU */
- xorq %rcx, %rcx
+ xorl %ecx, %ecx
leaq cpuid_to_apicid(%rip), %rbx
.Lfind_cpunr:
@@ -415,39 +415,10 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
movq %r15, %rdi
.Ljump_to_C_code:
- /*
- * Jump to run C code and to be on a real kernel address.
- * Since we are running on identity-mapped space we have to jump
- * to the full 64bit address, this is only possible as indirect
- * jump. In addition we need to ensure %cs is set so we make this
- * a far return.
- *
- * Note: do not change to far jump indirect with 64bit offset.
- *
- * AMD does not support far jump indirect with 64bit offset.
- * AMD64 Architecture Programmer's Manual, Volume 3: states only
- * JMP FAR mem16:16 FF /5 Far jump indirect,
- * with the target specified by a far pointer in memory.
- * JMP FAR mem16:32 FF /5 Far jump indirect,
- * with the target specified by a far pointer in memory.
- *
- * Intel64 does support 64bit offset.
- * Software Developer Manual Vol 2: states:
- * FF /5 JMP m16:16 Jump far, absolute indirect,
- * address given in m16:16
- * FF /5 JMP m16:32 Jump far, absolute indirect,
- * address given in m16:32.
- * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
- * address given in m16:64.
- */
- pushq $.Lafter_lret # put return address on stack for unwinder
xorl %ebp, %ebp # clear frame pointer
- movq initial_code(%rip), %rax
- pushq $__KERNEL_CS # set correct cs
- pushq %rax # target address in negative space
- lretq
-.Lafter_lret:
- ANNOTATE_NOENDBR
+ ANNOTATE_RETPOLINE_SAFE
+ callq *initial_code(%rip)
+ ud2
SYM_CODE_END(secondary_startup_64)
#include "verify_cpu.S"
@@ -464,7 +435,7 @@ SYM_CODE_START(soft_restart_cpu)
UNWIND_HINT_END_OF_STACK
/* Find the idle task stack */
- movq PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
+ movq PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx
movq TASK_threadsp(%rcx), %rsp
jmp .Ljump_to_C_code
@@ -606,10 +577,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb)
SYM_CODE_END(vc_no_ghcb)
#endif
-#define SYM_DATA_START_PAGE_ALIGNED(name) \
- SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
-
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
* Each PGD needs to be 8k long and 8k aligned. We do not
* ever go out to userspace with these, so we do not
@@ -630,19 +598,12 @@ SYM_CODE_END(vc_no_ghcb)
#define PTI_USER_PGD_FILL 0
#endif
-/* Automate the creation of 1 to 1 mapping pmd entries */
-#define PMDS(START, PERM, COUNT) \
- i = 0 ; \
- .rept (COUNT) ; \
- .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
- i = i + 1 ; \
- .endr
-
__INITDATA
.balign 4
SYM_DATA_START_PTI_ALIGNED(early_top_pgt)
- .fill 512,8,0
+ .fill 511,8,0
+ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0
SYM_DATA_END(early_top_pgt)
@@ -736,8 +697,6 @@ SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt)
.endr
SYM_DATA_END(level1_fixmap_pgt)
-#undef PMDS
-
.data
.align 16
@@ -748,7 +707,7 @@ SYM_DATA(smpboot_control, .long 0)
SYM_DATA(phys_base, .quad 0x0)
EXPORT_SYMBOL(phys_base)
-#include "../../x86/xen/xen-head.S"
+#include "../xen/xen-head.S"
__PAGE_ALIGNED_BSS
SYM_DATA_START_PAGE_ALIGNED(empty_zero_page)
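
The CR4 handling in common_startup_64 above builds a preserve mask without PGE, writes CR4 once so the global 1:1 TLB entries are flushed, and only then sets PGE again. A hedged C rendering of that sequence on a plain variable, with write_cr4() standing in for the privileged move to CR4 and the usual x86 bit positions assumed (PSE=4, PAE=5, MCE=6, PGE=7, LA57=12):

/*
 * Hedged sketch of the CR4 sequence: mask, first write with PGE clear
 * (flushes global TLB entries per the SDM quote above), second write with
 * PGE set. write_cr4() is a stand-in for "movq %rcx, %cr4".
 */
#include <stdint.h>
#include <stdio.h>

#define CR4_PSE  (1u << 4)
#define CR4_PAE  (1u << 5)
#define CR4_MCE  (1u << 6)
#define CR4_PGE  (1u << 7)
#define CR4_LA57 (1u << 12)

static void write_cr4(uint64_t val)
{
	printf("cr4 <- %#llx\n", (unsigned long long)val);
}

int main(void)
{
	uint64_t cr4  = CR4_PAE | CR4_PGE | CR4_MCE;	/* pretend current CR4 */
	uint64_t keep = CR4_PAE | CR4_LA57 | CR4_MCE;	/* bits to preserve */

	cr4 &= keep;		/* PGE dropped: next write flushes global TLBs */
	cr4 |= CR4_PSE;		/* set PSE uniformly, as the comment above notes */
	write_cr4(cr4);

	cr4 |= CR4_PGE;		/* re-enable global translations */
	write_cr4(cr4);
	return 0;
}
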
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 41eecf180b7f..7f4b2966e15c 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -7,6 +7,7 @@
#include <linux/cpu.h>
#include <linux/irq.h>
+#include <asm/cpuid.h>
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
#include <asm/time.h>
@@ -516,22 +517,14 @@ static int hpet_msi_init(struct irq_domain *domain,
struct msi_domain_info *info, unsigned int virq,
irq_hw_number_t hwirq, msi_alloc_info_t *arg)
{
- irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL,
handle_edge_irq, arg->data, "edge");
return 0;
}
-static void hpet_msi_free(struct irq_domain *domain,
- struct msi_domain_info *info, unsigned int virq)
-{
- irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
-}
-
static struct msi_domain_ops hpet_msi_domain_ops = {
.msi_init = hpet_msi_init,
- .msi_free = hpet_msi_free,
};
static struct msi_domain_info hpet_msi_domain_info = {
@@ -568,7 +561,7 @@ static struct irq_domain *hpet_create_irq_domain(int hpet_id)
fwspec.param_count = 1;
fwspec.param[0] = hpet_id;
- parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY);
+ parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_GENERIC_MSI);
if (!parent) {
irq_domain_free_fwnode(fn);
kfree(domain_info);
@@ -707,7 +700,7 @@ static void __init hpet_select_clockevents(void)
hpet_base.nr_clockevents = 0;
- /* No point if MSI is disabled or CPU has an Always Runing APIC Timer */
+ /* No point if MSI is disabled or CPU has an Always Running APIC Timer */
if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT))
return;
@@ -927,10 +920,7 @@ static bool __init mwait_pc10_supported(void)
if (!cpu_feature_enabled(X86_FEATURE_MWAIT))
return false;
- if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
- return false;
-
- cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
+ cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &mwait_substates);
return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) &&
(ecx & CPUID5_ECX_INTERRUPT_BREAK) &&
@@ -965,7 +955,7 @@ static bool __init mwait_pc10_supported(void)
* and per CPU timer interrupts.
*
* The probability that this problem is going to be solved in the
- * forseeable future is close to zero, so the kernel has to be cluttered
+ * foreseeable future is close to zero, so the kernel has to be cluttered
* with heuristics to keep up with the ever growing amount of hardware and
* firmware trainwrecks. Hopefully some day hardware people will understand
* that the approach of "This can be fixed in software" is not sustainable.
@@ -1392,12 +1382,6 @@ int hpet_set_periodic_freq(unsigned long freq)
}
EXPORT_SYMBOL_GPL(hpet_set_periodic_freq);
-int hpet_rtc_dropped_irq(void)
-{
- return is_hpet_enabled();
-}
-EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
-
static void hpet_rtc_timer_reinit(void)
{
unsigned int delta;
@@ -1438,7 +1422,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
memset(&curr_time, 0, sizeof(struct rtc_time));
if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) {
- if (unlikely(mc146818_get_time(&curr_time) < 0)) {
+ if (unlikely(mc146818_get_time(&curr_time, 10) < 0)) {
pr_err_ratelimited("unable to read current time from RTC\n");
return IRQ_HANDLED;
}
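
The mwait_pc10_supported() change above drops the open-coded cpuid_level check and queries the MONITOR/MWAIT leaf directly via CPUID_LEAF_MWAIT. A hedged userspace sketch of the same query, assuming leaf 5 and the conventional ECX bit positions behind CPUID5_ECX_EXTENSIONS_SUPPORTED (bit 0) and CPUID5_ECX_INTERRUPT_BREAK (bit 1):

/*
 * Hedged sketch: read the MONITOR/MWAIT CPUID leaf and report the bits the
 * kernel checks above. Bit positions are assumed from the SDM naming.
 */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(5, &eax, &ebx, &ecx, &edx)) {
		puts("MWAIT leaf not reported");
		return 1;
	}

	int extensions = ecx & (1u << 0);	/* enumeration extensions */
	int intr_break = ecx & (1u << 1);	/* MWAIT wakes on masked interrupts */

	printf("MWAIT extensions: %s, interrupt-break: %s, substates: %#x\n",
	       extensions ? "yes" : "no", intr_break ? "yes" : "no", edx);
	return 0;
}
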
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2b7999a1a50a..80e262bb627f 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -8,6 +8,7 @@
#include <linux/timex.h>
#include <linux/i8253.h>
+#include <asm/hypervisor.h>
#include <asm/apic.h>
#include <asm/hpet.h>
#include <asm/time.h>
@@ -39,9 +40,15 @@ static bool __init use_pit(void)
bool __init pit_timer_init(void)
{
- if (!use_pit())
+ if (!use_pit()) {
+ /*
+ * Don't just ignore the PIT. Ensure it's stopped, because
+ * VMMs otherwise steal CPU time just to pointlessly waggle
+ * the (masked) IRQ.
+ */
+ clockevent_i8253_disable();
return false;
-
+ }
clockevent_i8253_init(true);
global_clock_event = &i8253_clockevent;
return true;
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 660b601f1d6c..f445bec516a0 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -153,7 +153,7 @@ static const __initconst struct idt_data apic_idts[] = {
#ifdef CONFIG_X86_LOCAL_APIC
INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt),
INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi),
-# ifdef CONFIG_HAVE_KVM
+# if IS_ENABLED(CONFIG_KVM)
INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi),
INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi),
INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi),
@@ -163,6 +163,9 @@ static const __initconst struct idt_data apic_idts[] = {
# endif
INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt),
INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt),
+# ifdef CONFIG_X86_POSTED_MSI
+ INTG(POSTED_MSI_NOTIFICATION_VECTOR, asm_sysvec_posted_msi_notification),
+# endif
#endif
};
@@ -337,7 +340,7 @@ void idt_invalidate(void)
load_idt(&idt);
}
-void __init alloc_intr_gate(unsigned int n, const void *addr)
+void __init idt_install_sysvec(unsigned int n, const void *function)
{
if (WARN_ON(n < FIRST_SYSTEM_VECTOR))
return;
@@ -346,5 +349,5 @@ void __init alloc_intr_gate(unsigned int n, const void *addr)
return;
if (!WARN_ON(test_and_set_bit(n, system_vectors)))
- set_intr_gate(n, addr);
+ set_intr_gate(n, function);
}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 11761c124545..feca4f20b06a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -22,9 +22,13 @@
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/thermal.h>
+#include <asm/posted_intr.h>
+#include <asm/irq_remapping.h>
+#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_THERMAL_VECTOR)
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
+#endif
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);
@@ -164,7 +168,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
seq_printf(p, "%*s: ", prec, "PIN");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
@@ -182,6 +186,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
irq_stats(j)->kvm_posted_intr_wakeup_ipis);
seq_puts(p, " Posted-interrupt wakeup event\n");
#endif
+#ifdef CONFIG_X86_POSTED_MSI
+ seq_printf(p, "%*s: ", prec, "PMN");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->posted_msi_notification_count);
+ seq_puts(p, " Posted MSI notification event\n");
+#endif
return 0;
}
@@ -240,24 +251,16 @@ static __always_inline void handle_irq(struct irq_desc *desc,
__handle_irq(desc, regs);
}
-/*
- * common_interrupt() handles all normal device IRQ's (the special SMP
- * cross-CPU interrupts have their own entry points).
- */
-DEFINE_IDTENTRY_IRQ(common_interrupt)
+static __always_inline int call_irq_handler(int vector, struct pt_regs *regs)
{
- struct pt_regs *old_regs = set_irq_regs(regs);
struct irq_desc *desc;
-
- /* entry code tells RCU that we're not quiescent. Check it. */
- RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
+ int ret = 0;
desc = __this_cpu_read(vector_irq[vector]);
if (likely(!IS_ERR_OR_NULL(desc))) {
handle_irq(desc, regs);
} else {
- apic_eoi();
-
+ ret = -EINVAL;
if (desc == VECTOR_UNUSED) {
pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
__func__, smp_processor_id(),
@@ -267,6 +270,23 @@ DEFINE_IDTENTRY_IRQ(common_interrupt)
}
}
+ return ret;
+}
+
+/*
+ * common_interrupt() handles all normal device IRQ's (the special SMP
+ * cross-CPU interrupts have their own entry points).
+ */
+DEFINE_IDTENTRY_IRQ(common_interrupt)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ /* entry code tells RCU that we're not quiescent. Check it. */
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
+
+ if (unlikely(call_irq_handler(vector, regs)))
+ apic_eoi();
+
set_irq_regs(old_regs);
}
@@ -290,7 +310,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi)
}
#endif
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
static void dummy_handler(void) {}
static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
@@ -334,12 +354,139 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi)
}
#endif
+#ifdef CONFIG_X86_POSTED_MSI
+
+/* Posted Interrupt Descriptors for coalesced MSIs to be posted */
+DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
+
+void intel_posted_msi_init(void)
+{
+ u32 destination;
+ u32 apic_id;
+
+ this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR);
+
+ /*
+ * The APIC destination ID is stored in bits 8:15 while in xAPIC mode.
+ * VT-d spec. CH 9.11
+ */
+ apic_id = this_cpu_read(x86_cpu_to_apicid);
+ destination = x2apic_enabled() ? apic_id : apic_id << 8;
+ this_cpu_write(posted_msi_pi_desc.ndst, destination);
+}
+
+/*
+ * De-multiplexing posted interrupts is on the performance path, the code
+ * below is written to optimize the cache performance based on the following
+ * considerations:
+ * 1. Posted interrupt descriptor (PID) fits in a cache line that is frequently
+ *    accessed by both the CPU and the IOMMU.
+ * 2. During posted MSI processing, the CPU needs to do a 64-bit read and xchg
+ *    for checking and clearing the posted interrupt request (PIR), a 256-bit
+ *    field within the PID.
+ * 3. On the other side, the IOMMU does atomic swaps of the entire PID cache
+ *    line when posting interrupts and setting control bits.
+ * 4. The CPU can access the cache line an order of magnitude faster than the
+ *    IOMMU.
+ * 5. Each time the IOMMU posts an interrupt to the PIR, it evicts the PID
+ *    cache line. The cache line states after each operation are as follows:
+ *        CPU               IOMMU                 PID cache line state
+ *    ------------------------------------------------------------------
+ *    ...read64                                   exclusive
+ *    ...lock xchg64                              modified
+ *    ...                   post/atomic swap      invalid
+ *    ------------------------------------------------------------------
+ *
+ * To reduce L1 data cache miss, it is important to avoid contention with
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
+ * to dispatch interrupt handlers.
+ *
+ * In addition, the code is trying to keep the cache line state consistent
+ * as much as possible. e.g. when making a copy and clearing the PIR
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
+ * read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ * read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
+{
+ int i, vec = FIRST_EXTERNAL_VECTOR;
+ unsigned long pir_copy[4];
+ bool handled = false;
+
+ for (i = 0; i < 4; i++)
+ pir_copy[i] = pir[i];
+
+ for (i = 0; i < 4; i++) {
+ if (!pir_copy[i])
+ continue;
+
+ pir_copy[i] = arch_xchg(&pir[i], 0);
+ handled = true;
+ }
+
+ if (handled) {
+ for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
+ call_irq_handler(vec, regs);
+ }
+
+ return handled;
+}
+
+/*
+ * Performance data shows that 3 is good enough to harvest 90+% of the benefit
+ * on high IRQ rate workload.
+ */
+#define MAX_POSTED_MSI_COALESCING_LOOP 3
+
+/*
+ * For MSIs that are delivered as posted interrupts, the CPU notifications
+ * can be coalesced if the MSIs arrive in high frequency bursts.
+ */
+DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ struct pi_desc *pid;
+ int i = 0;
+
+ pid = this_cpu_ptr(&posted_msi_pi_desc);
+
+ inc_irq_stat(posted_msi_notification_count);
+ irq_enter();
+
+ /*
+ * Max coalescing count includes the extra round of handle_pending_pir
+ * after clearing the outstanding notification bit. Hence, at most
+ * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here.
+ */
+ while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
+ if (!handle_pending_pir(pid->pir64, regs))
+ break;
+ }
+
+ /*
+ * Clear outstanding notification bit to allow new IRQ notifications,
+ * do this last to maximize the window of interrupt coalescing.
+ */
+ pi_clear_on(pid);
+
+ /*
+ * There could be a race of PI notification and the clearing of ON bit,
+ * process PIR bits one last time such that handling the new interrupts
+ * are not delayed until the next IRQ.
+ */
+ handle_pending_pir(pid->pir64, regs);
+
+ apic_eoi();
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+#endif /* X86_POSTED_MSI */
#ifdef CONFIG_HOTPLUG_CPU
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void)
{
- unsigned int irr, vector;
+ unsigned int vector;
struct irq_desc *desc;
struct irq_data *data;
struct irq_chip *chip;
@@ -366,8 +513,7 @@ void fixup_irqs(void)
if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
continue;
- irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
- if (irr & (1 << (vector % 32))) {
+ if (is_vector_pending(vector)) {
desc = __this_cpu_read(vector_irq[vector]);
raw_spin_lock(&desc->lock);
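
The handle_pending_pir() comment above argues for "read, read, read, read, xchg, xchg, xchg, xchg" rather than interleaving the two, to limit contention on the PID cache line. A hedged userspace sketch of that access pattern over a local 256-bit bitmap standing in for the PIR:

/*
 * Hedged sketch of the handle_pending_pir() access pattern: snapshot all
 * four 64-bit PIR words with plain reads first, then atomically exchange
 * only the non-zero ones to zero. The array is a local stand-in for the
 * hardware-shared descriptor.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t pir[4];		/* stand-in for the 256-bit PIR */

static int drain_pir(uint64_t copy[4])
{
	int handled = 0;

	for (int i = 0; i < 4; i++)		/* reads first ... */
		copy[i] = atomic_load(&pir[i]);

	for (int i = 0; i < 4; i++) {		/* ... then exchanges */
		if (!copy[i])
			continue;
		copy[i] = atomic_exchange(&pir[i], 0);
		handled = 1;
	}
	return handled;
}

int main(void)
{
	uint64_t copy[4];

	atomic_store(&pir[0], 1ull << 34);	/* pretend vector 34 is pending */
	if (drain_pir(copy))
		for (int i = 0; i < 256; i++)
			if (copy[i / 64] & (1ull << (i % 64)))
				printf("dispatch vector %d\n", i);
	return 0;
}
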
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index fe0c859873d1..ade0043ce56e 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,6 +18,7 @@
#include <linux/uaccess.h>
#include <linux/smp.h>
#include <linux/sched/task_stack.h>
+#include <linux/vmalloc.h>
#include <asm/cpu_entry_area.h>
#include <asm/softirq_stack.h>
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c683666876f1..f79c5edc0b89 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -28,6 +28,7 @@
#include <asm/setup.h>
#include <asm/i8259.h>
#include <asm/traps.h>
+#include <asm/fred.h>
#include <asm/prom.h>
/*
@@ -96,7 +97,11 @@ void __init native_init_IRQ(void)
/* Execute any quirks before the call gates are initialised: */
x86_init.irqs.pre_vector_init();
- idt_setup_apic_and_irq_gates();
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ fred_complete_exception_setup();
+ else
+ idt_setup_apic_and_irq_gates();
+
lapic_assign_system_vectors();
if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) {
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 9a7c03d47861..9cea1fc36c18 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
+#include <linux/debugfs.h>
#include <linux/mutex.h>
#include <linux/sysctl.h>
#include <linux/nodemask.h>
@@ -34,49 +35,38 @@ static bool __read_mostly sched_itmt_capable;
* of higher turbo frequency for cpus supporting Intel Turbo Boost Max
* Technology 3.0.
*
- * It can be set via /proc/sys/kernel/sched_itmt_enabled
+ * It can be set via /sys/kernel/debug/x86/sched_itmt_enabled
*/
-unsigned int __read_mostly sysctl_sched_itmt_enabled;
+bool __read_mostly sysctl_sched_itmt_enabled;
-static int sched_itmt_update_handler(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static ssize_t sched_itmt_enabled_write(struct file *filp,
+ const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- unsigned int old_sysctl;
- int ret;
+ ssize_t result;
+ bool orig;
- mutex_lock(&itmt_update_mutex);
+ guard(mutex)(&itmt_update_mutex);
- if (!sched_itmt_capable) {
- mutex_unlock(&itmt_update_mutex);
- return -EINVAL;
- }
-
- old_sysctl = sysctl_sched_itmt_enabled;
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ orig = sysctl_sched_itmt_enabled;
+ result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
- if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) {
+ if (sysctl_sched_itmt_enabled != orig) {
x86_topology_update = true;
rebuild_sched_domains();
}
- mutex_unlock(&itmt_update_mutex);
-
- return ret;
+ return result;
}
-static struct ctl_table itmt_kern_table[] = {
- {
- .procname = "sched_itmt_enabled",
- .data = &sysctl_sched_itmt_enabled,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_itmt_update_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
+static const struct file_operations dfs_sched_itmt_fops = {
+ .read = debugfs_read_file_bool,
+ .write = sched_itmt_enabled_write,
+ .open = simple_open,
+ .llseek = default_llseek,
};
-static struct ctl_table_header *itmt_sysctl_header;
+static struct dentry *dfs_sched_itmt;
/**
* sched_set_itmt_support() - Indicate platform supports ITMT
@@ -97,16 +87,18 @@ static struct ctl_table_header *itmt_sysctl_header;
*/
int sched_set_itmt_support(void)
{
- mutex_lock(&itmt_update_mutex);
+ guard(mutex)(&itmt_update_mutex);
- if (sched_itmt_capable) {
- mutex_unlock(&itmt_update_mutex);
+ if (sched_itmt_capable)
return 0;
- }
- itmt_sysctl_header = register_sysctl("kernel", itmt_kern_table);
- if (!itmt_sysctl_header) {
- mutex_unlock(&itmt_update_mutex);
+ dfs_sched_itmt = debugfs_create_file_unsafe("sched_itmt_enabled",
+ 0644,
+ arch_debugfs_dir,
+ &sysctl_sched_itmt_enabled,
+ &dfs_sched_itmt_fops);
+ if (IS_ERR_OR_NULL(dfs_sched_itmt)) {
+ dfs_sched_itmt = NULL;
return -ENOMEM;
}
@@ -117,8 +109,6 @@ int sched_set_itmt_support(void)
x86_topology_update = true;
rebuild_sched_domains();
- mutex_unlock(&itmt_update_mutex);
-
return 0;
}
@@ -134,18 +124,15 @@ int sched_set_itmt_support(void)
*/
void sched_clear_itmt_support(void)
{
- mutex_lock(&itmt_update_mutex);
+ guard(mutex)(&itmt_update_mutex);
- if (!sched_itmt_capable) {
- mutex_unlock(&itmt_update_mutex);
+ if (!sched_itmt_capable)
return;
- }
+
sched_itmt_capable = false;
- if (itmt_sysctl_header) {
- unregister_sysctl_table(itmt_sysctl_header);
- itmt_sysctl_header = NULL;
- }
+ debugfs_remove(dfs_sched_itmt);
+ dfs_sched_itmt = NULL;
if (sysctl_sched_itmt_enabled) {
/* disable sched_itmt if we are no longer ITMT capable */
@@ -153,8 +140,6 @@ void sched_clear_itmt_support(void)
x86_topology_update = true;
rebuild_sched_domains();
}
-
- mutex_unlock(&itmt_update_mutex);
}
int arch_asym_cpu_priority(int cpu)
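
The ITMT conversion above replaces a sysctl with a debugfs bool file built from debugfs_create_file_unsafe(), debugfs_read_file_bool() and a write handler that only reacts when the value changes. A hedged out-of-tree module sketch of the same pattern; the "itmt_demo" directory name and the pr_info() reaction are invented for the example:

/*
 * Hedged sketch of the debugfs bool-file pattern used by the ITMT code.
 * Builds as a trivial out-of-tree module; only illustrates the wiring.
 */
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/fs.h>

static bool demo_enabled;
static struct dentry *demo_dir;

static ssize_t demo_enabled_write(struct file *filp, const char __user *ubuf,
				  size_t cnt, loff_t *ppos)
{
	bool orig = demo_enabled;
	ssize_t ret = debugfs_write_file_bool(filp, ubuf, cnt, ppos);

	if (ret > 0 && demo_enabled != orig)	/* react only to real changes */
		pr_info("demo_enabled flipped to %d\n", demo_enabled);

	return ret;
}

static const struct file_operations demo_fops = {
	.read	= debugfs_read_file_bool,
	.write	= demo_enabled_write,
	.open	= simple_open,
	.llseek	= default_llseek,
};

static int __init demo_init(void)
{
	demo_dir = debugfs_create_dir("itmt_demo", NULL);
	debugfs_create_file_unsafe("enabled", 0644, demo_dir,
				   &demo_enabled, &demo_fops);
	return 0;
}

static void __exit demo_exit(void)
{
	debugfs_remove(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
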
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index 578d16fc040f..cd8ed1edbf9e 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -12,6 +12,7 @@
#include <linux/kernel.h>
#include <linux/reboot.h>
#include <linux/serial_8250.h>
+#include <linux/acpi.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/acpi.h>
@@ -89,7 +90,7 @@ static void __init jailhouse_x2apic_init(void)
#endif
}
-static void __init jailhouse_get_smp_config(unsigned int early)
+static void __init jailhouse_parse_smp_config(void)
{
struct ioapic_domain_cfg ioapic_cfg = {
.type = IOAPIC_DOMAIN_STRICT,
@@ -102,7 +103,7 @@ static void __init jailhouse_get_smp_config(unsigned int early)
register_lapic_address(0xfee00000);
for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++)
- generic_processor_info(setup_data.v1.cpu_ids[cpu]);
+ topology_register_apic(setup_data.v1.cpu_ids[cpu], CPU_ACPIID_INVALID, true);
smp_found_config = 1;
@@ -201,21 +202,23 @@ static void __init jailhouse_init_platform(void)
struct setup_data header;
void *mapping;
- x86_init.irqs.pre_vector_init = x86_init_noop;
- x86_init.timers.timer_init = jailhouse_timer_init;
- x86_init.mpparse.get_smp_config = jailhouse_get_smp_config;
- x86_init.pci.arch_init = jailhouse_pci_arch_init;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+ x86_init.timers.timer_init = jailhouse_timer_init;
+ x86_init.mpparse.find_mptable = x86_init_noop;
+ x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
+ x86_init.mpparse.parse_smp_cfg = jailhouse_parse_smp_config;
+ x86_init.pci.arch_init = jailhouse_pci_arch_init;
- x86_platform.calibrate_cpu = jailhouse_get_tsc;
- x86_platform.calibrate_tsc = jailhouse_get_tsc;
- x86_platform.get_wallclock = jailhouse_get_wallclock;
- x86_platform.legacy.rtc = 0;
- x86_platform.legacy.warm_reset = 0;
- x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
+ x86_platform.calibrate_cpu = jailhouse_get_tsc;
+ x86_platform.calibrate_tsc = jailhouse_get_tsc;
+ x86_platform.get_wallclock = jailhouse_get_wallclock;
+ x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.warm_reset = 0;
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
- legacy_pic = &null_legacy_pic;
+ legacy_pic = &null_legacy_pic;
- machine_ops.emergency_restart = jailhouse_no_restart;
+ machine_ops.emergency_restart = jailhouse_no_restart;
while (pa_data) {
mapping = early_memremap(pa_data, sizeof(header));
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index a61c12c01270..68530fad05f7 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -82,7 +82,7 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params,
cmdline_ptr[cmdline_len - 1] = '\0';
- pr_debug("Final command line is: %s\n", cmdline_ptr);
+ kexec_dprintk("Final command line is: %s\n", cmdline_ptr);
cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
cmdline_ext_32 = cmdline_ptr_phys >> 32;
@@ -263,16 +263,23 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
memset(&params->hd0_info, 0, sizeof(params->hd0_info));
memset(&params->hd1_info, 0, sizeof(params->hd1_info));
+#ifdef CONFIG_CRASH_DUMP
if (image->type == KEXEC_TYPE_CRASH) {
ret = crash_setup_memmap_entries(image, params);
if (ret)
return ret;
} else
+#endif
setup_e820_entries(params);
nr_e820_entries = params->e820_entries;
+ kexec_dprintk("E820 memmap:\n");
for (i = 0; i < nr_e820_entries; i++) {
+ kexec_dprintk("%016llx-%016llx (%d)\n",
+ params->e820_table[i].addr,
+ params->e820_table[i].addr + params->e820_table[i].size - 1,
+ params->e820_table[i].type);
if (params->e820_table[i].type != E820_TYPE_RAM)
continue;
start = params->e820_table[i].addr;
@@ -424,16 +431,18 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
* command line. Make sure it does not overflow
*/
if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
- pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+ pr_err("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
return ERR_PTR(-EINVAL);
}
+#ifdef CONFIG_CRASH_DUMP
/* Allocate and load backup region */
if (image->type == KEXEC_TYPE_CRASH) {
ret = crash_load_segments(image);
if (ret)
return ERR_PTR(ret);
}
+#endif
/*
* Load purgatory. For 64bit entry point, purgatory code can be
@@ -445,7 +454,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
return ERR_PTR(ret);
}
- pr_debug("Loaded purgatory at 0x%lx\n", pbuf.mem);
+ kexec_dprintk("Loaded purgatory at 0x%lx\n", pbuf.mem);
/*
@@ -490,23 +499,26 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
if (ret)
goto out_free_params;
bootparam_load_addr = kbuf.mem;
- pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- bootparam_load_addr, kbuf.bufsz, kbuf.bufsz);
+ kexec_dprintk("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ bootparam_load_addr, kbuf.bufsz, kbuf.memsz);
/* Load kernel */
kbuf.buffer = kernel + kern16_size;
kbuf.bufsz = kernel_len - kern16_size;
kbuf.memsz = PAGE_ALIGN(header->init_size);
kbuf.buf_align = header->kernel_alignment;
- kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ if (header->pref_address < MIN_KERNEL_LOAD_ADDR)
+ kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ else
+ kbuf.buf_min = header->pref_address;
kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
if (ret)
goto out_free_params;
kernel_load_addr = kbuf.mem;
- pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- kernel_load_addr, kbuf.bufsz, kbuf.memsz);
+ kexec_dprintk("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_load_addr, kbuf.bufsz, kbuf.memsz);
/* Load initrd high */
if (initrd) {
@@ -520,8 +532,8 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
goto out_free_params;
initrd_load_addr = kbuf.mem;
- pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- initrd_load_addr, initrd_len, initrd_len);
+ kexec_dprintk("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ initrd_load_addr, initrd_len, initrd_len);
setup_initrd(params, initrd_load_addr, initrd_len);
}
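The buf_min change above is just a lower clamp: honour the bzImage's pref_address, but never load below the architectural floor. A one-line restatement (demo_* name is hypothetical, MIN_KERNEL_LOAD_ADDR as used in the hunk):

#include <linux/minmax.h>

static unsigned long demo_kexec_buf_min(unsigned long pref_address)
{
        /* Same effect as the if/else in the hunk above. */
        return max(pref_address, (unsigned long)MIN_KERNEL_LOAD_ADDR);
}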
diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index c993521d4933..e772276f5aa9 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -78,7 +78,7 @@
#endif
/* Ensure if the instruction can be boostable */
-extern int can_boost(struct insn *insn, void *orig_addr);
+extern bool can_boost(struct insn *insn, void *orig_addr);
/* Recover instruction if given address is probed */
extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
unsigned long addr);
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index a0ce46c0a2d8..72e6a45e7ec2 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -40,12 +40,12 @@
#include <linux/kgdb.h>
#include <linux/ftrace.h>
#include <linux/kasan.h>
-#include <linux/moduleloader.h>
#include <linux/objtool.h>
#include <linux/vmalloc.h>
#include <linux/pgtable.h>
#include <linux/set_memory.h>
#include <linux/cfi.h>
+#include <linux/execmem.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -137,14 +137,14 @@ NOKPROBE_SYMBOL(synthesize_relcall);
* Returns non-zero if INSN is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
*/
-int can_boost(struct insn *insn, void *addr)
+bool can_boost(struct insn *insn, void *addr)
{
kprobe_opcode_t opcode;
insn_byte_t prefix;
int i;
if (search_exception_tables((unsigned long)addr))
- return 0; /* Page fault may occur on this address. */
+ return false; /* Page fault may occur on this address. */
/* 2nd-byte opcode */
if (insn->opcode.nbytes == 2)
@@ -152,7 +152,7 @@ int can_boost(struct insn *insn, void *addr)
(unsigned long *)twobyte_is_boostable);
if (insn->opcode.nbytes != 1)
- return 0;
+ return false;
for_each_insn_prefix(insn, i, prefix) {
insn_attr_t attr;
@@ -160,7 +160,7 @@ int can_boost(struct insn *insn, void *addr)
attr = inat_get_opcode_attribute(prefix);
/* Can't boost Address-size override prefix and CS override prefix */
if (prefix == 0x2e || inat_is_address_size_prefix(attr))
- return 0;
+ return false;
}
opcode = insn->opcode.bytes[0];
@@ -169,24 +169,35 @@ int can_boost(struct insn *insn, void *addr)
case 0x62: /* bound */
case 0x70 ... 0x7f: /* Conditional jumps */
case 0x9a: /* Call far */
- case 0xc0 ... 0xc1: /* Grp2 */
case 0xcc ... 0xce: /* software exceptions */
- case 0xd0 ... 0xd3: /* Grp2 */
case 0xd6: /* (UD) */
case 0xd8 ... 0xdf: /* ESC */
case 0xe0 ... 0xe3: /* LOOP*, JCXZ */
case 0xe8 ... 0xe9: /* near Call, JMP */
case 0xeb: /* Short JMP */
case 0xf0 ... 0xf4: /* LOCK/REP, HLT */
+ /* ... are not boostable */
+ return false;
+ case 0xc0 ... 0xc1: /* Grp2 */
+ case 0xd0 ... 0xd3: /* Grp2 */
+ /*
+ * AMD uses nnn == 110 as SHL/SAL, but Intel makes it reserved.
+ */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b110;
case 0xf6 ... 0xf7: /* Grp3 */
+ /* AMD uses nnn == 001 as TEST, but Intel makes it reserved. */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) != 0b001;
case 0xfe: /* Grp4 */
- /* ... are not boostable */
- return 0;
+ /* Only INC and DEC are boostable */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 ||
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001;
case 0xff: /* Grp5 */
- /* Only indirect jmp is boostable */
- return X86_MODRM_REG(insn->modrm.bytes[0]) == 4;
+ /* Only INC, DEC, and indirect JMP are boostable */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) == 0b000 ||
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0b001 ||
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0b100;
default:
- return 1;
+ return true;
}
}
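The new Grp2/Grp3/Grp4/Grp5 cases above all key off the ModRM reg field (the /digit in the opcode maps). A small illustration of that decoding using the X86_MODRM_REG() macro from <asm/insn.h>; demo_grp5_is_boostable() is a hypothetical helper, not part of the patch:

#include <asm/insn.h>

static bool demo_grp5_is_boostable(u8 modrm)
{
        u8 reg = X86_MODRM_REG(modrm);  /* bits 5:3 select the group member */

        /* 0xff /0 = INC, /1 = DEC, /4 = JMP near indirect */
        return reg == 0b000 || reg == 0b001 || reg == 0b100;
}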
@@ -252,21 +263,40 @@ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long add
return __recover_probed_insn(buf, addr);
}
-/* Check if paddr is at an instruction boundary */
-static int can_probe(unsigned long paddr)
+/* Check if insn is INT or UD */
+static inline bool is_exception_insn(struct insn *insn)
+{
+ /* UD uses 0f escape */
+ if (insn->opcode.bytes[0] == 0x0f) {
+ /* UD0 / UD1 / UD2 */
+ return insn->opcode.bytes[1] == 0xff ||
+ insn->opcode.bytes[1] == 0xb9 ||
+ insn->opcode.bytes[1] == 0x0b;
+ }
+
+ /* INT3 / INT n / INTO / INT1 */
+ return insn->opcode.bytes[0] == 0xcc ||
+ insn->opcode.bytes[0] == 0xcd ||
+ insn->opcode.bytes[0] == 0xce ||
+ insn->opcode.bytes[0] == 0xf1;
+}
+
+/*
+ * Check if paddr is at an instruction boundary and that instruction can
+ * be probed
+ */
+static bool can_probe(unsigned long paddr)
{
unsigned long addr, __addr, offset = 0;
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
- return 0;
+ return false;
/* Decode instructions */
addr = paddr - offset;
while (addr < paddr) {
- int ret;
-
/*
* Check if the instruction has been modified by another
* kprobe, in which case we replace the breakpoint by the
@@ -277,11 +307,10 @@ static int can_probe(unsigned long paddr)
*/
__addr = recover_probed_instruction(buf, addr);
if (!__addr)
- return 0;
+ return false;
- ret = insn_decode_kernel(&insn, (void *)__addr);
- if (ret < 0)
- return 0;
+ if (insn_decode_kernel(&insn, (void *)__addr) < 0)
+ return false;
#ifdef CONFIG_KGDB
/*
@@ -290,10 +319,26 @@ static int can_probe(unsigned long paddr)
*/
if (insn.opcode.bytes[0] == INT3_INSN_OPCODE &&
kgdb_has_hit_break(addr))
- return 0;
+ return false;
#endif
addr += insn.length;
}
+
+ /* Check if paddr is at an instruction boundary */
+ if (addr != paddr)
+ return false;
+
+ __addr = recover_probed_instruction(buf, addr);
+ if (!__addr)
+ return false;
+
+ if (insn_decode_kernel(&insn, (void *)__addr) < 0)
+ return false;
+
+ /* INT and UD are special and should not be kprobed */
+ if (is_exception_insn(&insn))
+ return false;
+
if (IS_ENABLED(CONFIG_CFI_CLANG)) {
/*
* The compiler generates the following instruction sequence
@@ -308,13 +353,6 @@ static int can_probe(unsigned long paddr)
* Also, these movl and addl are used for showing expected
* type. So those must not be touched.
*/
- __addr = recover_probed_instruction(buf, addr);
- if (!__addr)
- return 0;
-
- if (insn_decode_kernel(&insn, (void *)__addr) < 0)
- return 0;
-
if (insn.opcode.value == 0xBA)
offset = 12;
else if (insn.opcode.value == 0x3)
@@ -324,18 +362,27 @@ static int can_probe(unsigned long paddr)
/* This movl/addl is used for decoding CFI. */
if (is_cfi_trap(addr + offset))
- return 0;
+ return false;
}
out:
- return (addr == paddr);
+ return true;
}
/* If x86 supports IBT (ENDBR) it must be skipped. */
kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
bool *on_func_entry)
{
- if (is_endbr(*(u32 *)addr)) {
+ u32 insn;
+
+ /*
+ * Since 'addr' is not guaranteed to be safe to access, use
+ * copy_from_kernel_nofault() to read the instruction:
+ */
+ if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32)))
+ return NULL;
+
+ if (is_endbr(insn)) {
*on_func_entry = !offset || offset == 4;
if (*on_func_entry)
offset = 4;
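The hunk above reads the would-be ENDBR through copy_from_kernel_nofault() because 'addr' may not be safely accessible. The same pattern in isolation (demo_peek_text() is hypothetical); the helper returns 0 on success and -EFAULT if the address cannot be read:

#include <linux/uaccess.h>

static bool demo_peek_text(unsigned long addr, u32 *val)
{
        return !copy_from_kernel_nofault(val, (void *)addr, sizeof(*val));
}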
@@ -448,7 +495,7 @@ void *alloc_insn_page(void)
{
void *page;
- page = module_alloc(PAGE_SIZE);
+ page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
if (!page)
return NULL;
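alloc_insn_page() now goes through the generic executable-memory allocator instead of module_alloc(). A minimal sketch of that API as used above (<linux/execmem.h>); the demo_* wrappers are hypothetical:

#include <linux/execmem.h>
#include <linux/mm.h>

static void *demo_alloc_insn_page(void)
{
        /* One page from the range reserved for kprobe instruction slots. */
        return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
}

static void demo_free_insn_page(void *page)
{
        execmem_free(page);
}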
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index dd2ec14adb77..2be55ec3f392 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -9,6 +9,7 @@
#include <linux/hardirq.h>
#include <linux/preempt.h>
#include <linux/ftrace.h>
+#include <asm/text-patching.h>
#include "common.h"
@@ -21,6 +22,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct kprobe_ctlblk *kcb;
int bit;
+ if (unlikely(kprobe_ftrace_disabled))
+ return;
+
bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (bit < 0)
return;
@@ -33,23 +37,25 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
if (kprobe_running()) {
kprobes_inc_nmissed_count(p);
} else {
- unsigned long orig_ip = regs->ip;
+ unsigned long orig_ip = instruction_pointer(regs);
+
/* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
- regs->ip = ip + sizeof(kprobe_opcode_t);
+ instruction_pointer_set(regs, ip + INT3_INSN_SIZE);
__this_cpu_write(current_kprobe, p);
kcb->kprobe_status = KPROBE_HIT_ACTIVE;
if (!p->pre_handler || !p->pre_handler(p, regs)) {
- /*
- * Emulate singlestep (and also recover regs->ip)
- * as if there is a 5byte nop
- */
- regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
if (unlikely(p->post_handler)) {
+ /*
+ * Emulate singlestep (and also recover regs->ip)
+ * as if there is a 5byte nop
+ */
+ instruction_pointer_set(regs, ip + MCOUNT_INSN_SIZE);
kcb->kprobe_status = KPROBE_HIT_SSDONE;
p->post_handler(p, regs, 0);
}
- regs->ip = orig_ip;
+ /* Recover IP address */
+ instruction_pointer_set(regs, orig_ip);
}
/*
* If pre_handler returns !0, it changes regs->ip. We have to
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 517821b48391..36d6809c6c9e 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -324,7 +324,7 @@ static int can_optimize(unsigned long paddr)
* However, the kernel built with retpolines or IBT has jump
* tables disabled so the check can be skipped altogether.
*/
- if (!IS_ENABLED(CONFIG_RETPOLINE) &&
+ if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) &&
!IS_ENABLED(CONFIG_X86_KERNEL_IBT) &&
insn_is_indirect_jump(&insn))
return 0;
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index 257892fcefa7..b68d4be9464e 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -28,19 +28,19 @@ static ssize_t version_show(struct kobject *kobj,
static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
memcpy(buf, (void *)&boot_params + off, count);
return count;
}
-static struct bin_attribute boot_params_data_attr = {
+static const struct bin_attribute boot_params_data_attr = {
.attr = {
.name = "data",
.mode = S_IRUGO,
},
- .read = boot_params_data_read,
+ .read_new = boot_params_data_read,
.size = sizeof(boot_params),
};
@@ -49,14 +49,14 @@ static struct attribute *boot_params_version_attrs[] = {
NULL,
};
-static struct bin_attribute *boot_params_data_attrs[] = {
+static const struct bin_attribute *const boot_params_data_attrs[] = {
&boot_params_data_attr,
NULL,
};
static const struct attribute_group boot_params_attr_group = {
.attrs = boot_params_version_attrs,
- .bin_attrs = boot_params_data_attrs,
+ .bin_attrs_new = boot_params_data_attrs,
};
static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
@@ -172,7 +172,7 @@ static ssize_t type_show(struct kobject *kobj,
static ssize_t setup_data_data_read(struct file *fp,
struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buf,
loff_t off, size_t count)
{
@@ -250,7 +250,7 @@ static struct bin_attribute data_attr __ro_after_init = {
.name = "data",
.mode = S_IRUGO,
},
- .read = setup_data_data_read,
+ .read_new = setup_data_data_read,
};
static struct attribute *setup_data_type_attrs[] = {
@@ -258,14 +258,14 @@ static struct attribute *setup_data_type_attrs[] = {
NULL,
};
-static struct bin_attribute *setup_data_data_attrs[] = {
+static const struct bin_attribute *const setup_data_data_attrs[] = {
&data_attr,
NULL,
};
static const struct attribute_group setup_data_attr_group = {
.attrs = setup_data_type_attrs,
- .bin_attrs = setup_data_data_attrs,
+ .bin_attrs_new = setup_data_data_attrs,
};
static int __init create_setup_data_node(struct kobject *parent,
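The ksysfs hunks above move to const struct bin_attribute and the *_new callback and group fields. A compact sketch of that style with hypothetical demo_* names; as in boot_params_data_read() above, the sysfs core is trusted to keep off/count within .size:

#include <linux/string.h>
#include <linux/sysfs.h>

static char demo_blob[64];

static ssize_t demo_data_read(struct file *fp, struct kobject *kobj,
                              const struct bin_attribute *attr,
                              char *buf, loff_t off, size_t count)
{
        memcpy(buf, demo_blob + off, count);
        return count;
}

static const struct bin_attribute demo_data_attr = {
        .attr           = { .name = "data", .mode = 0444 },
        .size           = sizeof(demo_blob),
        .read_new       = demo_data_read,
};

static const struct bin_attribute *const demo_bin_attrs[] = {
        &demo_data_attr,
        NULL,
};

static const struct attribute_group demo_attr_group = {
        .bin_attrs_new  = demo_bin_attrs,
};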
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 0ddb3bd0f1aa..7a422a6c5983 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -37,6 +37,7 @@
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
+#include <asm/mtrr.h>
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>
#include <asm/ptrace.h>
@@ -44,7 +45,7 @@
#include <asm/svm.h>
#include <asm/e820/api.h>
-DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
+DEFINE_STATIC_KEY_FALSE_RO(kvm_async_pf_enabled);
static int kvmapf = 1;
@@ -65,6 +66,7 @@ static int __init parse_no_stealacc(char *arg)
early_param("no-steal-acc", parse_no_stealacc);
+static DEFINE_PER_CPU_READ_MOSTLY(bool, async_pf_enabled);
static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
static int has_steal_clock = 0;
@@ -244,7 +246,7 @@ noinstr u32 kvm_read_and_reset_apf_flags(void)
{
u32 flags = 0;
- if (__this_cpu_read(apf_reason.enabled)) {
+ if (__this_cpu_read(async_pf_enabled)) {
flags = __this_cpu_read(apf_reason.flags);
__this_cpu_write(apf_reason.flags, 0);
}
@@ -295,7 +297,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
inc_irq_stat(irq_hv_callback_count);
- if (__this_cpu_read(apf_reason.enabled)) {
+ if (__this_cpu_read(async_pf_enabled)) {
token = __this_cpu_read(apf_reason.token);
kvm_async_pf_task_wake(token);
__this_cpu_write(apf_reason.token, 0);
@@ -362,7 +364,7 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
- __this_cpu_write(apf_reason.enabled, 1);
+ __this_cpu_write(async_pf_enabled, true);
pr_debug("setup async PF for cpu %d\n", smp_processor_id());
}
@@ -383,11 +385,11 @@ static void kvm_guest_cpu_init(void)
static void kvm_pv_disable_apf(void)
{
- if (!__this_cpu_read(apf_reason.enabled))
+ if (!__this_cpu_read(async_pf_enabled))
return;
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
- __this_cpu_write(apf_reason.enabled, 0);
+ __this_cpu_write(async_pf_enabled, false);
pr_debug("disable async PF for cpu %d\n", smp_processor_id());
}
@@ -434,7 +436,8 @@ static void __init sev_map_percpu_data(void)
{
int cpu;
- if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
+ if (cc_vendor != CC_VENDOR_AMD ||
+ !cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
for_each_possible_cpu(cpu) {
@@ -769,7 +772,7 @@ static struct notifier_block kvm_pv_reboot_nb = {
* won't be valid. In cases like kexec, in which you install a new kernel, this
* means a random memory location will be kept being written.
*/
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
static void kvm_crash_shutdown(struct pt_regs *regs)
{
kvm_guest_cpu_offline(true);
@@ -803,8 +806,8 @@ extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax)\n\t" \
"setne %al\n\t"
-DEFINE_PARAVIRT_ASM(__raw_callee_save___kvm_vcpu_is_preempted,
- PV_VCPU_PREEMPTED_ASM, .text);
+DEFINE_ASM_FUNC(__raw_callee_save___kvm_vcpu_is_preempted,
+ PV_VCPU_PREEMPTED_ASM, .text);
#endif
static void __init kvm_guest_init(void)
@@ -829,7 +832,7 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
static_branch_enable(&kvm_async_pf_enabled);
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt);
+ sysvec_install(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt);
}
#ifdef CONFIG_SMP
@@ -852,7 +855,7 @@ static void __init kvm_guest_init(void)
kvm_guest_cpu_init();
#endif
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
@@ -942,7 +945,7 @@ static void __init kvm_init_platform(void)
* Reset the host's shared pages list related to kernel
* specific page encryption status settings before we load a
* new kernel by kexec. Reset the page encryption status
- * during early boot intead of just before kexec to avoid SMP
+ * during early boot instead of just before kexec to avoid SMP
* races during kvm_pv_guest_cpu_reboot().
* NOTE: We cannot reset the complete shared pages list
* here as we need to retain the UEFI/OVMF firmware
@@ -978,6 +981,9 @@ static void __init kvm_init_platform(void)
}
kvmclock_init();
x86_platform.apic_post_init = kvm_apic_init;
+
+ /* Set WB as the default cache mode for SEV-SNP and TDX */
+ guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
}
#if defined(CONFIG_AMD_MEM_ENCRYPT)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index fb8f52149be9..5b2c15214a6b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -24,8 +24,8 @@
static int kvmclock __initdata = 1;
static int kvmclock_vsyscall __initdata = 1;
-static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
-static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
+static int msr_kvm_system_time __ro_after_init;
+static int msr_kvm_wall_clock __ro_after_init;
static u64 kvm_sched_clock_offset __ro_after_init;
static int __init parse_no_kvmclock(char *arg)
@@ -42,7 +42,7 @@ static int __init parse_no_kvmclock_vsyscall(char *arg)
}
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
-/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */
+/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */
#define HVC_BOOT_ARRAY_SIZE \
(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
@@ -154,15 +154,15 @@ static int kvm_cs_enable(struct clocksource *cs)
return 0;
}
-struct clocksource kvm_clock = {
+static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_get_cycles,
.rating = 400,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
+ .id = CSID_X86_KVM_CLK,
.enable = kvm_cs_enable,
};
-EXPORT_SYMBOL_GPL(kvm_clock);
static void kvm_register_clock(char *txt)
{
@@ -195,7 +195,8 @@ static void kvm_setup_secondary_clock(void)
void kvmclock_disable(void)
{
- native_write_msr(msr_kvm_system_time, 0, 0);
+ if (msr_kvm_system_time)
+ native_write_msr(msr_kvm_system_time, 0, 0);
}
static void __init kvmclock_init_mem(void)
@@ -294,7 +295,10 @@ void __init kvmclock_init(void)
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
- } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+ } else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+ msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+ msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+ } else {
return;
}
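The kvmclock change above leaves the MSR numbers at 0 until a clocksource feature is confirmed, so kvmclock_disable() becomes a no-op when kvmclock never registered. The selection logic restated as a standalone sketch (demo_* name hypothetical, features and MSRs as in the hunk):

#include <asm/kvm_para.h>

static bool demo_pick_kvmclock_msrs(int *sys_msr, int *wall_msr)
{
        if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
                *sys_msr  = MSR_KVM_SYSTEM_TIME_NEW;
                *wall_msr = MSR_KVM_WALL_CLOCK_NEW;
        } else if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
                *sys_msr  = MSR_KVM_SYSTEM_TIME;
                *wall_msr = MSR_KVM_WALL_CLOCK;
        } else {
                return false;   /* leave the MSRs at 0: kvmclock stays off */
        }
        return true;
}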
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index adc67f98819a..0f19ef355f5f 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -7,7 +7,7 @@
* This handles calls from both 32bit and 64bit mode.
*
* Lock order:
- * contex.ldt_usr_sem
+ * context.ldt_usr_sem
* mmap_lock
* context.lock
*/
@@ -49,7 +49,7 @@ void load_mm_ldt(struct mm_struct *mm)
/*
* Any change to mm->context.ldt is followed by an IPI to all
* CPUs with the mm active. The LDT will not be freed until
- * after the IPI is handled by all such CPUs. This means that,
+ * after the IPI is handled by all such CPUs. This means that
* if the ldt_struct changes before we return, the values we see
* will be safe, and the new values will be loaded before we run
* any user code.
@@ -184,7 +184,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
return new_ldt;
}
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static void do_sanity_check(struct mm_struct *mm,
bool had_kernel_mapping,
@@ -377,7 +377,7 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false);
}
-#else /* !CONFIG_PAGE_TABLE_ISOLATION */
+#else /* !CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
@@ -388,11 +388,11 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt)
{
}
-#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+#endif /* CONFIG_MITIGATION_PAGE_TABLE_ISOLATION */
static void free_ldt_pgtables(struct mm_struct *mm)
{
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
struct mmu_gather tlb;
unsigned long start = LDT_BASE_ADDR;
unsigned long end = LDT_END_ADDR;
@@ -685,7 +685,7 @@ SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
}
/*
* The SYSCALL_DEFINE() macros give us an 'unsigned long'
- * return type, but tht ABI for sys_modify_ldt() expects
+ * return type, but the ABI for sys_modify_ldt() expects
* 'int'. This cast gives us an int-sized value in %rax
* for the return code. The 'unsigned' is necessary so
* the compiler does not try to sign-extend the negative
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 1b373d79cedc..80265162aeff 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -160,15 +160,10 @@ void machine_kexec_cleanup(struct kimage *image)
*/
void machine_kexec(struct kimage *image)
{
+ relocate_kernel_fn *relocate_kernel_ptr;
unsigned long page_list[PAGES_NR];
void *control_page;
int save_ftrace_enabled;
- asmlinkage unsigned long
- (*relocate_kernel_ptr)(unsigned long indirection_page,
- unsigned long control_page,
- unsigned long start_address,
- unsigned int has_pae,
- unsigned int preserve_context);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 1a3e2c05a8a5..a68f5a0a9f37 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -28,6 +28,7 @@
#include <asm/setup.h>
#include <asm/set_memory.h>
#include <asm/cpu.h>
+#include <asm/efi.h>
#ifdef CONFIG_ACPI
/*
@@ -42,12 +43,9 @@ struct init_pgtable_data {
static int mem_region_callback(struct resource *res, void *arg)
{
struct init_pgtable_data *data = arg;
- unsigned long mstart, mend;
-
- mstart = res->start;
- mend = mstart + resource_size(res) - 1;
- return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend);
+ return kernel_ident_mapping_init(data->info, data->level4p,
+ res->start, res->end + 1);
}
static int
@@ -90,6 +88,8 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
{
#ifdef CONFIG_EFI
unsigned long mstart, mend;
+ void *kaddr;
+ int ret;
if (!efi_enabled(EFI_BOOT))
return 0;
@@ -105,6 +105,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
if (!mstart)
return 0;
+ ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
+ if (ret)
+ return ret;
+
+ kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
+ if (!kaddr) {
+ pr_err("Could not map UEFI system table\n");
+ return -ENOMEM;
+ }
+
+ mstart = efi_config_table;
+
+ if (efi_enabled(EFI_64BIT)) {
+ efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr;
+
+ mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables;
+ } else {
+ efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr;
+
+ mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables;
+ }
+
+ memunmap(kaddr);
+
return kernel_ident_mapping_init(info, level4p, mstart, mend);
#endif
return 0;
@@ -122,7 +146,8 @@ static void free_transition_pgtable(struct kimage *image)
image->arch.pte = NULL;
}
-static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
+static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
+ unsigned long control_page)
{
pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
unsigned long vaddr, paddr;
@@ -132,8 +157,13 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
pmd_t *pmd;
pte_t *pte;
- vaddr = (unsigned long)relocate_kernel;
- paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
+ /*
+ * For the transition to the identity mapped page tables, the control
+ * code page also needs to be mapped at the virtual address it starts
+ * off running from.
+ */
+ vaddr = (unsigned long)__va(control_page);
+ paddr = control_page;
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
@@ -192,7 +222,7 @@ static void *alloc_pgt_page(void *data)
return p;
}
-static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+static int init_pgtable(struct kimage *image, unsigned long control_page)
{
struct x86_mapping_info info = {
.alloc_pgt_page = alloc_pgt_page,
@@ -201,12 +231,12 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
.kernpg_flag = _KERNPG_TABLE_NOENC,
};
unsigned long mstart, mend;
- pgd_t *level4p;
int result;
int i;
- level4p = (pgd_t *)__va(start_pgtable);
- clear_page(level4p);
+ image->arch.pgd = alloc_pgt_page(image);
+ if (!image->arch.pgd)
+ return -ENOMEM;
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
info.page_flag |= _PAGE_ENC;
@@ -220,8 +250,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
mstart = pfn_mapped[i].start << PAGE_SHIFT;
mend = pfn_mapped[i].end << PAGE_SHIFT;
- result = kernel_ident_mapping_init(&info,
- level4p, mstart, mend);
+ result = kernel_ident_mapping_init(&info, image->arch.pgd,
+ mstart, mend);
if (result)
return result;
}
@@ -236,8 +266,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
- result = kernel_ident_mapping_init(&info,
- level4p, mstart, mend);
+ result = kernel_ident_mapping_init(&info, image->arch.pgd,
+ mstart, mend);
if (result)
return result;
@@ -247,15 +277,19 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
* Prepare EFI systab and ACPI tables for kexec kernel since they are
* not covered by pfn_mapped.
*/
- result = map_efi_systab(&info, level4p);
+ result = map_efi_systab(&info, image->arch.pgd);
if (result)
return result;
- result = map_acpi_tables(&info, level4p);
+ result = map_acpi_tables(&info, image->arch.pgd);
if (result)
return result;
- return init_transition_pgtable(image, level4p);
+ /*
+ * This must be last because the intermediate page table pages it
+ * allocates will not be control pages and may overlap the image.
+ */
+ return init_transition_pgtable(image, image->arch.pgd, control_page);
}
static void load_segments(void)
@@ -272,22 +306,35 @@ static void load_segments(void)
int machine_kexec_prepare(struct kimage *image)
{
- unsigned long start_pgtable;
+ void *control_page = page_address(image->control_code_page);
+ unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
+ unsigned long reloc_end = (unsigned long)__relocate_kernel_end;
int result;
- /* Calculate the offsets */
- start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
-
/* Setup the identity mapped 64bit page table */
- result = init_pgtable(image, start_pgtable);
+ result = init_pgtable(image, __pa(control_page));
if (result)
return result;
+ kexec_va_control_page = (unsigned long)control_page;
+ kexec_pa_table_page = (unsigned long)__pa(image->arch.pgd);
+
+ if (image->type == KEXEC_TYPE_DEFAULT)
+ kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT;
+
+ __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start);
+
+ set_memory_rox((unsigned long)control_page, 1);
return 0;
}
void machine_kexec_cleanup(struct kimage *image)
{
+ void *control_page = page_address(image->control_code_page);
+
+ set_memory_nx((unsigned long)control_page, 1);
+ set_memory_rw((unsigned long)control_page, 1);
+
free_transition_pgtable(image);
}
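machine_kexec_prepare() now copies the whole __relocate_kernel_start..__relocate_kernel_end blob into the control page, so the later call site has to locate relocate_kernel inside that copy by its offset within the blob. A hedged restatement of that pointer math (demo_* name hypothetical, symbols as used above):

static relocate_kernel_fn *demo_relocated_entry(void *control_page)
{
        unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
        unsigned long entry_off   = (unsigned long)relocate_kernel - reloc_start;

        /* Entry point of relocate_kernel within the copied control page. */
        return (relocate_kernel_fn *)(control_page + entry_off);
}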
@@ -295,11 +342,19 @@ void machine_kexec_cleanup(struct kimage *image)
* Do not allocate memory (or fail in any way) in machine_kexec().
* We are past the point of no return, committed to rebooting now.
*/
-void machine_kexec(struct kimage *image)
+void __nocfi machine_kexec(struct kimage *image)
{
- unsigned long page_list[PAGES_NR];
- void *control_page;
+ unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
+ relocate_kernel_fn *relocate_kernel_ptr;
+ unsigned int host_mem_enc_active;
int save_ftrace_enabled;
+ void *control_page;
+
+ /*
+ * This must be done before load_segments() since if call depth tracking
+ * is used then GS must be valid to make any function calls.
+ */
+ host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -326,17 +381,13 @@ void machine_kexec(struct kimage *image)
#endif
}
- control_page = page_address(image->control_code_page) + PAGE_SIZE;
- __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+ control_page = page_address(image->control_code_page);
- page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
- page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
- page_list[PA_TABLE_PAGE] =
- (unsigned long)__pa(page_address(image->control_code_page));
-
- if (image->type == KEXEC_TYPE_DEFAULT)
- page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
- << PAGE_SHIFT);
+ /*
+ * Allow for the possibility that relocate_kernel might not be at
+ * the very start of the page.
+ */
+ relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel - reloc_start;
/*
* The segment registers are funny things, they have both a
@@ -357,11 +408,11 @@ void machine_kexec(struct kimage *image)
native_gdt_invalidate();
/* now call it */
- image->start = relocate_kernel((unsigned long)image->head,
- (unsigned long)page_list,
- image->start,
- image->preserve_context,
- cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT));
+ image->start = relocate_kernel_ptr((unsigned long)image->head,
+ virt_to_phys(control_page),
+ image->start,
+ image->preserve_context,
+ host_mem_enc_active);
#ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -511,6 +562,8 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image)
}
#endif /* CONFIG_KEXEC_FILE */
+#ifdef CONFIG_CRASH_DUMP
+
static int
kexec_mark_range(unsigned long start, unsigned long end, bool protect)
{
@@ -540,8 +593,7 @@ static void kexec_mark_crashkres(bool protect)
/* Don't touch the control code page used in crash_kexec().*/
control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
- /* Control code page is located in the 2nd page. */
- kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
+ kexec_mark_range(crashk_res.start, control - 1, protect);
control += KEXEC_CONTROL_PAGE_SIZE;
kexec_mark_range(control, crashk_res.end, protect);
}
@@ -555,6 +607,7 @@ void arch_kexec_unprotect_crashkres(void)
{
kexec_mark_crashkres(false);
}
+#endif
/*
* During a traditional boot under SME, SME will encrypt the kernel,
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index c94dec6a1834..1f54eedc3015 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -9,6 +9,7 @@
#include <linux/pci.h>
#include <linux/dmi.h>
#include <linux/range.h>
+#include <linux/acpi.h>
#include <asm/pci-direct.h>
#include <linux/sort.h>
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 5f71a0cf4399..8984abd91c00 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -36,57 +36,6 @@ do { \
} while (0)
#endif
-#ifdef CONFIG_RANDOMIZE_BASE
-static unsigned long module_load_offset;
-
-/* Mutex protects the module_load_offset. */
-static DEFINE_MUTEX(module_kaslr_mutex);
-
-static unsigned long int get_module_load_offset(void)
-{
- if (kaslr_enabled()) {
- mutex_lock(&module_kaslr_mutex);
- /*
- * Calculate the module_load_offset the first time this
- * code is called. Once calculated it stays the same until
- * reboot.
- */
- if (module_load_offset == 0)
- module_load_offset =
- get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
- mutex_unlock(&module_kaslr_mutex);
- }
- return module_load_offset;
-}
-#else
-static unsigned long int get_module_load_offset(void)
-{
- return 0;
-}
-#endif
-
-void *module_alloc(unsigned long size)
-{
- gfp_t gfp_mask = GFP_KERNEL;
- void *p;
-
- if (PAGE_ALIGN(size) > MODULES_LEN)
- return NULL;
-
- p = __vmalloc_node_range(size, MODULE_ALIGN,
- MODULES_VADDR + get_module_load_offset(),
- MODULES_END, gfp_mask, PAGE_KERNEL,
- VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
- NUMA_NO_NODE, __builtin_return_address(0));
-
- if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
- vfree(p);
- return NULL;
- }
-
- return p;
-}
-
#ifdef CONFIG_X86_32
int apply_relocate(Elf32_Shdr *sechdrs,
const char *strtab,
@@ -197,18 +146,21 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs,
}
if (apply) {
- if (memcmp(loc, &zero, size)) {
+ void *wr_loc = module_writable_address(me, loc);
+
+ if (memcmp(wr_loc, &zero, size)) {
pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), loc, val);
return -ENOEXEC;
}
- write(loc, &val, size);
+ write(wr_loc, &val, size);
} else {
if (memcmp(loc, &val, size)) {
pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n",
(int)ELF64_R_TYPE(rel[i].r_info), loc, val);
return -ENOEXEC;
}
+ /* FIXME: needs care for ROX module allocations */
write(loc, &zero, size);
}
}
@@ -275,8 +227,8 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
- const Elf_Shdr *s, *alt = NULL, *locks = NULL,
- *para = NULL, *orc = NULL, *orc_ip = NULL,
+ const Elf_Shdr *s, *alt = NULL,
+ *orc = NULL, *orc_ip = NULL,
*retpolines = NULL, *returns = NULL, *ibt_endbr = NULL,
*calls = NULL, *cfi = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
@@ -284,10 +236,6 @@ int module_finalize(const Elf_Ehdr *hdr,
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
if (!strcmp(".altinstructions", secstrings + s->sh_name))
alt = s;
- if (!strcmp(".smp_locks", secstrings + s->sh_name))
- locks = s;
- if (!strcmp(".parainstructions", secstrings + s->sh_name))
- para = s;
if (!strcmp(".orc_unwind", secstrings + s->sh_name))
orc = s;
if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name))
@@ -304,14 +252,6 @@ int module_finalize(const Elf_Ehdr *hdr,
ibt_endbr = s;
}
- /*
- * See alternative_instructions() for the ordering rules between the
- * various patching types.
- */
- if (para) {
- void *pseg = (void *)para->sh_addr;
- apply_paravirt(pseg, pseg + para->sh_size);
- }
if (retpolines || cfi) {
void *rseg = NULL, *cseg = NULL;
unsigned int rsize = 0, csize = 0;
@@ -326,22 +266,22 @@ int module_finalize(const Elf_Ehdr *hdr,
csize = cfi->sh_size;
}
- apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize);
+ apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me);
}
if (retpolines) {
void *rseg = (void *)retpolines->sh_addr;
- apply_retpolines(rseg, rseg + retpolines->sh_size);
+ apply_retpolines(rseg, rseg + retpolines->sh_size, me);
}
if (returns) {
void *rseg = (void *)returns->sh_addr;
- apply_returns(rseg, rseg + returns->sh_size);
+ apply_returns(rseg, rseg + returns->sh_size, me);
}
if (alt) {
/* patch .altinstructions */
void *aseg = (void *)alt->sh_addr;
- apply_alternatives(aseg, aseg + alt->sh_size);
+ apply_alternatives(aseg, aseg + alt->sh_size, me);
}
- if (calls || para) {
+ if (calls || alt) {
struct callthunk_sites cs = {};
if (calls) {
@@ -349,17 +289,37 @@ int module_finalize(const Elf_Ehdr *hdr,
cs.call_end = (void *)calls->sh_addr + calls->sh_size;
}
- if (para) {
- cs.pv_start = (void *)para->sh_addr;
- cs.pv_end = (void *)para->sh_addr + para->sh_size;
+ if (alt) {
+ cs.alt_start = (void *)alt->sh_addr;
+ cs.alt_end = (void *)alt->sh_addr + alt->sh_size;
}
callthunks_patch_module_calls(&cs, me);
}
if (ibt_endbr) {
void *iseg = (void *)ibt_endbr->sh_addr;
- apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size);
+ apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me);
}
+
+ if (orc && orc_ip)
+ unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
+ (void *)orc->sh_addr, orc->sh_size);
+
+ return 0;
+}
+
+int module_post_finalize(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ struct module *me)
+{
+ const Elf_Shdr *s, *locks = NULL;
+ char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
+ if (!strcmp(".smp_locks", secstrings + s->sh_name))
+ locks = s;
+ }
+
if (locks) {
void *lseg = (void *)locks->sh_addr;
void *text = me->mem[MOD_TEXT].base;
@@ -369,10 +329,6 @@ int module_finalize(const Elf_Ehdr *hdr,
text, text_end);
}
- if (orc && orc_ip)
- unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
- (void *)orc->sh_addr, orc->sh_size);
-
return 0;
}
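Both module_finalize() and the new module_post_finalize() above walk the section headers by name. The lookup they share, pulled out as a generic sketch (demo_find_section() is hypothetical):

#include <linux/elf.h>
#include <linux/string.h>

static const Elf_Shdr *demo_find_section(const Elf_Ehdr *hdr,
                                         const Elf_Shdr *sechdrs,
                                         const char *name)
{
        const char *secstrings = (const void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
        const Elf_Shdr *s;

        for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++)
                if (!strcmp(name, secstrings + s->sh_name))
                        return s;
        return NULL;
}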
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b223922248e9..4a1b1b28abf9 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -36,6 +36,8 @@
* Checksum an MP configuration block.
*/
+static unsigned int num_procs __initdata;
+
static int __init mpf_checksum(unsigned char *mp, int len)
{
int sum = 0;
@@ -50,16 +52,15 @@ static void __init MP_processor_info(struct mpc_cpu *m)
{
char *bootup_cpu = "";
- if (!(m->cpuflag & CPU_ENABLED)) {
- disabled_cpus++;
+ topology_register_apic(m->apicid, CPU_ACPIID_INVALID, m->cpuflag & CPU_ENABLED);
+ if (!(m->cpuflag & CPU_ENABLED))
return;
- }
if (m->cpuflag & CPU_BOOTPROCESSOR)
bootup_cpu = " (Bootup-CPU)";
pr_info("Processor #%d%s\n", m->apicid, bootup_cpu);
- generic_processor_info(m->apicid);
+ num_procs++;
}
#ifdef CONFIG_X86_IO_APIC
@@ -67,7 +68,7 @@ static void __init mpc_oem_bus_info(struct mpc_bus *m, char *str)
{
memcpy(str, m->bustype, 6);
str[6] = 0;
- apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+ apic_pr_verbose("Bus #%d is %s\n", m->busid, str);
}
static void __init MP_bus_info(struct mpc_bus *m)
@@ -196,12 +197,12 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
if (!smp_check_mpc(mpc, oem, str))
return 0;
- /* Initialize the lapic mapping */
- if (!acpi_lapic)
- register_lapic_address(mpc->lapic);
-
- if (early)
+ if (early) {
+ /* Initialize the lapic mapping */
+ if (!acpi_lapic)
+ register_lapic_address(mpc->lapic);
return 1;
+ }
/* Now process the configuration blocks. */
while (count < mpc->length) {
@@ -236,9 +237,9 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
}
}
- if (!num_processors)
+ if (!num_procs && !acpi_lapic)
pr_err("MPTABLE: no processors registered!\n");
- return num_processors;
+ return num_procs || acpi_lapic;
}
#ifdef CONFIG_X86_IO_APIC
@@ -416,7 +417,7 @@ static unsigned long __init get_mpc_size(unsigned long physptr)
mpc = early_memremap(physptr, PAGE_SIZE);
size = mpc->length;
early_memunmap(mpc, PAGE_SIZE);
- apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size);
+ apic_pr_verbose(" mpc: %lx-%lx\n", physptr, physptr + size);
return size;
}
@@ -473,7 +474,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
/*
* Scan the memory blocks for an SMP configuration block.
*/
-void __init default_get_smp_config(unsigned int early)
+static __init void mpparse_get_smp_config(unsigned int early)
{
struct mpf_intel *mpf;
@@ -529,8 +530,8 @@ void __init default_get_smp_config(unsigned int early)
} else
BUG();
- if (!early)
- pr_info("Processors: %d\n", num_processors);
+ if (!early && !acpi_lapic)
+ pr_info("Processors: %d\n", num_procs);
/*
* Only use the first configuration found.
*/
@@ -538,6 +539,16 @@ out:
early_memunmap(mpf, sizeof(*mpf));
}
+void __init mpparse_parse_early_smp_config(void)
+{
+ mpparse_get_smp_config(true);
+}
+
+void __init mpparse_parse_smp_config(void)
+{
+ mpparse_get_smp_config(false);
+}
+
static void __init smp_reserve_memory(struct mpf_intel *mpf)
{
memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));
@@ -549,8 +560,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
struct mpf_intel *mpf;
int ret = 0;
- apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n",
- base, base + length - 1);
+ apic_pr_verbose("Scan for SMP in [mem %#010lx-%#010lx]\n", base, base + length - 1);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
@@ -587,7 +597,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
return ret;
}
-void __init default_find_smp_config(void)
+void __init mpparse_find_mptable(void)
{
unsigned int address;
@@ -672,13 +682,13 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
{
int i;
- apic_printk(APIC_VERBOSE, "OLD ");
+ apic_pr_verbose("OLD ");
print_mp_irq_info(m);
i = get_MP_intsrc_index(m);
if (i > 0) {
memcpy(m, &mp_irqs[i], sizeof(*m));
- apic_printk(APIC_VERBOSE, "NEW ");
+ apic_pr_verbose("NEW ");
print_mp_irq_info(&mp_irqs[i]);
return;
}
@@ -761,7 +771,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
continue;
if (nr_m_spare > 0) {
- apic_printk(APIC_VERBOSE, "*NEW* found\n");
+ apic_pr_verbose("*NEW* found\n");
nr_m_spare--;
memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
m_spare[nr_m_spare] = NULL;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 17e955ab69fe..ed163c8c8604 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -35,6 +35,7 @@
#include <asm/nospec-branch.h>
#include <asm/microcode.h>
#include <asm/sev.h>
+#include <asm/fred.h>
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
@@ -303,13 +304,13 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
__this_cpu_add(nmi_stats.unknown, 1);
- pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
+ pr_emerg_ratelimited("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
if (unknown_nmi_panic || panic_on_unrecovered_nmi)
nmi_panic(regs, "NMI: Not continuing");
- pr_emerg("Dazed and confused, but trying to continue\n");
+ pr_emerg_ratelimited("Dazed and confused, but trying to continue\n");
}
NOKPROBE_SYMBOL(unknown_nmi_error);
@@ -502,7 +503,7 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
raw_atomic_long_inc(&nsp->idt_calls);
- if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) {
+ if (arch_cpu_is_offline(smp_processor_id())) {
if (microcode_nmi_handler_enabled())
microcode_offline_nmi_handler();
return;
@@ -563,9 +564,6 @@ nmi_restart:
}
if (this_cpu_dec_return(nmi_state))
goto nmi_restart;
-
- if (user_mode(regs))
- mds_user_clear_cpu_buffers();
}
#if IS_ENABLED(CONFIG_KVM_INTEL)
@@ -582,7 +580,7 @@ EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx);
static char *nmi_check_stall_msg[] = {
/* */
-/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler. */
+/* +--------- nmi_seq & 0x1: CPU is currently in NMI handler. */
/* | +------ cpu_is_offline(cpu) */
/* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls): */
/* | | | NMI handler has been invoked. */
@@ -630,27 +628,72 @@ void nmi_backtrace_stall_check(const struct cpumask *btp)
nmi_seq = READ_ONCE(nsp->idt_nmi_seq);
if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) {
msgp = "CPU entered NMI handler function, but has not exited";
- } else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) {
- msgp = "CPU is handling NMIs";
- } else {
- idx = ((nsp->idt_seq_snap & 0x1) << 2) |
+ } else if (nsp->idt_nmi_seq_snap == nmi_seq ||
+ nsp->idt_nmi_seq_snap + 1 == nmi_seq) {
+ idx = ((nmi_seq & 0x1) << 2) |
(cpu_is_offline(cpu) << 1) |
(nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls));
msgp = nmi_check_stall_msg[idx];
if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
modp = ", but OK because ignore_nmis was set";
- if (nmi_seq & ~0x1)
- msghp = " (CPU currently in NMI handler function)";
- else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
+ if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
msghp = " (CPU exited one NMI handler function)";
+ else if (nmi_seq & 0x1)
+ msghp = " (CPU currently in NMI handler function)";
+ else
+ msghp = " (CPU was never in an NMI handler function)";
+ } else {
+ msgp = "CPU is handling NMIs";
}
- pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n",
- __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies));
+ pr_alert("%s: CPU %d: %s%s%s\n", __func__, cpu, msgp, modp, msghp);
+ pr_alert("%s: last activity: %lu jiffies ago.\n",
+ __func__, j - READ_ONCE(nsp->recv_jiffies));
}
}
#endif
+#ifdef CONFIG_X86_FRED
+/*
+ * With FRED, CR2/DR6 is pushed to #PF/#DB stack frame during FRED
+ * event delivery, i.e., there is no problem of transient states.
+ * And NMI unblocking only happens when the stack frame indicates
+ * that so should happen.
+ *
+ * Thus, the NMI entry stub for FRED is really straightforward and
+ * as simple as most exception handlers. As such, #DB is allowed
+ * during NMI handling.
+ */
+DEFINE_FREDENTRY_NMI(exc_nmi)
+{
+ irqentry_state_t irq_state;
+
+ if (arch_cpu_is_offline(smp_processor_id())) {
+ if (microcode_nmi_handler_enabled())
+ microcode_offline_nmi_handler();
+ return;
+ }
+
+ /*
+ * Save CR2 for eventual restore to cover the case where the NMI
+ * hits the VMENTER/VMEXIT region where guest CR2 is live. This
+ * prevents guest state corruption in case that the NMI handler
+ * takes a page fault.
+ */
+ this_cpu_write(nmi_cr2, read_cr2());
+
+ irq_state = irqentry_nmi_enter(regs);
+
+ inc_irq_stat(__nmi_count);
+ default_do_nmi(regs);
+
+ irqentry_nmi_exit(regs, irq_state);
+
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+}
+#endif
+
void stop_nmi(void)
{
ignore_nmis++;
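The reworked stall report above packs three observations into a 3-bit table index: bit 2 = the CPU is currently inside its NMI handler (nmi_seq & 0x1), bit 1 = cpu_is_offline(cpu), bit 0 = the CPU has taken at least one NMI since the snapshot. For instance, an online CPU sitting in its handler that has taken NMIs gives idx = 0b101 = 5. A tiny restatement (demo_* name hypothetical):

static unsigned int demo_stall_idx(unsigned long nmi_seq, bool offline, bool took_nmi)
{
        return ((nmi_seq & 0x1) << 2) | (offline << 1) | took_nmi;
}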
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 97f1436c1a20..1ccaa3397a67 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -34,14 +34,8 @@
#include <asm/io_bitmap.h>
#include <asm/gsseg.h>
-/*
- * nop stub, which must not clobber anything *including the stack* to
- * avoid confusing the entry prologues.
- */
-DEFINE_PARAVIRT_ASM(_paravirt_nop, "", .entry.text);
-
/* stub always returning 0. */
-DEFINE_PARAVIRT_ASM(paravirt_ret0, "xor %eax,%eax", .entry.text);
+DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text);
void __init default_banner(void)
{
@@ -49,63 +43,36 @@ void __init default_banner(void)
pv_info.name);
}
-/* Undefined instruction for dealing with missing ops pointers. */
-noinstr void paravirt_BUG(void)
-{
- BUG();
-}
-
-static unsigned paravirt_patch_call(void *insn_buff, const void *target,
- unsigned long addr, unsigned len)
-{
- __text_gen_insn(insn_buff, CALL_INSN_OPCODE,
- (void *)addr, target, CALL_INSN_SIZE);
- return CALL_INSN_SIZE;
-}
-
#ifdef CONFIG_PARAVIRT_XXL
-DEFINE_PARAVIRT_ASM(_paravirt_ident_64, "mov %rdi, %rax", .text);
-DEFINE_PARAVIRT_ASM(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
-DEFINE_PARAVIRT_ASM(pv_native_irq_disable, "cli", .noinstr.text);
-DEFINE_PARAVIRT_ASM(pv_native_irq_enable, "sti", .noinstr.text);
-DEFINE_PARAVIRT_ASM(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
+DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text);
+DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text);
+DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text);
#endif
-DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
+DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key);
void __init native_pv_lock_init(void)
{
- if (IS_ENABLED(CONFIG_PARAVIRT_SPINLOCKS) &&
- !boot_cpu_has(X86_FEATURE_HYPERVISOR))
- static_branch_disable(&virt_spin_lock_key);
+ if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ static_branch_enable(&virt_spin_lock_key);
}
+#ifndef CONFIG_PT_RECLAIM
static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
- tlb_remove_page(tlb, table);
-}
+ struct ptdesc *ptdesc = (struct ptdesc *)table;
-unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr,
- unsigned int len)
+ pagetable_dtor(ptdesc);
+ tlb_remove_page(tlb, ptdesc_page(ptdesc));
+}
+#else
+static void native_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
- /*
- * Neat trick to map patch type back to the call within the
- * corresponding structure.
- */
- void *opfunc = *((void **)&pv_ops + type);
- unsigned ret;
-
- if (opfunc == NULL)
- /* If there's no function, patch it with paravirt_BUG() */
- ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len);
- else if (opfunc == _paravirt_nop)
- ret = 0;
- else
- /* Otherwise call the function. */
- ret = paravirt_patch_call(insn_buff, opfunc, addr, len);
-
- return ret;
+ tlb_remove_table(tlb, table);
}
+#endif
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -159,11 +126,6 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
native_set_debugreg(regno, val);
}
-noinstr void pv_native_wbinvd(void)
-{
- native_wbinvd();
-}
-
static noinstr void pv_native_safe_halt(void)
{
native_safe_halt();
@@ -191,7 +153,6 @@ struct paravirt_patch_template pv_ops = {
.cpu.read_cr0 = native_read_cr0,
.cpu.write_cr0 = native_write_cr0,
.cpu.write_cr4 = native_write_cr4,
- .cpu.wbinvd = pv_native_wbinvd,
.cpu.read_msr = native_read_msr,
.cpu.write_msr = native_write_msr,
.cpu.read_msr_safe = native_read_msr_safe,
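virt_spin_lock_key is inverted above: it now defaults to false and is enabled only when a hypervisor is detected, so bare-metal boots never patch that branch. The same pattern in miniature (demo_* names hypothetical):

#include <linux/jump_label.h>
#include <asm/cpufeature.h>

DEFINE_STATIC_KEY_FALSE(demo_virt_key);

static void __init demo_virt_init(void)
{
        if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
                static_branch_enable(&demo_virt_key);
}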
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index f323d83e40a7..6267363e0189 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -108,10 +108,6 @@ void __init pci_iommu_alloc(void)
swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);
}
-/*
- * See <Documentation/arch/x86/x86_64/boot-options.rst> for the iommu kernel
- * parameter documentation.
- */
static __init int iommu_setup(char *p)
{
iommu_merge = 1;
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index 319fef37d9dc..cc2c34ba7228 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -203,16 +203,6 @@ void __init probe_roms(void)
unsigned char c;
int i;
- /*
- * The ROM memory range is not part of the e820 table and is therefore not
- * pre-validated by BIOS. The kernel page table maps the ROM region as encrypted
- * memory, and SNP requires encrypted memory to be validated before access.
- * Do that here.
- */
- snp_prep_memory(video_rom_resource.start,
- ((system_rom_resource.end + 1) - video_rom_resource.start),
- SNP_PAGE_STATE_PRIVATE);
-
/* video rom */
upper = adapter_rom_resources[0].start;
for (start = video_rom_resource.start; start < upper; start += 2048) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b6f4e8399fca..6da6769d7254 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -30,6 +30,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/entry-common.h>
#include <asm/cpu.h>
+#include <asm/cpuid.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
@@ -477,7 +478,7 @@ void native_tss_update_io_bitmap(void)
/*
* Make sure that the TSS limit is covering the IO bitmap. It might have
* been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
- * access from user space to trigger a #GP because tbe bitmap is outside
+ * access from user space to trigger a #GP because the bitmap is outside
* the TSS limit.
*/
refresh_tss_limit();
@@ -825,7 +826,7 @@ void __noreturn stop_this_cpu(void *dummy)
* X86_FEATURE_SME due to cmdline options.
*/
if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
- native_wbinvd();
+ wbinvd();
/*
* This brings a cache line back and dirties it, but
@@ -835,42 +836,24 @@ void __noreturn stop_this_cpu(void *dummy)
*/
cpumask_clear_cpu(cpu, &cpus_stop_mask);
+#ifdef CONFIG_SMP
+ if (smp_ops.stop_this_cpu) {
+ smp_ops.stop_this_cpu();
+ BUG();
+ }
+#endif
+
for (;;) {
/*
* Use native_halt() so that memory contents don't change
* (stack usage and variables) after possibly issuing the
- * native_wbinvd() above.
+ * wbinvd() above.
*/
native_halt();
}
}
/*
- * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
- * states (local apic timer and TSC stop).
- *
- * XXX this function is completely buggered vs RCU and tracing.
- */
-static void amd_e400_idle(void)
-{
- /*
- * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
- * gets set after static_cpu_has() places have been converted via
- * alternatives.
- */
- if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
- default_idle();
- return;
- }
-
- tick_broadcast_enter();
-
- default_idle();
-
- tick_broadcast_exit();
-}
-
-/*
* Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf
* exists and whenever MONITOR/MWAIT extensions are present there is at
* least one C1 substate.
@@ -878,36 +861,37 @@ static void amd_e400_idle(void)
* Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait
* is passed to kernel commandline parameter.
*/
-static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
+static __init bool prefer_mwait_c1_over_halt(void)
{
+ const struct cpuinfo_x86 *c = &boot_cpu_data;
u32 eax, ebx, ecx, edx;
- /* User has disallowed the use of MWAIT. Fallback to HALT */
- if (boot_option_idle_override == IDLE_NOMWAIT)
- return 0;
+ /* If override is enforced on the command line, fall back to HALT. */
+ if (boot_option_idle_override != IDLE_NO_OVERRIDE)
+ return false;
/* MWAIT is not supported on this platform. Fallback to HALT */
if (!cpu_has(c, X86_FEATURE_MWAIT))
- return 0;
+ return false;
- /* Monitor has a bug. Fallback to HALT */
- if (boot_cpu_has_bug(X86_BUG_MONITOR))
- return 0;
+ /* Monitor has a bug or APIC stops in C1E. Fallback to HALT */
+ if (boot_cpu_has_bug(X86_BUG_MONITOR) || boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E))
+ return false;
- cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+ cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);
/*
* If MWAIT extensions are not available, it is safe to use MWAIT
* with EAX=0, ECX=0.
*/
if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED))
- return 1;
+ return true;
/*
* If MWAIT extensions are available, there should be at least one
* MWAIT C1 substate present.
*/
- return (edx & MWAIT_C1_SUBSTATE_MASK);
+ return !!(edx & MWAIT_C1_SUBSTATE_MASK);
}
/*
@@ -933,26 +917,27 @@ static __cpuidle void mwait_idle(void)
__current_clr_polling();
}
-void select_idle_routine(const struct cpuinfo_x86 *c)
+void __init select_idle_routine(void)
{
-#ifdef CONFIG_SMP
- if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
- pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
-#endif
- if (x86_idle_set() || boot_option_idle_override == IDLE_POLL)
+ if (boot_option_idle_override == IDLE_POLL) {
+ if (IS_ENABLED(CONFIG_SMP) && __max_threads_per_core > 1)
+ pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
return;
+ }
- if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
- pr_info("using AMD E400 aware idle routine\n");
- static_call_update(x86_idle, amd_e400_idle);
- } else if (prefer_mwait_c1_over_halt(c)) {
+ /* Required to guard against xen_set_default_idle() */
+ if (x86_idle_set())
+ return;
+
+ if (prefer_mwait_c1_over_halt()) {
pr_info("using mwait in idle threads\n");
static_call_update(x86_idle, mwait_idle);
} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
pr_info("using TDX aware idle routine\n");
static_call_update(x86_idle, tdx_safe_halt);
- } else
+ } else {
static_call_update(x86_idle, default_idle);
+ }
}
void amd_e400_c1e_apic_setup(void)
@@ -985,7 +970,10 @@ void __init arch_post_acpi_subsys_init(void)
if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halt in AMD C1E");
- pr_info("System has AMD C1E enabled\n");
+
+ if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE))
+ static_branch_enable(&arch_needs_tick_broadcast);
+ pr_info("System has AMD C1E erratum E400. Workaround enabled.\n");
}
static int __init idle_setup(char *str)
@@ -998,24 +986,14 @@ static int __init idle_setup(char *str)
boot_option_idle_override = IDLE_POLL;
cpu_idle_poll_ctrl(true);
} else if (!strcmp(str, "halt")) {
- /*
- * When the boot option of idle=halt is added, halt is
- * forced to be used for CPU idle. In such case CPU C2/C3
- * won't be used again.
- * To continue to load the CPU idle driver, don't touch
- * the boot_option_idle_override.
- */
- static_call_update(x86_idle, default_idle);
+ /* 'idle=halt': use HALT for idle. C-states are disabled. */
boot_option_idle_override = IDLE_HALT;
} else if (!strcmp(str, "nomwait")) {
- /*
- * If the boot option of "idle=nomwait" is added,
- * it means that mwait will be disabled for CPU C1/C2/C3
- * states.
- */
+ /* 'idle=nomwait' disables MWAIT for idle */
boot_option_idle_override = IDLE_NOMWAIT;
- } else
- return -1;
+ } else {
+ return -EINVAL;
+ }
return 0;
}
@@ -1030,7 +1008,10 @@ unsigned long arch_align_stack(unsigned long sp)
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
- return randomize_page(mm->brk, 0x02000000);
+ if (mmap_is_ia32())
+ return randomize_page(mm->brk, SZ_32M);
+
+ return randomize_page(mm->brk, SZ_1G);
}
/*
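The reworked prefer_mwait_c1_over_halt() boils down to a CPUID leaf 5 query: without MWAIT extensions (ECX bit 0 clear), MWAIT with EAX=0/ECX=0 is still usable; otherwise at least one C1 sub-state must be enumerated in EDX[7:4]. A minimal user-space sketch of just that leaf-5 half, assuming GCC's <cpuid.h>; it deliberately omits the X86_FEATURE_MWAIT, X86_BUG_MONITOR and X86_BUG_AMD_APIC_C1E checks the kernel also applies, and the constants are restated locally rather than taken from a kernel header:

#include <cpuid.h>
#include <stdbool.h>
#include <stdio.h>

#define CPUID_LEAF_MWAIT          0x5
#define CPUID5_ECX_EXT_SUPPORTED  0x1   /* MWAIT extensions enumerated in EDX */
#define MWAIT_C1_SUBSTATE_MASK    0xf0  /* EDX[7:4]: number of C1 sub-states  */

static bool c1_mwait_usable(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx))
                return false;           /* leaf 5 not implemented */

        /* Without extensions, MWAIT with EAX=0, ECX=0 is safe to use. */
        if (!(ecx & CPUID5_ECX_EXT_SUPPORTED))
                return true;

        /* With extensions, require at least one C1 sub-state. */
        return edx & MWAIT_C1_SUBSTATE_MASK;
}

int main(void)
{
        printf("MWAIT C1 preferred over HALT: %s\n",
               c1_mwait_usable() ? "yes" : "no");
        return 0;
}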
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 708c87b88cc1..0917c7f25720 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -156,13 +156,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
- struct fpu *prev_fpu = &prev->fpu;
int cpu = smp_processor_id();
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
- if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_prepare(prev_fpu, cpu);
+ if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD))
+ switch_fpu_prepare(prev_p, cpu);
/*
* Save away %gs. No need to save %fs, as it was saved on the
@@ -209,7 +208,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
raw_cpu_write(pcpu_hot.current_task, next_p);
- switch_fpu_finish();
+ switch_fpu_finish(next_p);
/* Load the Intel cache allocation PQR MSR. */
resctrl_sched_in(next_p);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 33b268747bb7..226472332a70 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -56,6 +56,7 @@
#include <asm/resctrl.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
+#include <asm/fred.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
@@ -117,7 +118,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
log_lvl, fs, fsindex, gs, gsindex, shadowgs);
- printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n",
+ printk("%sCS: %04x DS: %04x ES: %04x CR0: %016lx\n",
log_lvl, regs->cs, ds, es, cr0);
printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
log_lvl, cr2, cr3, cr4);
@@ -138,7 +139,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
log_lvl, d3, d6, d7);
}
- if (cpu_feature_enabled(X86_FEATURE_OSPKE))
+ if (cr4 & X86_CR4_PKE)
printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}
@@ -166,7 +167,29 @@ static noinstr unsigned long __rdgsbase_inactive(void)
lockdep_assert_irqs_disabled();
- if (!cpu_feature_enabled(X86_FEATURE_XENPV)) {
+ /*
+ * SWAPGS is no longer needed thus NOT allowed with FRED because
+ * FRED transitions ensure that an operating system can _always_
+ * operate with its own GS base address:
+ * - For events that occur in ring 3, FRED event delivery swaps
+ * the GS base address with the IA32_KERNEL_GS_BASE MSR.
+ * - ERETU (the FRED transition that returns to ring 3) also swaps
+ * the GS base address with the IA32_KERNEL_GS_BASE MSR.
+ *
+ * And the operating system can still setup the GS segment for a
+ * user thread without the need of loading a user thread GS with:
+ * - Using LKGS, available with FRED, to modify other attributes
+ * of the GS segment without compromising its ability always to
+ * operate with its own GS base address.
+ * - Accessing the GS segment base address for a user thread as
+ * before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR.
+ *
+ * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE
+ * MSR instead of the GS segment's descriptor cache. As such, the
+ * operating system never changes its runtime GS base address.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+ !cpu_feature_enabled(X86_FEATURE_XENPV)) {
native_swapgs();
gsbase = rdgsbase();
native_swapgs();
@@ -191,7 +214,8 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
lockdep_assert_irqs_disabled();
- if (!cpu_feature_enabled(X86_FEATURE_XENPV)) {
+ if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+ !cpu_feature_enabled(X86_FEATURE_XENPV)) {
native_swapgs();
wrgsbase(gsbase);
native_swapgs();
@@ -505,7 +529,7 @@ void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp,
- unsigned int _cs, unsigned int _ss, unsigned int _ds)
+ u16 _cs, u16 _ss, u16 _ds)
{
WARN_ON_ONCE(regs != current_pt_regs());
@@ -522,11 +546,36 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
loadsegment(ds, _ds);
load_gs_index(0);
- regs->ip = new_ip;
- regs->sp = new_sp;
- regs->cs = _cs;
- regs->ss = _ss;
- regs->flags = X86_EFLAGS_IF;
+ regs->ip = new_ip;
+ regs->sp = new_sp;
+ regs->csx = _cs;
+ regs->ssx = _ss;
+ /*
+ * Allow single-step trap and NMI when starting a new task, thus
+ * once the new task enters user space, single-step trap and NMI
+ * are both enabled immediately.
+ *
+ * Entering a new task is logically speaking a return from a
+ * system call (exec, fork, clone, etc.). As such, if ptrace
+ * enables single stepping a single step exception should be
+ * allowed to trigger immediately upon entering user space.
+ * This is not optional.
+ *
+ * NMI should *never* be disabled in user space. As such, this
+ * is an optional, opportunistic way to catch errors.
+ *
+ * Paranoia: High-order 48 bits above the lowest 16 bit SS are
+ * discarded by the legacy IRET instruction on all Intel, AMD,
+ * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally,
+ * even when FRED is not enabled. But we err on the safe side and
+ * use these bits only when FRED is enabled.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ regs->fred_ss.swevent = true;
+ regs->fred_ss.nmi = true;
+ }
+
+ regs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
}
void
@@ -562,14 +611,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
- struct fpu *prev_fpu = &prev->fpu;
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(pcpu_hot.hardirq_stack_inuse));
- if (!test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_prepare(prev_fpu, cpu);
+ if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD))
+ switch_fpu_prepare(prev_p, cpu);
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
@@ -623,7 +671,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
raw_cpu_write(pcpu_hot.current_task, next_p);
raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p));
- switch_fpu_finish();
+ switch_fpu_finish(next_p);
/* Reload sp0. */
update_task_stack(next_p);
@@ -750,6 +798,32 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
#define LAM_U57_BITS 6
+static void enable_lam_func(void *__mm)
+{
+ struct mm_struct *mm = __mm;
+ unsigned long lam;
+
+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
+ lam = mm_lam_cr3_mask(mm);
+ write_cr3(__read_cr3() | lam);
+ cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
+ }
+}
+
+static void mm_enable_lam(struct mm_struct *mm)
+{
+ mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
+ mm->context.untag_mask = ~GENMASK(62, 57);
+
+ /*
+ * Even though the process must still be single-threaded at this
+ * point, kernel threads may be using the mm. IPI those kernel
+ * threads if they exist.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
+ set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
+}
+
static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
{
if (!cpu_feature_enabled(X86_FEATURE_LAM))
@@ -766,25 +840,21 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
if (mmap_write_lock_killable(mm))
return -EINTR;
+ /*
+ * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from
+ * being enabled unless the process is single threaded:
+ */
if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
mmap_write_unlock(mm);
return -EBUSY;
}
- if (!nr_bits) {
- mmap_write_unlock(mm);
- return -EINVAL;
- } else if (nr_bits <= LAM_U57_BITS) {
- mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
- mm->context.untag_mask = ~GENMASK(62, 57);
- } else {
+ if (!nr_bits || nr_bits > LAM_U57_BITS) {
mmap_write_unlock(mm);
return -EINVAL;
}
- write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
- set_tlbstate_lam_mode(mm);
- set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
+ mm_enable_lam(mm);
mmap_write_unlock(mm);
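The new mm_enable_lam() helper is reached through the existing arch_prctl(ARCH_ENABLE_TAGGED_ADDR, nr_bits) interface, with nr_bits capped at LAM_U57's 6 tag bits (bits 62:57 are ignored on dereference). A small user-space sketch of opting in and dereferencing a tagged pointer; it assumes a LAM-capable CPU and kernel, and the ARCH_ENABLE_TAGGED_ADDR fallback value is taken from recent <asm/prctl.h> headers, so treat it as an assumption if your headers lack the define:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <asm/prctl.h>

#ifndef ARCH_ENABLE_TAGGED_ADDR
#define ARCH_ENABLE_TAGGED_ADDR 0x4002  /* fallback; value from recent kernels */
#endif

int main(void)
{
        /* Ask for LAM_U57: 6 tag bits, i.e. bits 62:57 untagged on access. */
        if (syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, 6)) {
                perror("ARCH_ENABLE_TAGGED_ADDR");
                return 1;
        }

        int *p = malloc(sizeof(*p));
        *p = 42;

        /* Stuff a tag into bits 62:57; the CPU strips it on dereference. */
        int *tagged = (int *)((uintptr_t)p | (0x2fULL << 57));
        printf("value through tagged pointer: %d\n", *tagged);

        free(p);        /* free the original, untagged pointer */
        return 0;
}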
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 830425e6d38e..dc1dd3f3e67f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,7 @@
#include <linux/delay.h>
#include <linux/objtool.h>
#include <linux/pgtable.h>
+#include <linux/kexec.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -529,7 +530,7 @@ static inline void kb_wait(void)
static inline void nmi_shootdown_cpus_on_restart(void);
-#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
+#if IS_ENABLED(CONFIG_KVM_X86)
/* RCU-protected callback to disable virtualization prior to reboot. */
static cpu_emergency_virt_cb __rcu *cpu_emergency_virt_callback;
@@ -599,7 +600,7 @@ static void emergency_reboot_disable_virtualization(void)
}
#else
static void emergency_reboot_disable_virtualization(void) { }
-#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */
+#endif /* CONFIG_KVM_X86 */
void __attribute__((weak)) mach_reboot_fixups(void)
{
@@ -716,6 +717,14 @@ static void native_machine_emergency_restart(void)
void native_machine_shutdown(void)
{
+ /*
+ * Call enc_kexec_begin() while all CPUs are still active and
+ * interrupts are enabled. This will allow all in-flight memory
+ * conversions to finish cleanly.
+ */
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_begin();
+
/* Stop the cpus and apics */
#ifdef CONFIG_X86_IO_APIC
/*
@@ -752,6 +761,9 @@ void native_machine_shutdown(void)
#ifdef CONFIG_X86_64
x86_platform.iommu_shutdown();
#endif
+
+ if (kexec_in_progress)
+ x86_platform.guest.enc_kexec_finish();
}
static void __machine_emergency_restart(int emergency)
@@ -796,7 +808,7 @@ struct machine_ops machine_ops __ro_after_init = {
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
.halt = native_machine_halt,
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
.crash_shutdown = native_machine_crash_shutdown,
#endif
};
@@ -826,7 +838,7 @@ void machine_halt(void)
machine_ops.halt();
}
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
void machine_crash_shutdown(struct pt_regs *regs)
{
machine_ops.crash_shutdown(regs);
@@ -868,6 +880,12 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
+
+ if (smp_ops.stop_this_cpu) {
+ smp_ops.stop_this_cpu();
+ BUG();
+ }
+
/* Assume hlt works */
halt();
for (;;)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 56cab1bb25f5..b44d8863e57f 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -5,12 +5,15 @@
*/
#include <linux/linkage.h>
+#include <linux/stringify.h>
+#include <asm/alternative.h>
#include <asm/page_types.h>
#include <asm/kexec.h>
#include <asm/processor-flags.h>
#include <asm/pgtable_types.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
+#include <asm/asm-offsets.h>
/*
* Must be relocatable PIC code callable as a C function, in particular
@@ -21,33 +24,30 @@
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
/*
- * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
- * ~ control_page + PAGE_SIZE are used as data storage and stack for
- * jumping back
+ * The .text..relocate_kernel and .data..relocate_kernel sections are copied
+ * into the control page, and the remainder of the page is used as the stack.
*/
-#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+ .section .data..relocate_kernel,"a";
/* Minimal CPU state */
-#define RSP DATA(0x0)
-#define CR0 DATA(0x8)
-#define CR3 DATA(0x10)
-#define CR4 DATA(0x18)
-
-/* other data */
-#define CP_PA_TABLE_PAGE DATA(0x20)
-#define CP_PA_SWAP_PAGE DATA(0x28)
-#define CP_PA_BACKUP_PAGES_MAP DATA(0x30)
-
- .text
- .align PAGE_SIZE
+SYM_DATA_LOCAL(saved_rsp, .quad 0)
+SYM_DATA_LOCAL(saved_cr0, .quad 0)
+SYM_DATA_LOCAL(saved_cr3, .quad 0)
+SYM_DATA_LOCAL(saved_cr4, .quad 0)
+ /* other data */
+SYM_DATA(kexec_va_control_page, .quad 0)
+SYM_DATA(kexec_pa_table_page, .quad 0)
+SYM_DATA(kexec_pa_swap_page, .quad 0)
+SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)
+
+ .section .text..relocate_kernel,"ax";
.code64
-SYM_CODE_START_NOALIGN(relocate_range)
SYM_CODE_START_NOALIGN(relocate_kernel)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
/*
* %rdi indirection_page
- * %rsi page_list
+ * %rsi pa_control_page
* %rdx start address
* %rcx preserve_context
* %r8 host_mem_enc_active
@@ -62,60 +62,57 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
pushq %r15
pushf
- movq PTR(VA_CONTROL_PAGE)(%rsi), %r11
- movq %rsp, RSP(%r11)
- movq %cr0, %rax
- movq %rax, CR0(%r11)
- movq %cr3, %rax
- movq %rax, CR3(%r11)
- movq %cr4, %rax
- movq %rax, CR4(%r11)
-
- /* Save CR4. Required to enable the right paging mode later. */
- movq %rax, %r13
-
/* zero out flags, and disable interrupts */
pushq $0
popfq
- /* Save SME active flag */
- movq %r8, %r12
+ /* Switch to the identity mapped page tables */
+ movq %cr3, %rax
+ movq kexec_pa_table_page(%rip), %r9
+ movq %r9, %cr3
- /*
- * get physical address of control page now
- * this is impossible after page table switch
- */
- movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
+ /* Leave CR4 in %r13 to enable the right paging mode later. */
+ movq %cr4, %r13
- /* get physical address of page table now too */
- movq PTR(PA_TABLE_PAGE)(%rsi), %r9
+ /* Disable global pages immediately to ensure this mapping is RWX */
+ movq %r13, %r12
+ andq $~(X86_CR4_PGE), %r12
+ movq %r12, %cr4
- /* get physical address of swap page now */
- movq PTR(PA_SWAP_PAGE)(%rsi), %r10
+ /* Save %rsp and CRs. */
+ movq %r13, saved_cr4(%rip)
+ movq %rsp, saved_rsp(%rip)
+ movq %rax, saved_cr3(%rip)
+ movq %cr0, %rax
+ movq %rax, saved_cr0(%rip)
- /* save some information for jumping back */
- movq %r9, CP_PA_TABLE_PAGE(%r11)
- movq %r10, CP_PA_SWAP_PAGE(%r11)
- movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
+ /* save indirection list for jumping back */
+ movq %rdi, pa_backup_pages_map(%rip)
- /* Switch to the identity mapped page tables */
- movq %r9, %cr3
+ /* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
+ movq %rcx, %r11
/* setup a new stack at the end of the physical control page */
- lea PAGE_SIZE(%r8), %rsp
+ lea PAGE_SIZE(%rsi), %rsp
/* jump to identity mapped page */
- addq $(identity_mapped - relocate_kernel), %r8
- pushq %r8
- ANNOTATE_UNRET_SAFE
- ret
- int3
+0: addq $identity_mapped - 0b, %rsi
+ subq $__relocate_kernel_start - 0b, %rsi
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rsi
SYM_CODE_END(relocate_kernel)
SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
UNWIND_HINT_END_OF_STACK
- /* set return address to 0 if not preserving context */
- pushq $0
+ /*
+ * %rdi indirection page
+ * %rdx start address
+ * %r8 host_mem_enc_active
+ * %r9 page table page
+ * %r11 preserve_context
+ * %r13 original CR4 when relocate_kernel() was invoked
+ */
+
/* store the start address on the stack */
pushq %rdx
@@ -145,16 +142,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* Set cr4 to a known state:
* - physical address extension enabled
* - 5-level paging, if it was enabled before
+ * - Machine check exception on TDX guest, if it was enabled before.
+ * Clearing MCE might not be allowed in TDX guests, depending on setup.
+ *
+ * Use R13 that contains the original CR4 value, read in relocate_kernel().
+ * PAE is always set in the original CR4.
*/
- movl $X86_CR4_PAE, %eax
- testq $X86_CR4_LA57, %r13
- jz 1f
- orl $X86_CR4_LA57, %eax
-1:
- movq %rax, %cr4
-
- jmp 1f
-1:
+ andl $(X86_CR4_PAE | X86_CR4_LA57), %r13d
+ ALTERNATIVE "", __stringify(orl $X86_CR4_MCE, %r13d), X86_FEATURE_TDX_GUEST
+ movq %r13, %cr4
/* Flush the TLB (needed?) */
movq %r9, %cr3
@@ -164,12 +160,11 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* entries that will conflict with the now unencrypted memory
* used by kexec. Flush the caches before copying the kernel.
*/
- testq %r12, %r12
- jz 1f
+ testq %r8, %r8
+ jz .Lsme_off
wbinvd
-1:
+.Lsme_off:
- movq %rcx, %r11
call swap_pages
/*
@@ -181,13 +176,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
movq %cr3, %rax
movq %rax, %cr3
+ testq %r11, %r11 /* preserve_context */
+ jnz .Lrelocate
+
/*
* set all of the registers to known values
* leave %rsp alone
*/
- testq %r11, %r11
- jnz 1f
xorl %eax, %eax
xorl %ebx, %ebx
xorl %ecx, %ecx
@@ -208,22 +204,36 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
ret
int3
-1:
+.Lrelocate:
popq %rdx
+
+ /* Use the swap page for the callee's stack */
+ movq kexec_pa_swap_page(%rip), %r10
leaq PAGE_SIZE(%r10), %rsp
+
+ /* push the existing entry point onto the callee's stack */
+ pushq %rdx
+
ANNOTATE_RETPOLINE_SAFE
call *%rdx
/* get the re-entry point of the peer system */
- movq 0(%rsp), %rbp
- leaq relocate_kernel(%rip), %r8
- movq CP_PA_SWAP_PAGE(%r8), %r10
- movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
- movq CP_PA_TABLE_PAGE(%r8), %rax
+ popq %rbp
+ movq kexec_pa_swap_page(%rip), %r10
+ movq pa_backup_pages_map(%rip), %rdi
+ movq kexec_pa_table_page(%rip), %rax
movq %rax, %cr3
+
+ /* Find start (and end) of this physical mapping of control page */
+ leaq (%rip), %r8
+ ANNOTATE_NOENDBR
+ andq $PAGE_MASK, %r8
lea PAGE_SIZE(%r8), %rsp
+ movl $1, %r11d /* Ensure preserve_context flag is set */
call swap_pages
- movq $virtual_mapped, %rax
+ movq kexec_va_control_page(%rip), %rax
+0: addq $virtual_mapped - 0b, %rax
+ subq $__relocate_kernel_start - 0b, %rax
pushq %rax
ANNOTATE_UNRET_SAFE
ret
@@ -233,13 +243,21 @@ SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // RET target, above
- movq RSP(%r8), %rsp
- movq CR4(%r8), %rax
+ movq saved_rsp(%rip), %rsp
+ movq saved_cr4(%rip), %rax
movq %rax, %cr4
- movq CR3(%r8), %rax
- movq CR0(%r8), %r8
+ movq saved_cr3(%rip), %rax
+ movq saved_cr0(%rip), %r8
movq %rax, %cr3
movq %r8, %cr0
+
+#ifdef CONFIG_KEXEC_JUMP
+ /* Saved in save_processor_state. */
+ movq $saved_context, %rax
+ lgdt saved_context_gdt_desc(%rax)
+#endif
+
+ /* relocate_kernel() returns the re-entry point for next time */
movq %rbp, %rax
popf
@@ -257,61 +275,69 @@ SYM_CODE_END(virtual_mapped)
/* Do the copies */
SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
UNWIND_HINT_END_OF_STACK
- movq %rdi, %rcx /* Put the page_list in %rcx */
+ /*
+ * %rdi indirection page
+ * %r11 preserve_context
+ */
+ movq %rdi, %rcx /* Put the indirection_page in %rcx */
xorl %edi, %edi
xorl %esi, %esi
- jmp 1f
+ jmp .Lstart /* Should start with an indirection record */
-0: /* top, read another word for the indirection page */
+.Lloop: /* top, read another word for the indirection page */
movq (%rbx), %rcx
addq $8, %rbx
-1:
+.Lstart:
testb $0x1, %cl /* is it a destination page? */
- jz 2f
+ jz .Lnotdest
movq %rcx, %rdi
andq $0xfffffffffffff000, %rdi
- jmp 0b
-2:
+ jmp .Lloop
+.Lnotdest:
testb $0x2, %cl /* is it an indirection page? */
- jz 2f
+ jz .Lnotind
movq %rcx, %rbx
andq $0xfffffffffffff000, %rbx
- jmp 0b
-2:
+ jmp .Lloop
+.Lnotind:
testb $0x4, %cl /* is it the done indicator? */
- jz 2f
- jmp 3f
-2:
+ jz .Lnotdone
+ jmp .Ldone
+.Lnotdone:
testb $0x8, %cl /* is it the source indicator? */
- jz 0b /* Ignore it otherwise */
+ jz .Lloop /* Ignore it otherwise */
movq %rcx, %rsi /* For every source page do a copy */
andq $0xfffffffffffff000, %rsi
- movq %rdi, %rdx
- movq %rsi, %rax
+ movq %rdi, %rdx /* Save destination page to %rdx */
+ movq %rsi, %rax /* Save source page to %rax */
- movq %r10, %rdi
+ testq %r11, %r11 /* Only actually swap for ::preserve_context */
+ jz .Lnoswap
+
+ /* copy source page to swap page */
+ movq kexec_pa_swap_page(%rip), %rdi
movl $512, %ecx
rep ; movsq
+ /* copy destination page to source page */
movq %rax, %rdi
movq %rdx, %rsi
movl $512, %ecx
rep ; movsq
+ /* copy swap page to destination page */
movq %rdx, %rdi
- movq %r10, %rsi
+ movq kexec_pa_swap_page(%rip), %rsi
+.Lnoswap:
movl $512, %ecx
rep ; movsq
lea PAGE_SIZE(%rax), %rsi
- jmp 0b
-3:
+ jmp .Lloop
+.Ldone:
ANNOTATE_UNRET_SAFE
ret
int3
SYM_CODE_END(swap_pages)
-
- .skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc
-SYM_CODE_END(relocate_range);
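The flag bits tested in swap_pages (0x1, 0x2, 0x4, 0x8) are the generic kexec indirection-list markers IND_DESTINATION, IND_INDIRECTION, IND_DONE and IND_SOURCE. A C sketch of the plain (non-preserve_context) walk, for orientation only; translate() is a hypothetical stand-in for the identity mapping the assembly runs under, and the list is expected to start with an indirection or done record, as the assembly comment notes:

#include <stdint.h>
#include <string.h>

#define IND_DESTINATION  0x1    /* entry names the next destination page     */
#define IND_INDIRECTION  0x2    /* entry points at the next indirection page */
#define IND_DONE         0x4    /* end of the list                           */
#define IND_SOURCE       0x8    /* entry names a source page to copy         */
#define PAGE_SIZE_4K     4096UL
#define ADDR_MASK        (~(PAGE_SIZE_4K - 1))

/* Copy each source page to the current destination, advancing one page per copy. */
static void walk_indirection(uintptr_t head, void *(*translate)(uintptr_t))
{
        uintptr_t *ind = NULL;
        uintptr_t entry = head;
        uint8_t *dst = NULL;

        for (;;) {
                if (entry & IND_DONE)
                        return;
                if (entry & IND_DESTINATION)
                        dst = translate(entry & ADDR_MASK);
                else if (entry & IND_INDIRECTION)
                        ind = translate(entry & ADDR_MASK);
                else if (entry & IND_SOURCE) {
                        memcpy(dst, translate(entry & ADDR_MASK), PAGE_SIZE_4K);
                        dst += PAGE_SIZE_4K;
                }
                entry = *ind++; /* read the next word of the indirection page */
        }
}

static void *identity(uintptr_t addr)
{
        return (void *)addr;    /* "physical" == virtual in this host-side sketch */
}

int main(void)
{
        static _Alignas(PAGE_SIZE_4K) uint8_t src[PAGE_SIZE_4K], dst[PAGE_SIZE_4K];
        static _Alignas(PAGE_SIZE_4K) uintptr_t ind_page[PAGE_SIZE_4K / sizeof(uintptr_t)];

        memset(src, 0xab, sizeof(src));
        ind_page[0] = (uintptr_t)dst | IND_DESTINATION;
        ind_page[1] = (uintptr_t)src | IND_SOURCE;
        ind_page[2] = IND_DONE;

        walk_indirection((uintptr_t)ind_page | IND_INDIRECTION, identity);
        return dst[0] == 0xab ? 0 : 1;  /* 0 means the page was copied */
}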
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1309b9b05338..51a849a79c98 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -10,7 +10,6 @@
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
#include <asm/time.h>
-#include <asm/intel-mid.h>
#include <asm/setup.h>
#ifdef CONFIG_X86_32
@@ -67,7 +66,7 @@ void mach_get_cmos_time(struct timespec64 *now)
return;
}
- if (mc146818_get_time(&tm)) {
+ if (mc146818_get_time(&tm, 1000)) {
pr_err("Unable to read current time from RTC\n");
now->tv_sec = now->tv_nsec = 0;
return;
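The new second argument to mc146818_get_time() is a timeout (1000 here, presumably milliseconds) bounding the wait for the RTC's update-in-progress bit, so a dead or stuck CMOS clock no longer wedges the read. A rough user-space sketch of the same bounded-UIP idea against the MC146818 index/data ports, assuming x86 glibc's <sys/io.h> and root for ioperm(); the spin count is an arbitrary stand-in for the kernel's timeout:

#include <stdio.h>
#include <sys/io.h>     /* ioperm(), inb(), outb(); x86 glibc only */

#define RTC_INDEX_PORT 0x70
#define RTC_DATA_PORT  0x71
#define RTC_REG_A      0x0a
#define RTC_UIP        0x80     /* "update in progress" bit in register A */
#define RTC_SECONDS    0x00

static unsigned char cmos_read(unsigned char reg)
{
        outb(reg, RTC_INDEX_PORT);
        return inb(RTC_DATA_PORT);
}

int main(void)
{
        if (ioperm(RTC_INDEX_PORT, 2, 1)) {     /* needs root */
                perror("ioperm");
                return 1;
        }

        /* Bounded wait for UIP to clear, analogous to the new timeout argument. */
        int spins = 1000000;
        while ((cmos_read(RTC_REG_A) & RTC_UIP) && --spins)
                ;
        if (!spins) {
                fprintf(stderr, "RTC update-in-progress never cleared\n");
                return 1;
        }

        unsigned char sec = cmos_read(RTC_SECONDS);     /* usually BCD-encoded */
        printf("RTC seconds register: %02x\n", sec);
        return 0;
}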
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 1526747bedf2..cebee310e200 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -7,9 +7,9 @@
*/
#include <linux/acpi.h>
#include <linux/console.h>
+#include <linux/cpu.h>
#include <linux/crash_dump.h>
#include <linux/dma-map-ops.h>
-#include <linux/dmi.h>
#include <linux/efi.h>
#include <linux/ima.h>
#include <linux/init_ohci1394_dma.h>
@@ -36,6 +36,7 @@
#include <asm/bios_ebda.h>
#include <asm/bugs.h>
#include <asm/cacheinfo.h>
+#include <asm/coco.h>
#include <asm/cpu.h>
#include <asm/efi.h>
#include <asm/gart.h>
@@ -163,7 +164,8 @@ unsigned long saved_video_mode;
static char __initdata command_line[COMMAND_LINE_SIZE];
#ifdef CONFIG_CMDLINE_BOOL
-static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+char builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+bool builtin_cmdline_added __ro_after_init;
#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
@@ -226,8 +228,6 @@ static void __init reserve_brk(void)
_brk_start = 0;
}
-u64 relocated_ramdisk;
-
#ifdef CONFIG_BLK_DEV_INITRD
static u64 __init get_ramdisk_image(void)
@@ -259,9 +259,10 @@ static void __init relocate_initrd(void)
u64 ramdisk_image = get_ramdisk_image();
u64 ramdisk_size = get_ramdisk_size();
u64 area_size = PAGE_ALIGN(ramdisk_size);
+ int ret = 0;
/* We need to move the initrd down into directly mapped mem */
- relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
+ u64 relocated_ramdisk = memblock_phys_alloc_range(area_size, PAGE_SIZE, 0,
PFN_PHYS(max_pfn_mapped));
if (!relocated_ramdisk)
panic("Cannot find place for new RAMDISK of size %lld\n",
@@ -272,7 +273,9 @@ static void __init relocate_initrd(void)
printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
- copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
+ ret = copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
+ if (ret)
+ panic("Copy RAMDISK failed\n");
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
" [mem %#010llx-%#010llx]\n",
@@ -473,7 +476,7 @@ static void __init arch_reserve_crashkernel(void)
bool high = false;
int ret;
- if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+ if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
return;
ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
@@ -755,6 +758,23 @@ void __init setup_arch(char **cmdline_p)
boot_cpu_data.x86_phys_bits = MAX_PHYSMEM_BITS;
#endif
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+ if (builtin_cmdline[0]) {
+ /* append boot loader cmdline to builtin */
+ strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+ strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+ strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+ }
+#endif
+ builtin_cmdline_added = true;
+#endif
+
+ strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
/*
* If we have OLPC OFW, we might end up relocating the fixmap due to
* reserve_top(), so do this before touching the ioremap area.
@@ -834,22 +854,6 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = __pa_symbol(__bss_start);
bss_resource.end = __pa_symbol(__bss_stop)-1;
-#ifdef CONFIG_CMDLINE_BOOL
-#ifdef CONFIG_CMDLINE_OVERRIDE
- strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
-#else
- if (builtin_cmdline[0]) {
- /* append boot loader cmdline to builtin */
- strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
- strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- strscpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
- }
-#endif
-#endif
-
- strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
- *cmdline_p = command_line;
-
/*
* x86_configure_nx() is called before parse_early_param() to detect
* whether hardware doesn't support NX (so that the early EHCI debug
@@ -904,7 +908,7 @@ void __init setup_arch(char **cmdline_p)
efi_init();
reserve_ibft_region();
- dmi_setup();
+ x86_init.resources.dmi_setup();
/*
* VMware detection requires dmi to be available, so this
@@ -972,10 +976,8 @@ void __init setup_arch(char **cmdline_p)
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
#endif
- /*
- * Find and reserve possible boot-time SMP configuration:
- */
- find_smp_config();
+ /* Find and reserve MPTABLE area */
+ x86_init.mpparse.find_mptable();
early_alloc_pgt_buf();
@@ -996,8 +998,8 @@ void __init setup_arch(char **cmdline_p)
* memory size.
*/
mem_encrypt_setup_arch();
+ cc_random_init();
- efi_fake_memmap();
efi_find_mirror();
efi_esrt_init();
efi_mokvar_table_init();
@@ -1033,12 +1035,19 @@ void __init setup_arch(char **cmdline_p)
*
* Moreover, on machines with SandyBridge graphics or in setups that use
* crashkernel the entire 1M is reserved anyway.
+ *
+ * Note that the TDX host kernel also requires the first 1MB to be reserved.
*/
x86_platform.realmode_reserve();
init_mem_mapping();
- idt_setup_early_pf();
+ /*
+ * init_mem_mapping() relies on the early IDT page fault handling.
+ * Now either enable FRED or install the real page fault handler
+ * for 64-bit in the IDT.
+ */
+ cpu_init_replace_early_idt();
/*
* Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
@@ -1090,7 +1099,9 @@ void __init setup_arch(char **cmdline_p)
early_platform_quirks();
+ /* Some platforms need the APIC registered for NUMA configuration */
early_acpi_boot_init();
+ x86_init.mpparse.early_parse_smp_cfg();
x86_flattree_get_config();
@@ -1106,8 +1117,6 @@ void __init setup_arch(char **cmdline_p)
*/
arch_reserve_crashkernel();
- memblock_find_dma_reserve();
-
if (!early_xdbc_setup_hardware())
early_xdbc_register_console();
@@ -1131,24 +1140,19 @@ void __init setup_arch(char **cmdline_p)
early_quirks();
- /*
- * Read APIC and some other early information from ACPI tables.
- */
- acpi_boot_init();
- x86_dtb_init();
+ topology_apply_cmdline_limits_early();
/*
- * get boot-time SMP configuration:
+ * Parse SMP configuration. Try ACPI first and then the platform
+ * specific parser.
*/
- get_smp_config();
+ acpi_boot_init();
+ x86_init.mpparse.parse_smp_cfg();
- /*
- * Systems w/o ACPI and mptables might not have it mapped the local
- * APIC yet, but prefill_possible_map() might need to access it.
- */
+ /* Last opportunity to detect and map the local APIC */
init_apic_mappings();
- prefill_possible_map();
+ topology_init_possible_cpus();
init_cpu_to_node();
init_gi_nodes();
@@ -1222,3 +1226,10 @@ static int __init register_kernel_offset_dumper(void)
return 0;
}
__initcall(register_kernel_offset_dumper);
+
+#ifdef CONFIG_HOTPLUG_CPU
+bool arch_cpu_is_hotpluggable(int cpu)
+{
+ return cpu > 0;
+}
+#endif /* CONFIG_HOTPLUG_CPU */
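Moving the CONFIG_CMDLINE_BOOL block earlier does not change the merge itself: with CONFIG_CMDLINE_OVERRIDE the built-in line replaces the boot loader's, otherwise the boot loader's line is appended after the built-in one and the result becomes boot_command_line. A small host-side sketch of the non-override ordering; the function name, buffer size and example strings are illustrative only:

#include <stdio.h>
#include <string.h>

#define COMMAND_LINE_SIZE 2048  /* illustrative; the kernel value is per-arch */

/*
 * Mirror the non-override merge: the built-in command line comes first,
 * the boot loader's line is appended after a space, and the result
 * becomes the effective boot command line.
 */
static void merge_cmdline(char *boot_cmdline, const char *builtin)
{
        char merged[COMMAND_LINE_SIZE];

        if (!builtin[0])
                return;         /* nothing built in: keep the boot loader's line */

        snprintf(merged, sizeof(merged), "%s %s", builtin, boot_cmdline);
        strncpy(boot_cmdline, merged, COMMAND_LINE_SIZE - 1);
        boot_cmdline[COMMAND_LINE_SIZE - 1] = '\0';
}

int main(void)
{
        char bootloader[COMMAND_LINE_SIZE] = "root=/dev/sda1 quiet";

        merge_cmdline(bootloader, "console=ttyS0,115200 earlyprintk=serial");
        printf("%s\n", bootloader);     /* built-in options first, boot loader's last */
        return 0;
}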
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2c97bf7b56ae..b30d6e180df7 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -106,8 +106,8 @@ void __init pcpu_populate_pte(unsigned long addr)
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
- struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
- 0xFFFFF);
+ struct desc_struct d = GDT_ENTRY_INIT(DESC_DATA32,
+ per_cpu_offset(cpu), 0xFFFFF);
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
#endif
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
deleted file mode 100644
index ccb0915e84e1..000000000000
--- a/arch/x86/kernel/sev-shared.c
+++ /dev/null
@@ -1,1172 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * AMD Encrypted Register State Support
- *
- * Author: Joerg Roedel <jroedel@suse.de>
- *
- * This file is not compiled stand-alone. It contains code shared
- * between the pre-decompression boot code and the running Linux kernel
- * and is included directly into both code-bases.
- */
-
-#ifndef __BOOT_COMPRESSED
-#define error(v) pr_err(v)
-#define has_cpuflag(f) boot_cpu_has(f)
-#else
-#undef WARN
-#define WARN(condition, format...) (!!(condition))
-#endif
-
-/* I/O parameters for CPUID-related helpers */
-struct cpuid_leaf {
- u32 fn;
- u32 subfn;
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
-};
-
-/*
- * Individual entries of the SNP CPUID table, as defined by the SNP
- * Firmware ABI, Revision 0.9, Section 7.1, Table 14.
- */
-struct snp_cpuid_fn {
- u32 eax_in;
- u32 ecx_in;
- u64 xcr0_in;
- u64 xss_in;
- u32 eax;
- u32 ebx;
- u32 ecx;
- u32 edx;
- u64 __reserved;
-} __packed;
-
-/*
- * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9,
- * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit
- * of 64 entries per CPUID table.
- */
-#define SNP_CPUID_COUNT_MAX 64
-
-struct snp_cpuid_table {
- u32 count;
- u32 __reserved1;
- u64 __reserved2;
- struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX];
-} __packed;
-
-/*
- * Since feature negotiation related variables are set early in the boot
- * process they must reside in the .data section so as not to be zeroed
- * out when the .bss section is later cleared.
- *
- * GHCB protocol version negotiated with the hypervisor.
- */
-static u16 ghcb_version __ro_after_init;
-
-/* Copy of the SNP firmware's CPUID page. */
-static struct snp_cpuid_table cpuid_table_copy __ro_after_init;
-
-/*
- * These will be initialized based on CPUID table so that non-present
- * all-zero leaves (for sparse tables) can be differentiated from
- * invalid/out-of-range leaves. This is needed since all-zero leaves
- * still need to be post-processed.
- */
-static u32 cpuid_std_range_max __ro_after_init;
-static u32 cpuid_hyp_range_max __ro_after_init;
-static u32 cpuid_ext_range_max __ro_after_init;
-
-static bool __init sev_es_check_cpu_features(void)
-{
- if (!has_cpuflag(X86_FEATURE_RDRAND)) {
- error("RDRAND instruction not supported - no trusted source of randomness available\n");
- return false;
- }
-
- return true;
-}
-
-static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason)
-{
- u64 val = GHCB_MSR_TERM_REQ;
-
- /* Tell the hypervisor what went wrong. */
- val |= GHCB_SEV_TERM_REASON(set, reason);
-
- /* Request Guest Termination from Hypervisor */
- sev_es_wr_ghcb_msr(val);
- VMGEXIT();
-
- while (true)
- asm volatile("hlt\n" : : : "memory");
-}
-
-/*
- * The hypervisor features are available from GHCB version 2 onward.
- */
-static u64 get_hv_features(void)
-{
- u64 val;
-
- if (ghcb_version < 2)
- return 0;
-
- sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ);
- VMGEXIT();
-
- val = sev_es_rd_ghcb_msr();
- if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP)
- return 0;
-
- return GHCB_MSR_HV_FT_RESP_VAL(val);
-}
-
-static void snp_register_ghcb_early(unsigned long paddr)
-{
- unsigned long pfn = paddr >> PAGE_SHIFT;
- u64 val;
-
- sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn));
- VMGEXIT();
-
- val = sev_es_rd_ghcb_msr();
-
- /* If the response GPA is not ours then abort the guest */
- if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) ||
- (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn))
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER);
-}
-
-static bool sev_es_negotiate_protocol(void)
-{
- u64 val;
-
- /* Do the GHCB protocol version negotiation */
- sev_es_wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ);
- VMGEXIT();
- val = sev_es_rd_ghcb_msr();
-
- if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP)
- return false;
-
- if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN ||
- GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX)
- return false;
-
- ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX);
-
- return true;
-}
-
-static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
-{
- ghcb->save.sw_exit_code = 0;
- __builtin_memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
-}
-
-static bool vc_decoding_needed(unsigned long exit_code)
-{
- /* Exceptions don't require to decode the instruction */
- return !(exit_code >= SVM_EXIT_EXCP_BASE &&
- exit_code <= SVM_EXIT_LAST_EXCP);
-}
-
-static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt,
- struct pt_regs *regs,
- unsigned long exit_code)
-{
- enum es_result ret = ES_OK;
-
- memset(ctxt, 0, sizeof(*ctxt));
- ctxt->regs = regs;
-
- if (vc_decoding_needed(exit_code))
- ret = vc_decode_insn(ctxt);
-
- return ret;
-}
-
-static void vc_finish_insn(struct es_em_ctxt *ctxt)
-{
- ctxt->regs->ip += ctxt->insn.length;
-}
-
-static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- u32 ret;
-
- ret = ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0);
- if (!ret)
- return ES_OK;
-
- if (ret == 1) {
- u64 info = ghcb->save.sw_exit_info_2;
- unsigned long v = info & SVM_EVTINJ_VEC_MASK;
-
- /* Check if exception information from hypervisor is sane. */
- if ((info & SVM_EVTINJ_VALID) &&
- ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) &&
- ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) {
- ctxt->fi.vector = v;
-
- if (info & SVM_EVTINJ_VALID_ERR)
- ctxt->fi.error_code = info >> 32;
-
- return ES_EXCEPTION;
- }
- }
-
- return ES_VMM_ERROR;
-}
-
-static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt,
- u64 exit_code, u64 exit_info_1,
- u64 exit_info_2)
-{
- /* Fill in protocol and format specifiers */
- ghcb->protocol_version = ghcb_version;
- ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
-
- ghcb_set_sw_exit_code(ghcb, exit_code);
- ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
- ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
-
- sev_es_wr_ghcb_msr(__pa(ghcb));
- VMGEXIT();
-
- return verify_exception_info(ghcb, ctxt);
-}
-
-static int __sev_cpuid_hv(u32 fn, int reg_idx, u32 *reg)
-{
- u64 val;
-
- sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, reg_idx));
- VMGEXIT();
- val = sev_es_rd_ghcb_msr();
- if (GHCB_RESP_CODE(val) != GHCB_MSR_CPUID_RESP)
- return -EIO;
-
- *reg = (val >> 32);
-
- return 0;
-}
-
-static int __sev_cpuid_hv_msr(struct cpuid_leaf *leaf)
-{
- int ret;
-
- /*
- * MSR protocol does not support fetching non-zero subfunctions, but is
- * sufficient to handle current early-boot cases. Should that change,
- * make sure to report an error rather than ignoring the index and
- * grabbing random values. If this issue arises in the future, handling
- * can be added here to use GHCB-page protocol for cases that occur late
- * enough in boot that GHCB page is available.
- */
- if (cpuid_function_is_indexed(leaf->fn) && leaf->subfn)
- return -EINVAL;
-
- ret = __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EAX, &leaf->eax);
- ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EBX, &leaf->ebx);
- ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_ECX, &leaf->ecx);
- ret = ret ? : __sev_cpuid_hv(leaf->fn, GHCB_CPUID_REQ_EDX, &leaf->edx);
-
- return ret;
-}
-
-static int __sev_cpuid_hv_ghcb(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
-{
- u32 cr4 = native_read_cr4();
- int ret;
-
- ghcb_set_rax(ghcb, leaf->fn);
- ghcb_set_rcx(ghcb, leaf->subfn);
-
- if (cr4 & X86_CR4_OSXSAVE)
- /* Safe to read xcr0 */
- ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
- else
- /* xgetbv will cause #UD - use reset value for xcr0 */
- ghcb_set_xcr0(ghcb, 1);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!(ghcb_rax_is_valid(ghcb) &&
- ghcb_rbx_is_valid(ghcb) &&
- ghcb_rcx_is_valid(ghcb) &&
- ghcb_rdx_is_valid(ghcb)))
- return ES_VMM_ERROR;
-
- leaf->eax = ghcb->save.rax;
- leaf->ebx = ghcb->save.rbx;
- leaf->ecx = ghcb->save.rcx;
- leaf->edx = ghcb->save.rdx;
-
- return ES_OK;
-}
-
-static int sev_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
-{
- return ghcb ? __sev_cpuid_hv_ghcb(ghcb, ctxt, leaf)
- : __sev_cpuid_hv_msr(leaf);
-}
-
-/*
- * This may be called early while still running on the initial identity
- * mapping. Use RIP-relative addressing to obtain the correct address
- * while running with the initial identity mapping as well as the
- * switch-over to kernel virtual addresses later.
- */
-static const struct snp_cpuid_table *snp_cpuid_get_table(void)
-{
- void *ptr;
-
- asm ("lea cpuid_table_copy(%%rip), %0"
- : "=r" (ptr)
- : "p" (&cpuid_table_copy));
-
- return ptr;
-}
-
-/*
- * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of
- * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0
- * and 1 based on the corresponding features enabled by a particular
- * combination of XCR0 and XSS registers so that a guest can look up the
- * version corresponding to the features currently enabled in its XCR0/XSS
- * registers. The only values that differ between these versions/table
- * entries is the enabled XSAVE area size advertised via EBX.
- *
- * While hypervisors may choose to make use of this support, it is more
- * robust/secure for a guest to simply find the entry corresponding to the
- * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the
- * XSAVE area size using subfunctions 2 through 64, as documented in APM
- * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here.
- *
- * Since base/legacy XSAVE area size is documented as 0x240, use that value
- * directly rather than relying on the base size in the CPUID table.
- *
- * Return: XSAVE area size on success, 0 otherwise.
- */
-static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
-{
- const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
- u64 xfeatures_found = 0;
- u32 xsave_size = 0x240;
- int i;
-
- for (i = 0; i < cpuid_table->count; i++) {
- const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
-
- if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64))
- continue;
- if (!(xfeatures_en & (BIT_ULL(e->ecx_in))))
- continue;
- if (xfeatures_found & (BIT_ULL(e->ecx_in)))
- continue;
-
- xfeatures_found |= (BIT_ULL(e->ecx_in));
-
- if (compacted)
- xsave_size += e->eax;
- else
- xsave_size = max(xsave_size, e->eax + e->ebx);
- }
-
- /*
- * Either the guest set unsupported XCR0/XSS bits, or the corresponding
- * entries in the CPUID table were not present. This is not a valid
- * state to be in.
- */
- if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2)))
- return 0;
-
- return xsave_size;
-}
-
-static bool
-snp_cpuid_get_validated_func(struct cpuid_leaf *leaf)
-{
- const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
- int i;
-
- for (i = 0; i < cpuid_table->count; i++) {
- const struct snp_cpuid_fn *e = &cpuid_table->fn[i];
-
- if (e->eax_in != leaf->fn)
- continue;
-
- if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn)
- continue;
-
- /*
- * For 0xD subfunctions 0 and 1, only use the entry corresponding
- * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0).
- * See the comments above snp_cpuid_calc_xsave_size() for more
- * details.
- */
- if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1))
- if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in)
- continue;
-
- leaf->eax = e->eax;
- leaf->ebx = e->ebx;
- leaf->ecx = e->ecx;
- leaf->edx = e->edx;
-
- return true;
- }
-
- return false;
-}
-
-static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
-{
- if (sev_cpuid_hv(ghcb, ctxt, leaf))
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
-}
-
-static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
- struct cpuid_leaf *leaf)
-{
- struct cpuid_leaf leaf_hv = *leaf;
-
- switch (leaf->fn) {
- case 0x1:
- snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
-
- /* initial APIC ID */
- leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0));
- /* APIC enabled bit */
- leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9));
-
- /* OSXSAVE enabled bit */
- if (native_read_cr4() & X86_CR4_OSXSAVE)
- leaf->ecx |= BIT(27);
- break;
- case 0x7:
- /* OSPKE enabled bit */
- leaf->ecx &= ~BIT(4);
- if (native_read_cr4() & X86_CR4_PKE)
- leaf->ecx |= BIT(4);
- break;
- case 0xB:
- leaf_hv.subfn = 0;
- snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
-
- /* extended APIC ID */
- leaf->edx = leaf_hv.edx;
- break;
- case 0xD: {
- bool compacted = false;
- u64 xcr0 = 1, xss = 0;
- u32 xsave_size;
-
- if (leaf->subfn != 0 && leaf->subfn != 1)
- return 0;
-
- if (native_read_cr4() & X86_CR4_OSXSAVE)
- xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
- if (leaf->subfn == 1) {
- /* Get XSS value if XSAVES is enabled. */
- if (leaf->eax & BIT(3)) {
- unsigned long lo, hi;
-
- asm volatile("rdmsr" : "=a" (lo), "=d" (hi)
- : "c" (MSR_IA32_XSS));
- xss = (hi << 32) | lo;
- }
-
- /*
- * The PPR and APM aren't clear on what size should be
- * encoded in 0xD:0x1:EBX when compaction is not enabled
- * by either XSAVEC (feature bit 1) or XSAVES (feature
- * bit 3) since SNP-capable hardware has these feature
- * bits fixed as 1. KVM sets it to 0 in this case, but
- * to avoid this becoming an issue it's safer to simply
- * treat this as unsupported for SNP guests.
- */
- if (!(leaf->eax & (BIT(1) | BIT(3))))
- return -EINVAL;
-
- compacted = true;
- }
-
- xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted);
- if (!xsave_size)
- return -EINVAL;
-
- leaf->ebx = xsave_size;
- }
- break;
- case 0x8000001E:
- snp_cpuid_hv(ghcb, ctxt, &leaf_hv);
-
- /* extended APIC ID */
- leaf->eax = leaf_hv.eax;
- /* compute ID */
- leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0));
- /* node ID */
- leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0));
- break;
- default:
- /* No fix-ups needed, use values as-is. */
- break;
- }
-
- return 0;
-}
-
-/*
- * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value
- * should be treated as fatal by caller.
- */
-static int snp_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpuid_leaf *leaf)
-{
- const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
-
- if (!cpuid_table->count)
- return -EOPNOTSUPP;
-
- if (!snp_cpuid_get_validated_func(leaf)) {
- /*
- * Some hypervisors will avoid keeping track of CPUID entries
- * where all values are zero, since they can be handled the
- * same as out-of-range values (all-zero). This is useful here
- * as well as it allows virtually all guest configurations to
- * work using a single SNP CPUID table.
- *
- * To allow for this, there is a need to distinguish between
- * out-of-range entries and in-range zero entries, since the
- * CPUID table entries are only a template that may need to be
- * augmented with additional values for things like
- * CPU-specific information during post-processing. So if it's
- * not in the table, set the values to zero. Then, if they are
- * within a valid CPUID range, proceed with post-processing
- * using zeros as the initial values. Otherwise, skip
- * post-processing and just return zeros immediately.
- */
- leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0;
-
- /* Skip post-processing for out-of-range zero leafs. */
- if (!(leaf->fn <= cpuid_std_range_max ||
- (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) ||
- (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max)))
- return 0;
- }
-
- return snp_cpuid_postprocess(ghcb, ctxt, leaf);
-}
-
-/*
- * Boot VC Handler - This is the first VC handler during boot, there is no GHCB
- * page yet, so it only supports the MSR based communication with the
- * hypervisor and only the CPUID exit-code.
- */
-void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
-{
- unsigned int subfn = lower_bits(regs->cx, 32);
- unsigned int fn = lower_bits(regs->ax, 32);
- struct cpuid_leaf leaf;
- int ret;
-
- /* Only CPUID is supported via MSR protocol */
- if (exit_code != SVM_EXIT_CPUID)
- goto fail;
-
- leaf.fn = fn;
- leaf.subfn = subfn;
-
- ret = snp_cpuid(NULL, NULL, &leaf);
- if (!ret)
- goto cpuid_done;
-
- if (ret != -EOPNOTSUPP)
- goto fail;
-
- if (__sev_cpuid_hv_msr(&leaf))
- goto fail;
-
-cpuid_done:
- regs->ax = leaf.eax;
- regs->bx = leaf.ebx;
- regs->cx = leaf.ecx;
- regs->dx = leaf.edx;
-
- /*
- * This is a VC handler and the #VC is only raised when SEV-ES is
- * active, which means SEV must be active too. Do sanity checks on the
- * CPUID results to make sure the hypervisor does not trick the kernel
- * into the no-sev path. This could map sensitive data unencrypted and
- * make it accessible to the hypervisor.
- *
- * In particular, check for:
- * - Availability of CPUID leaf 0x8000001f
- * - SEV CPUID bit.
- *
- * The hypervisor might still report the wrong C-bit position, but this
- * can't be checked here.
- */
-
- if (fn == 0x80000000 && (regs->ax < 0x8000001f))
- /* SEV leaf check */
- goto fail;
- else if ((fn == 0x8000001f && !(regs->ax & BIT(1))))
- /* SEV bit */
- goto fail;
-
- /* Skip over the CPUID two-byte opcode */
- regs->ip += 2;
-
- return;
-
-fail:
- /* Terminate the guest */
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-}
-
-static enum es_result vc_insn_string_check(struct es_em_ctxt *ctxt,
- unsigned long address,
- bool write)
-{
- if (user_mode(ctxt->regs) && fault_in_kernel_space(address)) {
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = X86_PF_USER;
- ctxt->fi.cr2 = address;
- if (write)
- ctxt->fi.error_code |= X86_PF_WRITE;
-
- return ES_EXCEPTION;
- }
-
- return ES_OK;
-}
-
-static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
- void *src, char *buf,
- unsigned int data_size,
- unsigned int count,
- bool backwards)
-{
- int i, b = backwards ? -1 : 1;
- unsigned long address = (unsigned long)src;
- enum es_result ret;
-
- ret = vc_insn_string_check(ctxt, address, false);
- if (ret != ES_OK)
- return ret;
-
- for (i = 0; i < count; i++) {
- void *s = src + (i * data_size * b);
- char *d = buf + (i * data_size);
-
- ret = vc_read_mem(ctxt, s, d, data_size);
- if (ret != ES_OK)
- break;
- }
-
- return ret;
-}
-
-static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
- void *dst, char *buf,
- unsigned int data_size,
- unsigned int count,
- bool backwards)
-{
- int i, s = backwards ? -1 : 1;
- unsigned long address = (unsigned long)dst;
- enum es_result ret;
-
- ret = vc_insn_string_check(ctxt, address, true);
- if (ret != ES_OK)
- return ret;
-
- for (i = 0; i < count; i++) {
- void *d = dst + (i * data_size * s);
- char *b = buf + (i * data_size);
-
- ret = vc_write_mem(ctxt, d, b, data_size);
- if (ret != ES_OK)
- break;
- }
-
- return ret;
-}
-
-#define IOIO_TYPE_STR BIT(2)
-#define IOIO_TYPE_IN 1
-#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR)
-#define IOIO_TYPE_OUT 0
-#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR)
-
-#define IOIO_REP BIT(3)
-
-#define IOIO_ADDR_64 BIT(9)
-#define IOIO_ADDR_32 BIT(8)
-#define IOIO_ADDR_16 BIT(7)
-
-#define IOIO_DATA_32 BIT(6)
-#define IOIO_DATA_16 BIT(5)
-#define IOIO_DATA_8 BIT(4)
-
-#define IOIO_SEG_ES (0 << 10)
-#define IOIO_SEG_DS (3 << 10)
-
-static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
-{
- struct insn *insn = &ctxt->insn;
- size_t size;
- u64 port;
-
- *exitinfo = 0;
-
- switch (insn->opcode.bytes[0]) {
- /* INS opcodes */
- case 0x6c:
- case 0x6d:
- *exitinfo |= IOIO_TYPE_INS;
- *exitinfo |= IOIO_SEG_ES;
- port = ctxt->regs->dx & 0xffff;
- break;
-
- /* OUTS opcodes */
- case 0x6e:
- case 0x6f:
- *exitinfo |= IOIO_TYPE_OUTS;
- *exitinfo |= IOIO_SEG_DS;
- port = ctxt->regs->dx & 0xffff;
- break;
-
- /* IN immediate opcodes */
- case 0xe4:
- case 0xe5:
- *exitinfo |= IOIO_TYPE_IN;
- port = (u8)insn->immediate.value & 0xffff;
- break;
-
- /* OUT immediate opcodes */
- case 0xe6:
- case 0xe7:
- *exitinfo |= IOIO_TYPE_OUT;
- port = (u8)insn->immediate.value & 0xffff;
- break;
-
- /* IN register opcodes */
- case 0xec:
- case 0xed:
- *exitinfo |= IOIO_TYPE_IN;
- port = ctxt->regs->dx & 0xffff;
- break;
-
- /* OUT register opcodes */
- case 0xee:
- case 0xef:
- *exitinfo |= IOIO_TYPE_OUT;
- port = ctxt->regs->dx & 0xffff;
- break;
-
- default:
- return ES_DECODE_FAILED;
- }
-
- *exitinfo |= port << 16;
-
- switch (insn->opcode.bytes[0]) {
- case 0x6c:
- case 0x6e:
- case 0xe4:
- case 0xe6:
- case 0xec:
- case 0xee:
- /* Single byte opcodes */
- *exitinfo |= IOIO_DATA_8;
- size = 1;
- break;
- default:
- /* Length determined by instruction parsing */
- *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16
- : IOIO_DATA_32;
- size = (insn->opnd_bytes == 2) ? 2 : 4;
- }
-
- switch (insn->addr_bytes) {
- case 2:
- *exitinfo |= IOIO_ADDR_16;
- break;
- case 4:
- *exitinfo |= IOIO_ADDR_32;
- break;
- case 8:
- *exitinfo |= IOIO_ADDR_64;
- break;
- }
-
- if (insn_has_rep_prefix(insn))
- *exitinfo |= IOIO_REP;
-
- return vc_ioio_check(ctxt, (u16)port, size);
-}
-
-static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- struct pt_regs *regs = ctxt->regs;
- u64 exit_info_1, exit_info_2;
- enum es_result ret;
-
- ret = vc_ioio_exitinfo(ctxt, &exit_info_1);
- if (ret != ES_OK)
- return ret;
-
- if (exit_info_1 & IOIO_TYPE_STR) {
-
- /* (REP) INS/OUTS */
-
- bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF);
- unsigned int io_bytes, exit_bytes;
- unsigned int ghcb_count, op_count;
- unsigned long es_base;
- u64 sw_scratch;
-
- /*
- * For the string variants with rep prefix the amount of in/out
- * operations per #VC exception is limited so that the kernel
- * has a chance to take interrupts and re-schedule while the
- * instruction is emulated.
- */
- io_bytes = (exit_info_1 >> 4) & 0x7;
- ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes;
-
- op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1;
- exit_info_2 = min(op_count, ghcb_count);
- exit_bytes = exit_info_2 * io_bytes;
-
- es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
-
- /* Read bytes of OUTS into the shared buffer */
- if (!(exit_info_1 & IOIO_TYPE_IN)) {
- ret = vc_insn_string_read(ctxt,
- (void *)(es_base + regs->si),
- ghcb->shared_buffer, io_bytes,
- exit_info_2, df);
- if (ret)
- return ret;
- }
-
- /*
- * Issue an VMGEXIT to the HV to consume the bytes from the
- * shared buffer or to have it write them into the shared buffer
- * depending on the instruction: OUTS or INS.
- */
- sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer);
- ghcb_set_sw_scratch(ghcb, sw_scratch);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO,
- exit_info_1, exit_info_2);
- if (ret != ES_OK)
- return ret;
-
- /* Read bytes from shared buffer into the guest's destination. */
- if (exit_info_1 & IOIO_TYPE_IN) {
- ret = vc_insn_string_write(ctxt,
- (void *)(es_base + regs->di),
- ghcb->shared_buffer, io_bytes,
- exit_info_2, df);
- if (ret)
- return ret;
-
- if (df)
- regs->di -= exit_bytes;
- else
- regs->di += exit_bytes;
- } else {
- if (df)
- regs->si -= exit_bytes;
- else
- regs->si += exit_bytes;
- }
-
- if (exit_info_1 & IOIO_REP)
- regs->cx -= exit_info_2;
-
- ret = regs->cx ? ES_RETRY : ES_OK;
-
- } else {
-
- /* IN/OUT into/from rAX */
-
- int bits = (exit_info_1 & 0x70) >> 1;
- u64 rax = 0;
-
- if (!(exit_info_1 & IOIO_TYPE_IN))
- rax = lower_bits(regs->ax, bits);
-
- ghcb_set_rax(ghcb, rax);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0);
- if (ret != ES_OK)
- return ret;
-
- if (exit_info_1 & IOIO_TYPE_IN) {
- if (!ghcb_rax_is_valid(ghcb))
- return ES_VMM_ERROR;
- regs->ax = lower_bits(ghcb->save.rax, bits);
- }
- }
-
- return ret;
-}
-
-static int vc_handle_cpuid_snp(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- struct pt_regs *regs = ctxt->regs;
- struct cpuid_leaf leaf;
- int ret;
-
- leaf.fn = regs->ax;
- leaf.subfn = regs->cx;
- ret = snp_cpuid(ghcb, ctxt, &leaf);
- if (!ret) {
- regs->ax = leaf.eax;
- regs->bx = leaf.ebx;
- regs->cx = leaf.ecx;
- regs->dx = leaf.edx;
- }
-
- return ret;
-}
-
-static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct pt_regs *regs = ctxt->regs;
- u32 cr4 = native_read_cr4();
- enum es_result ret;
- int snp_cpuid_ret;
-
- snp_cpuid_ret = vc_handle_cpuid_snp(ghcb, ctxt);
- if (!snp_cpuid_ret)
- return ES_OK;
- if (snp_cpuid_ret != -EOPNOTSUPP)
- return ES_VMM_ERROR;
-
- ghcb_set_rax(ghcb, regs->ax);
- ghcb_set_rcx(ghcb, regs->cx);
-
- if (cr4 & X86_CR4_OSXSAVE)
- /* Safe to read xcr0 */
- ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
- else
- /* xgetbv will cause #GP - use reset value for xcr0 */
- ghcb_set_xcr0(ghcb, 1);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!(ghcb_rax_is_valid(ghcb) &&
- ghcb_rbx_is_valid(ghcb) &&
- ghcb_rcx_is_valid(ghcb) &&
- ghcb_rdx_is_valid(ghcb)))
- return ES_VMM_ERROR;
-
- regs->ax = ghcb->save.rax;
- regs->bx = ghcb->save.rbx;
- regs->cx = ghcb->save.rcx;
- regs->dx = ghcb->save.rdx;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt,
- unsigned long exit_code)
-{
- bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
- enum es_result ret;
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) &&
- (!rdtscp || ghcb_rcx_is_valid(ghcb))))
- return ES_VMM_ERROR;
-
- ctxt->regs->ax = ghcb->save.rax;
- ctxt->regs->dx = ghcb->save.rdx;
- if (rdtscp)
- ctxt->regs->cx = ghcb->save.rcx;
-
- return ES_OK;
-}
-
-struct cc_setup_data {
- struct setup_data header;
- u32 cc_blob_address;
-};
-
-/*
- * Search for a Confidential Computing blob passed in as a setup_data entry
- * via the Linux Boot Protocol.
- */
-static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp)
-{
- struct cc_setup_data *sd = NULL;
- struct setup_data *hdr;
-
- hdr = (struct setup_data *)bp->hdr.setup_data;
-
- while (hdr) {
- if (hdr->type == SETUP_CC_BLOB) {
- sd = (struct cc_setup_data *)hdr;
- return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address;
- }
- hdr = (struct setup_data *)hdr->next;
- }
-
- return NULL;
-}
-
-/*
- * Initialize the kernel's copy of the SNP CPUID table, and set up the
- * pointer that will be used to access it.
- *
- * Maintaining a direct mapping of the SNP CPUID table used by firmware would
- * be possible as an alternative, but the approach is brittle since the
- * mapping needs to be updated in sync with all the changes to virtual memory
- * layout and related mapping facilities throughout the boot process.
- */
-static void __init setup_cpuid_table(const struct cc_blob_sev_info *cc_info)
-{
- const struct snp_cpuid_table *cpuid_table_fw, *cpuid_table;
- int i;
-
- if (!cc_info || !cc_info->cpuid_phys || cc_info->cpuid_len < PAGE_SIZE)
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID);
-
- cpuid_table_fw = (const struct snp_cpuid_table *)cc_info->cpuid_phys;
- if (!cpuid_table_fw->count || cpuid_table_fw->count > SNP_CPUID_COUNT_MAX)
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID);
-
- cpuid_table = snp_cpuid_get_table();
- memcpy((void *)cpuid_table, cpuid_table_fw, sizeof(*cpuid_table));
-
- /* Initialize CPUID ranges for range-checking. */
- for (i = 0; i < cpuid_table->count; i++) {
- const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
-
- if (fn->eax_in == 0x0)
- cpuid_std_range_max = fn->eax;
- else if (fn->eax_in == 0x40000000)
- cpuid_hyp_range_max = fn->eax;
- else if (fn->eax_in == 0x80000000)
- cpuid_ext_range_max = fn->eax;
- }
-}
-
-static void pvalidate_pages(struct snp_psc_desc *desc)
-{
- struct psc_entry *e;
- unsigned long vaddr;
- unsigned int size;
- unsigned int i;
- bool validate;
- int rc;
-
- for (i = 0; i <= desc->hdr.end_entry; i++) {
- e = &desc->entries[i];
-
- vaddr = (unsigned long)pfn_to_kaddr(e->gfn);
- size = e->pagesize ? RMP_PG_SIZE_2M : RMP_PG_SIZE_4K;
- validate = e->operation == SNP_PAGE_STATE_PRIVATE;
-
- rc = pvalidate(vaddr, size, validate);
- if (rc == PVALIDATE_FAIL_SIZEMISMATCH && size == RMP_PG_SIZE_2M) {
- unsigned long vaddr_end = vaddr + PMD_SIZE;
-
- for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) {
- rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
- if (rc)
- break;
- }
- }
-
- if (rc) {
- WARN(1, "Failed to validate address 0x%lx ret %d", vaddr, rc);
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
- }
- }
-}
-
-static int vmgexit_psc(struct ghcb *ghcb, struct snp_psc_desc *desc)
-{
- int cur_entry, end_entry, ret = 0;
- struct snp_psc_desc *data;
- struct es_em_ctxt ctxt;
-
- vc_ghcb_invalidate(ghcb);
-
- /* Copy the input desc into GHCB shared buffer */
- data = (struct snp_psc_desc *)ghcb->shared_buffer;
- memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc)));
-
- /*
- * As per the GHCB specification, the hypervisor can resume the guest
- * before processing all the entries. Check whether all the entries
- * are processed. If not, then keep retrying. Note, the hypervisor
- * will update the data memory directly to indicate the status, so
- * reference the data->hdr everywhere.
- *
- * The strategy here is to wait for the hypervisor to change the page
-	 * state in the RMP table before the guest accesses the memory pages. If the
-	 * page state change was not successful, then later memory accesses will
- * result in a crash.
- */
- cur_entry = data->hdr.cur_entry;
- end_entry = data->hdr.end_entry;
-
- while (data->hdr.cur_entry <= data->hdr.end_entry) {
- ghcb_set_sw_scratch(ghcb, (u64)__pa(data));
-
-		/* The hypervisor advances cur_entry in the shared buffer. */
- ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0);
-
- /*
- * Page State Change VMGEXIT can pass error code through
- * exit_info_2.
- */
- if (WARN(ret || ghcb->save.sw_exit_info_2,
- "SNP: PSC failed ret=%d exit_info_2=%llx\n",
- ret, ghcb->save.sw_exit_info_2)) {
- ret = 1;
- goto out;
- }
-
- /* Verify that reserved bit is not set */
- if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) {
- ret = 1;
- goto out;
- }
-
- /*
- * Sanity check that entry processing is not going backwards.
-		 * This will happen only if the hypervisor is tricking us.
- */
- if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry,
-"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n",
- end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) {
- ret = 1;
- goto out;
- }
- }
-
-out:
- return ret;
-}
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
deleted file mode 100644
index c67285824e82..000000000000
--- a/arch/x86/kernel/sev.c
+++ /dev/null
@@ -1,2264 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * AMD Memory Encryption Support
- *
- * Copyright (C) 2019 SUSE
- *
- * Author: Joerg Roedel <jroedel@suse.de>
- */
-
-#define pr_fmt(fmt) "SEV: " fmt
-
-#include <linux/sched/debug.h> /* For show_regs() */
-#include <linux/percpu-defs.h>
-#include <linux/cc_platform.h>
-#include <linux/printk.h>
-#include <linux/mm_types.h>
-#include <linux/set_memory.h>
-#include <linux/memblock.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/cpumask.h>
-#include <linux/efi.h>
-#include <linux/platform_device.h>
-#include <linux/io.h>
-#include <linux/psp-sev.h>
-#include <uapi/linux/sev-guest.h>
-
-#include <asm/cpu_entry_area.h>
-#include <asm/stacktrace.h>
-#include <asm/sev.h>
-#include <asm/insn-eval.h>
-#include <asm/fpu/xcr.h>
-#include <asm/processor.h>
-#include <asm/realmode.h>
-#include <asm/setup.h>
-#include <asm/traps.h>
-#include <asm/svm.h>
-#include <asm/smp.h>
-#include <asm/cpu.h>
-#include <asm/apic.h>
-#include <asm/cpuid.h>
-#include <asm/cmdline.h>
-
-#define DR7_RESET_VALUE 0x400
-
-/* AP INIT values as documented in the APM2 section "Processor Initialization State" */
-#define AP_INIT_CS_LIMIT 0xffff
-#define AP_INIT_DS_LIMIT 0xffff
-#define AP_INIT_LDTR_LIMIT 0xffff
-#define AP_INIT_GDTR_LIMIT 0xffff
-#define AP_INIT_IDTR_LIMIT 0xffff
-#define AP_INIT_TR_LIMIT 0xffff
-#define AP_INIT_RFLAGS_DEFAULT 0x2
-#define AP_INIT_DR6_DEFAULT 0xffff0ff0
-#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
-#define AP_INIT_XCR0_DEFAULT 0x1
-#define AP_INIT_X87_FTW_DEFAULT 0x5555
-#define AP_INIT_X87_FCW_DEFAULT 0x0040
-#define AP_INIT_CR0_DEFAULT 0x60000010
-#define AP_INIT_MXCSR_DEFAULT 0x1f80
-
-/* For early boot hypervisor communication in SEV-ES enabled guests */
-static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
-
-/*
- * Needs to be in the .data section because we need it NULL before bss is
- * cleared
- */
-static struct ghcb *boot_ghcb __section(".data");
-
-/* Bitmap of SEV features supported by the hypervisor */
-static u64 sev_hv_features __ro_after_init;
-
-/* #VC handler runtime per-CPU data */
-struct sev_es_runtime_data {
- struct ghcb ghcb_page;
-
- /*
- * Reserve one page per CPU as backup storage for the unencrypted GHCB.
- * It is needed when an NMI happens while the #VC handler uses the real
- * GHCB, and the NMI handler itself is causing another #VC exception. In
- * that case the GHCB content of the first handler needs to be backed up
- * and restored.
- */
- struct ghcb backup_ghcb;
-
- /*
- * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
- * There is no need for it to be atomic, because nothing is written to
- * the GHCB between the read and the write of ghcb_active. So it is safe
- * to use it when a nested #VC exception happens before the write.
- *
- * This is necessary for example in the #VC->NMI->#VC case when the NMI
- * happens while the first #VC handler uses the GHCB. When the NMI code
-	 * raises a second #VC exception, it might overwrite the contents of the
- * GHCB written by the first handler. To avoid this the content of the
- * GHCB is saved and restored when the GHCB is detected to be in use
- * already.
- */
- bool ghcb_active;
- bool backup_ghcb_active;
-
- /*
- * Cached DR7 value - write it on DR7 writes and return it on reads.
- * That value will never make it to the real hardware DR7 as debugging
- * is currently unsupported in SEV-ES guests.
- */
- unsigned long dr7;
-};
-
-struct ghcb_state {
- struct ghcb *ghcb;
-};
-
-static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
-static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa);
-
-struct sev_config {
- __u64 debug : 1,
-
- /*
- * A flag used by __set_pages_state() that indicates when the
- * per-CPU GHCB has been created and registered and thus can be
- * used by the BSP instead of the early boot GHCB.
- *
- * For APs, the per-CPU GHCB is created before they are started
- * and registered upon startup, so this flag can be used globally
- * for the BSP and APs.
- */
- ghcbs_initialized : 1,
-
- __reserved : 62;
-};
-
-static struct sev_config sev_cfg __read_mostly;
-
-static __always_inline bool on_vc_stack(struct pt_regs *regs)
-{
- unsigned long sp = regs->sp;
-
- /* User-mode RSP is not trusted */
- if (user_mode(regs))
- return false;
-
- /* SYSCALL gap still has user-mode RSP */
- if (ip_within_syscall_gap(regs))
- return false;
-
- return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
-}
-
-/*
- * This function handles the case when an NMI is raised in the #VC
- * exception handler entry code, before the #VC handler has switched off
- * its IST stack. In this case, the IST entry for #VC must be adjusted,
- * so that any nested #VC exception will not overwrite the stack
- * contents of the interrupted #VC handler.
- *
- * The IST entry is adjusted unconditionally so that it can also be
- * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
- * nested sev_es_ist_exit() call may adjust back the IST entry too
- * early.
- *
- * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
- * on the NMI IST stack, as they are only called from NMI handling code
- * right now.
- */
-void noinstr __sev_es_ist_enter(struct pt_regs *regs)
-{
- unsigned long old_ist, new_ist;
-
- /* Read old IST entry */
- new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
-
- /*
- * If NMI happened while on the #VC IST stack, set the new IST
- * value below regs->sp, so that the interrupted stack frame is
- * not overwritten by subsequent #VC exceptions.
- */
- if (on_vc_stack(regs))
- new_ist = regs->sp;
-
- /*
-	 * Reserve an additional 8 bytes and store the old IST value so this
- * adjustment can be unrolled in __sev_es_ist_exit().
- */
- new_ist -= sizeof(old_ist);
- *(unsigned long *)new_ist = old_ist;
-
- /* Set new IST entry */
- this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
-}
-
-void noinstr __sev_es_ist_exit(void)
-{
- unsigned long ist;
-
- /* Read IST entry */
- ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
-
- if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
- return;
-
- /* Read back old IST entry and write it to the TSS */
- this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
-}
-
-/*
- * Nothing shall interrupt this code path while holding the per-CPU
- * GHCB. The backup GHCB is only for NMIs interrupting this path.
- *
- * Callers must disable local interrupts around it.
- */
-static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
-{
- struct sev_es_runtime_data *data;
- struct ghcb *ghcb;
-
- WARN_ON(!irqs_disabled());
-
- data = this_cpu_read(runtime_data);
- ghcb = &data->ghcb_page;
-
- if (unlikely(data->ghcb_active)) {
- /* GHCB is already in use - save its contents */
-
- if (unlikely(data->backup_ghcb_active)) {
- /*
- * Backup-GHCB is also already in use. There is no way
- * to continue here so just kill the machine. To make
- * panic() work, mark GHCBs inactive so that messages
- * can be printed out.
- */
- data->ghcb_active = false;
- data->backup_ghcb_active = false;
-
- instrumentation_begin();
- panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
- instrumentation_end();
- }
-
- /* Mark backup_ghcb active before writing to it */
- data->backup_ghcb_active = true;
-
- state->ghcb = &data->backup_ghcb;
-
- /* Backup GHCB content */
- *state->ghcb = *ghcb;
- } else {
- state->ghcb = NULL;
- data->ghcb_active = true;
- }
-
- return ghcb;
-}
-
-static inline u64 sev_es_rd_ghcb_msr(void)
-{
- return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
-}
-
-static __always_inline void sev_es_wr_ghcb_msr(u64 val)
-{
- u32 low, high;
-
- low = (u32)(val);
- high = (u32)(val >> 32);
-
- native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
-}
-
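
A trivial illustration of the low/high split used for native_wrmsr() above; the 64-bit value here is an arbitrary stand-in for a GHCB GPA, chosen purely for demonstration.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t val = 0x1122334455667788ULL;	/* arbitrary example value */
	uint32_t low = (uint32_t)val;		/* goes into EAX */
	uint32_t high = (uint32_t)(val >> 32);	/* goes into EDX */

	printf("low 0x%08x high 0x%08x\n", low, high);
	printf("recombined 0x%016llx\n",
	       ((unsigned long long)high << 32) | low);
	return 0;
}
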
-static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
- unsigned char *buffer)
-{
- return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
-}
-
-static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
-{
- char buffer[MAX_INSN_SIZE];
- int insn_bytes;
-
- insn_bytes = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
- if (insn_bytes == 0) {
- /* Nothing could be copied */
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
- ctxt->fi.cr2 = ctxt->regs->ip;
- return ES_EXCEPTION;
- } else if (insn_bytes == -EINVAL) {
- /* Effective RIP could not be calculated */
- ctxt->fi.vector = X86_TRAP_GP;
- ctxt->fi.error_code = 0;
- ctxt->fi.cr2 = 0;
- return ES_EXCEPTION;
- }
-
- if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, insn_bytes))
- return ES_DECODE_FAILED;
-
- if (ctxt->insn.immediate.got)
- return ES_OK;
- else
- return ES_DECODE_FAILED;
-}
-
-static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
-{
- char buffer[MAX_INSN_SIZE];
- int res, ret;
-
- res = vc_fetch_insn_kernel(ctxt, buffer);
- if (res) {
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = X86_PF_INSTR;
- ctxt->fi.cr2 = ctxt->regs->ip;
- return ES_EXCEPTION;
- }
-
- ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
- if (ret < 0)
- return ES_DECODE_FAILED;
- else
- return ES_OK;
-}
-
-static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
-{
- if (user_mode(ctxt->regs))
- return __vc_decode_user_insn(ctxt);
- else
- return __vc_decode_kern_insn(ctxt);
-}
-
-static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
- char *dst, char *buf, size_t size)
-{
- unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
-
- /*
- * This function uses __put_user() independent of whether kernel or user
- * memory is accessed. This works fine because __put_user() does no
- * sanity checks of the pointer being accessed. All that it does is
- * to report when the access failed.
- *
- * Also, this function runs in atomic context, so __put_user() is not
- * allowed to sleep. The page-fault handler detects that it is running
- * in atomic context and will not try to take mmap_sem and handle the
- * fault, so additional pagefault_enable()/disable() calls are not
- * needed.
- *
- * The access can't be done via copy_to_user() here because
- * vc_write_mem() must not use string instructions to access unsafe
- * memory. The reason is that MOVS is emulated by the #VC handler by
- * splitting the move up into a read and a write and taking a nested #VC
-	 * exception on whichever of them is the MMIO access. Using string
- * instructions here would cause infinite nesting.
- */
- switch (size) {
- case 1: {
- u8 d1;
- u8 __user *target = (u8 __user *)dst;
-
- memcpy(&d1, buf, 1);
- if (__put_user(d1, target))
- goto fault;
- break;
- }
- case 2: {
- u16 d2;
- u16 __user *target = (u16 __user *)dst;
-
- memcpy(&d2, buf, 2);
- if (__put_user(d2, target))
- goto fault;
- break;
- }
- case 4: {
- u32 d4;
- u32 __user *target = (u32 __user *)dst;
-
- memcpy(&d4, buf, 4);
- if (__put_user(d4, target))
- goto fault;
- break;
- }
- case 8: {
- u64 d8;
- u64 __user *target = (u64 __user *)dst;
-
- memcpy(&d8, buf, 8);
- if (__put_user(d8, target))
- goto fault;
- break;
- }
- default:
- WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
- return ES_UNSUPPORTED;
- }
-
- return ES_OK;
-
-fault:
- if (user_mode(ctxt->regs))
- error_code |= X86_PF_USER;
-
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = error_code;
- ctxt->fi.cr2 = (unsigned long)dst;
-
- return ES_EXCEPTION;
-}
-
-static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
- char *src, char *buf, size_t size)
-{
- unsigned long error_code = X86_PF_PROT;
-
- /*
- * This function uses __get_user() independent of whether kernel or user
- * memory is accessed. This works fine because __get_user() does no
- * sanity checks of the pointer being accessed. All that it does is
- * to report when the access failed.
- *
- * Also, this function runs in atomic context, so __get_user() is not
- * allowed to sleep. The page-fault handler detects that it is running
- * in atomic context and will not try to take mmap_sem and handle the
- * fault, so additional pagefault_enable()/disable() calls are not
- * needed.
- *
- * The access can't be done via copy_from_user() here because
- * vc_read_mem() must not use string instructions to access unsafe
- * memory. The reason is that MOVS is emulated by the #VC handler by
- * splitting the move up into a read and a write and taking a nested #VC
-	 * exception on whichever of them is the MMIO access. Using string
- * instructions here would cause infinite nesting.
- */
- switch (size) {
- case 1: {
- u8 d1;
- u8 __user *s = (u8 __user *)src;
-
- if (__get_user(d1, s))
- goto fault;
- memcpy(buf, &d1, 1);
- break;
- }
- case 2: {
- u16 d2;
- u16 __user *s = (u16 __user *)src;
-
- if (__get_user(d2, s))
- goto fault;
- memcpy(buf, &d2, 2);
- break;
- }
- case 4: {
- u32 d4;
- u32 __user *s = (u32 __user *)src;
-
- if (__get_user(d4, s))
- goto fault;
- memcpy(buf, &d4, 4);
- break;
- }
- case 8: {
- u64 d8;
- u64 __user *s = (u64 __user *)src;
- if (__get_user(d8, s))
- goto fault;
- memcpy(buf, &d8, 8);
- break;
- }
- default:
- WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
- return ES_UNSUPPORTED;
- }
-
- return ES_OK;
-
-fault:
- if (user_mode(ctxt->regs))
- error_code |= X86_PF_USER;
-
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = error_code;
- ctxt->fi.cr2 = (unsigned long)src;
-
- return ES_EXCEPTION;
-}
-
-static enum es_result vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
- unsigned long vaddr, phys_addr_t *paddr)
-{
- unsigned long va = (unsigned long)vaddr;
- unsigned int level;
- phys_addr_t pa;
- pgd_t *pgd;
- pte_t *pte;
-
- pgd = __va(read_cr3_pa());
- pgd = &pgd[pgd_index(va)];
- pte = lookup_address_in_pgd(pgd, va, &level);
- if (!pte) {
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.cr2 = vaddr;
- ctxt->fi.error_code = 0;
-
- if (user_mode(ctxt->regs))
- ctxt->fi.error_code |= X86_PF_USER;
-
- return ES_EXCEPTION;
- }
-
- if (WARN_ON_ONCE(pte_val(*pte) & _PAGE_ENC))
- /* Emulated MMIO to/from encrypted memory not supported */
- return ES_UNSUPPORTED;
-
- pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
- pa |= va & ~page_level_mask(level);
-
- *paddr = pa;
-
- return ES_OK;
-}
-
-static enum es_result vc_ioio_check(struct es_em_ctxt *ctxt, u16 port, size_t size)
-{
- BUG_ON(size > 4);
-
- if (user_mode(ctxt->regs)) {
- struct thread_struct *t = &current->thread;
- struct io_bitmap *iobm = t->io_bitmap;
- size_t idx;
-
- if (!iobm)
- goto fault;
-
- for (idx = port; idx < port + size; ++idx) {
- if (test_bit(idx, iobm->bitmap))
- goto fault;
- }
- }
-
- return ES_OK;
-
-fault:
- ctxt->fi.vector = X86_TRAP_GP;
- ctxt->fi.error_code = 0;
-
- return ES_EXCEPTION;
-}
-
-/* Include code shared with pre-decompression boot stage */
-#include "sev-shared.c"
-
-static noinstr void __sev_put_ghcb(struct ghcb_state *state)
-{
- struct sev_es_runtime_data *data;
- struct ghcb *ghcb;
-
- WARN_ON(!irqs_disabled());
-
- data = this_cpu_read(runtime_data);
- ghcb = &data->ghcb_page;
-
- if (state->ghcb) {
- /* Restore GHCB from Backup */
- *ghcb = *state->ghcb;
- data->backup_ghcb_active = false;
- state->ghcb = NULL;
- } else {
- /*
- * Invalidate the GHCB so a VMGEXIT instruction issued
- * from userspace won't appear to be valid.
- */
- vc_ghcb_invalidate(ghcb);
- data->ghcb_active = false;
- }
-}
-
-void noinstr __sev_es_nmi_complete(void)
-{
- struct ghcb_state state;
- struct ghcb *ghcb;
-
- ghcb = __sev_get_ghcb(&state);
-
- vc_ghcb_invalidate(ghcb);
- ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
- ghcb_set_sw_exit_info_1(ghcb, 0);
- ghcb_set_sw_exit_info_2(ghcb, 0);
-
- sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
- VMGEXIT();
-
- __sev_put_ghcb(&state);
-}
-
-static u64 __init get_secrets_page(void)
-{
- u64 pa_data = boot_params.cc_blob_address;
- struct cc_blob_sev_info info;
- void *map;
-
- /*
-	 * The CC blob contains the address of the secrets page; check whether the
- * blob is present.
- */
- if (!pa_data)
- return 0;
-
- map = early_memremap(pa_data, sizeof(info));
- if (!map) {
- pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n");
- return 0;
- }
- memcpy(&info, map, sizeof(info));
- early_memunmap(map, sizeof(info));
-
- /* smoke-test the secrets page passed */
- if (!info.secrets_phys || info.secrets_len != PAGE_SIZE)
- return 0;
-
- return info.secrets_phys;
-}
-
-static u64 __init get_snp_jump_table_addr(void)
-{
- struct snp_secrets_page_layout *layout;
- void __iomem *mem;
- u64 pa, addr;
-
- pa = get_secrets_page();
- if (!pa)
- return 0;
-
- mem = ioremap_encrypted(pa, PAGE_SIZE);
- if (!mem) {
- pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n");
- return 0;
- }
-
- layout = (__force struct snp_secrets_page_layout *)mem;
-
- addr = layout->os_area.ap_jump_table_pa;
- iounmap(mem);
-
- return addr;
-}
-
-static u64 __init get_jump_table_addr(void)
-{
- struct ghcb_state state;
- unsigned long flags;
- struct ghcb *ghcb;
- u64 ret = 0;
-
- if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- return get_snp_jump_table_addr();
-
- local_irq_save(flags);
-
- ghcb = __sev_get_ghcb(&state);
-
- vc_ghcb_invalidate(ghcb);
- ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
- ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
- ghcb_set_sw_exit_info_2(ghcb, 0);
-
- sev_es_wr_ghcb_msr(__pa(ghcb));
- VMGEXIT();
-
- if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
- ghcb_sw_exit_info_2_is_valid(ghcb))
- ret = ghcb->save.sw_exit_info_2;
-
- __sev_put_ghcb(&state);
-
- local_irq_restore(flags);
-
- return ret;
-}
-
-static void early_set_pages_state(unsigned long vaddr, unsigned long paddr,
- unsigned long npages, enum psc_op op)
-{
- unsigned long paddr_end;
- u64 val;
- int ret;
-
- vaddr = vaddr & PAGE_MASK;
-
- paddr = paddr & PAGE_MASK;
- paddr_end = paddr + (npages << PAGE_SHIFT);
-
- while (paddr < paddr_end) {
- if (op == SNP_PAGE_STATE_SHARED) {
- /* Page validation must be rescinded before changing to shared */
- ret = pvalidate(vaddr, RMP_PG_SIZE_4K, false);
- if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
- goto e_term;
- }
-
- /*
- * Use the MSR protocol because this function can be called before
- * the GHCB is established.
- */
- sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op));
- VMGEXIT();
-
- val = sev_es_rd_ghcb_msr();
-
- if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
- "Wrong PSC response code: 0x%x\n",
- (unsigned int)GHCB_RESP_CODE(val)))
- goto e_term;
-
- if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
- "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
- op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
- paddr, GHCB_MSR_PSC_RESP_VAL(val)))
- goto e_term;
-
- if (op == SNP_PAGE_STATE_PRIVATE) {
- /* Page validation must be performed after changing to private */
- ret = pvalidate(vaddr, RMP_PG_SIZE_4K, true);
- if (WARN(ret, "Failed to validate address 0x%lx ret %d", paddr, ret))
- goto e_term;
- }
-
- vaddr += PAGE_SIZE;
- paddr += PAGE_SIZE;
- }
-
- return;
-
-e_term:
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
-}
-
-void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr,
- unsigned long npages)
-{
- /*
- * This can be invoked in early boot while running identity mapped, so
- * use an open coded check for SNP instead of using cc_platform_has().
- * This eliminates worries about jump tables or checking boot_cpu_data
- * in the cc_platform_has() function.
- */
- if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
- return;
-
- /*
- * Ask the hypervisor to mark the memory pages as private in the RMP
- * table.
- */
- early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
-}
-
-void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
- unsigned long npages)
-{
- /*
- * This can be invoked in early boot while running identity mapped, so
- * use an open coded check for SNP instead of using cc_platform_has().
- * This eliminates worries about jump tables or checking boot_cpu_data
- * in the cc_platform_has() function.
- */
- if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
- return;
-
- /* Ask hypervisor to mark the memory pages shared in the RMP table. */
- early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_SHARED);
-}
-
-void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op)
-{
- unsigned long vaddr, npages;
-
- vaddr = (unsigned long)__va(paddr);
- npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
-
- if (op == SNP_PAGE_STATE_PRIVATE)
- early_snp_set_memory_private(vaddr, paddr, npages);
- else if (op == SNP_PAGE_STATE_SHARED)
- early_snp_set_memory_shared(vaddr, paddr, npages);
- else
- WARN(1, "invalid memory op %d\n", op);
-}
-
-static unsigned long __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr,
- unsigned long vaddr_end, int op)
-{
- struct ghcb_state state;
- bool use_large_entry;
- struct psc_hdr *hdr;
- struct psc_entry *e;
- unsigned long flags;
- unsigned long pfn;
- struct ghcb *ghcb;
- int i;
-
- hdr = &data->hdr;
- e = data->entries;
-
- memset(data, 0, sizeof(*data));
- i = 0;
-
- while (vaddr < vaddr_end && i < ARRAY_SIZE(data->entries)) {
- hdr->end_entry = i;
-
- if (is_vmalloc_addr((void *)vaddr)) {
- pfn = vmalloc_to_pfn((void *)vaddr);
- use_large_entry = false;
- } else {
- pfn = __pa(vaddr) >> PAGE_SHIFT;
- use_large_entry = true;
- }
-
- e->gfn = pfn;
- e->operation = op;
-
- if (use_large_entry && IS_ALIGNED(vaddr, PMD_SIZE) &&
- (vaddr_end - vaddr) >= PMD_SIZE) {
- e->pagesize = RMP_PG_SIZE_2M;
- vaddr += PMD_SIZE;
- } else {
- e->pagesize = RMP_PG_SIZE_4K;
- vaddr += PAGE_SIZE;
- }
-
- e++;
- i++;
- }
-
- /* Page validation must be rescinded before changing to shared */
- if (op == SNP_PAGE_STATE_SHARED)
- pvalidate_pages(data);
-
- local_irq_save(flags);
-
- if (sev_cfg.ghcbs_initialized)
- ghcb = __sev_get_ghcb(&state);
- else
- ghcb = boot_ghcb;
-
- /* Invoke the hypervisor to perform the page state changes */
- if (!ghcb || vmgexit_psc(ghcb, data))
- sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
-
- if (sev_cfg.ghcbs_initialized)
- __sev_put_ghcb(&state);
-
- local_irq_restore(flags);
-
- /* Page validation must be performed after changing to private */
- if (op == SNP_PAGE_STATE_PRIVATE)
- pvalidate_pages(data);
-
- return vaddr;
-}
-
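
The 2M-versus-4K decision in __set_pages_state() above can be illustrated in isolation: for directly-mapped addresses a large entry is only emitted when the current address is PMD-aligned and at least PMD_SIZE of the range remains. The following standalone sketch models only that alignment/size rule; PAGE_SIZE_ and PMD_SIZE_ are stand-in constants, not the kernel macros.

#include <stdio.h>

#define PAGE_SIZE_	0x1000UL
#define PMD_SIZE_	0x200000UL

int main(void)
{
	unsigned long vaddr = 0x1ff000;		/* deliberately not 2M-aligned */
	unsigned long vaddr_end = vaddr + PMD_SIZE_ + 2 * PAGE_SIZE_;

	while (vaddr < vaddr_end) {
		/* use a 2M entry only when aligned and fully covered by the range */
		if (!(vaddr & (PMD_SIZE_ - 1)) && vaddr_end - vaddr >= PMD_SIZE_) {
			printf("2M entry at 0x%lx\n", vaddr);
			vaddr += PMD_SIZE_;
		} else {
			printf("4K entry at 0x%lx\n", vaddr);
			vaddr += PAGE_SIZE_;
		}
	}
	return 0;
}
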
-static void set_pages_state(unsigned long vaddr, unsigned long npages, int op)
-{
- struct snp_psc_desc desc;
- unsigned long vaddr_end;
-
- /* Use the MSR protocol when a GHCB is not available. */
- if (!boot_ghcb)
- return early_set_pages_state(vaddr, __pa(vaddr), npages, op);
-
- vaddr = vaddr & PAGE_MASK;
- vaddr_end = vaddr + (npages << PAGE_SHIFT);
-
- while (vaddr < vaddr_end)
- vaddr = __set_pages_state(&desc, vaddr, vaddr_end, op);
-}
-
-void snp_set_memory_shared(unsigned long vaddr, unsigned long npages)
-{
- if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- return;
-
- set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED);
-}
-
-void snp_set_memory_private(unsigned long vaddr, unsigned long npages)
-{
- if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- return;
-
- set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
-}
-
-void snp_accept_memory(phys_addr_t start, phys_addr_t end)
-{
- unsigned long vaddr, npages;
-
- if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- return;
-
- vaddr = (unsigned long)__va(start);
- npages = (end - start) >> PAGE_SHIFT;
-
- set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
-}
-
-static int snp_set_vmsa(void *va, bool vmsa)
-{
- u64 attrs;
-
- /*
- * Running at VMPL0 allows the kernel to change the VMSA bit for a page
- * using the RMPADJUST instruction. However, for the instruction to
- * succeed it must target the permissions of a lesser privileged
- * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
- * instruction in the AMD64 APM Volume 3).
- */
- attrs = 1;
- if (vmsa)
- attrs |= RMPADJUST_VMSA_PAGE_BIT;
-
- return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
-}
-
-#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK)
-#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK)
-#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK)
-
-#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2)
-#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3)
-
-static void *snp_alloc_vmsa_page(void)
-{
- struct page *p;
-
- /*
-	 * Allocate a VMSA page to work around the SNP erratum where the CPU will
-	 * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB)
-	 * collides with the RMP entry of the VMSA page. The recommended workaround
- * is to not use a large page.
- *
- * Allocate an 8k page which is also 8k-aligned.
- */
- p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
- if (!p)
- return NULL;
-
- split_page(p, 1);
-
- /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */
- __free_page(p);
-
- return page_address(p + 1);
-}
-
-static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
-{
- int err;
-
- err = snp_set_vmsa(vmsa, false);
- if (err)
- pr_err("clear VMSA page failed (%u), leaking page\n", err);
- else
- free_page((unsigned long)vmsa);
-}
-
-static int wakeup_cpu_via_vmgexit(u32 apic_id, unsigned long start_ip)
-{
- struct sev_es_save_area *cur_vmsa, *vmsa;
- struct ghcb_state state;
- unsigned long flags;
- struct ghcb *ghcb;
- u8 sipi_vector;
- int cpu, ret;
- u64 cr4;
-
- /*
-	 * The hypervisor SNP feature support check has happened earlier; just check
- * the AP_CREATION one here.
- */
- if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION))
- return -EOPNOTSUPP;
-
- /*
- * Verify the desired start IP against the known trampoline start IP
-	 * to catch any future trampolines that may be introduced and would
-	 * require a new protected guest entry point.
- */
- if (WARN_ONCE(start_ip != real_mode_header->trampoline_start,
- "Unsupported SNP start_ip: %lx\n", start_ip))
- return -EINVAL;
-
- /* Override start_ip with known protected guest start IP */
- start_ip = real_mode_header->sev_es_trampoline_start;
-
- /* Find the logical CPU for the APIC ID */
- for_each_present_cpu(cpu) {
- if (arch_match_cpu_phys_id(cpu, apic_id))
- break;
- }
- if (cpu >= nr_cpu_ids)
- return -EINVAL;
-
- cur_vmsa = per_cpu(sev_vmsa, cpu);
-
- /*
- * A new VMSA is created each time because there is no guarantee that
-	 * the current VMSA is the kernel's or that the vCPU is not running. If
-	 * an attempt were made to use the current VMSA with a running vCPU, a
- * #VMEXIT of that vCPU would wipe out all of the settings being done
- * here.
- */
- vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page();
- if (!vmsa)
- return -ENOMEM;
-
- /* CR4 should maintain the MCE value */
- cr4 = native_read_cr4() & X86_CR4_MCE;
-
- /* Set the CS value based on the start_ip converted to a SIPI vector */
- sipi_vector = (start_ip >> 12);
- vmsa->cs.base = sipi_vector << 12;
- vmsa->cs.limit = AP_INIT_CS_LIMIT;
- vmsa->cs.attrib = INIT_CS_ATTRIBS;
- vmsa->cs.selector = sipi_vector << 8;
-
- /* Set the RIP value based on start_ip */
- vmsa->rip = start_ip & 0xfff;
-
- /* Set AP INIT defaults as documented in the APM */
- vmsa->ds.limit = AP_INIT_DS_LIMIT;
- vmsa->ds.attrib = INIT_DS_ATTRIBS;
- vmsa->es = vmsa->ds;
- vmsa->fs = vmsa->ds;
- vmsa->gs = vmsa->ds;
- vmsa->ss = vmsa->ds;
-
- vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT;
- vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT;
- vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS;
- vmsa->idtr.limit = AP_INIT_IDTR_LIMIT;
- vmsa->tr.limit = AP_INIT_TR_LIMIT;
- vmsa->tr.attrib = INIT_TR_ATTRIBS;
-
- vmsa->cr4 = cr4;
- vmsa->cr0 = AP_INIT_CR0_DEFAULT;
- vmsa->dr7 = DR7_RESET_VALUE;
- vmsa->dr6 = AP_INIT_DR6_DEFAULT;
- vmsa->rflags = AP_INIT_RFLAGS_DEFAULT;
- vmsa->g_pat = AP_INIT_GPAT_DEFAULT;
- vmsa->xcr0 = AP_INIT_XCR0_DEFAULT;
- vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT;
- vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT;
- vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT;
-
- /* SVME must be set. */
- vmsa->efer = EFER_SVME;
-
- /*
- * Set the SNP-specific fields for this VMSA:
- * VMPL level
- * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
- */
- vmsa->vmpl = 0;
- vmsa->sev_features = sev_status >> 2;
-
- /* Switch the page over to a VMSA page now that it is initialized */
- ret = snp_set_vmsa(vmsa, true);
- if (ret) {
- pr_err("set VMSA page failed (%u)\n", ret);
- free_page((unsigned long)vmsa);
-
- return -EINVAL;
- }
-
- /* Issue VMGEXIT AP Creation NAE event */
- local_irq_save(flags);
-
- ghcb = __sev_get_ghcb(&state);
-
- vc_ghcb_invalidate(ghcb);
- ghcb_set_rax(ghcb, vmsa->sev_features);
- ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION);
- ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE);
- ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa));
-
- sev_es_wr_ghcb_msr(__pa(ghcb));
- VMGEXIT();
-
- if (!ghcb_sw_exit_info_1_is_valid(ghcb) ||
- lower_32_bits(ghcb->save.sw_exit_info_1)) {
- pr_err("SNP AP Creation error\n");
- ret = -EINVAL;
- }
-
- __sev_put_ghcb(&state);
-
- local_irq_restore(flags);
-
- /* Perform cleanup if there was an error */
- if (ret) {
- snp_cleanup_vmsa(vmsa);
- vmsa = NULL;
- }
-
- /* Free up any previous VMSA page */
- if (cur_vmsa)
- snp_cleanup_vmsa(cur_vmsa);
-
- /* Record the current VMSA page */
- per_cpu(sev_vmsa, cpu) = vmsa;
-
- return ret;
-}
-
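
As a side note on the CS/RIP setup above: the page-aligned part of start_ip becomes the SIPI vector (and thus the real-mode CS base and selector) while the low 12 bits become RIP, so CS base plus RIP reproduces start_ip. A minimal standalone check, with a made-up start_ip value used purely for illustration:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long start_ip = 0x9c040;	/* hypothetical trampoline IP */
	unsigned long sipi_vector = start_ip >> 12;
	unsigned long cs_base = sipi_vector << 12;
	unsigned long cs_selector = sipi_vector << 8;
	unsigned long rip = start_ip & 0xfff;

	/* the split loses no information */
	assert(cs_base + rip == start_ip);
	printf("CS base 0x%lx, selector 0x%lx, RIP 0x%lx\n",
	       cs_base, cs_selector, rip);
	return 0;
}
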
-void __init snp_set_wakeup_secondary_cpu(void)
-{
- if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- return;
-
- /*
- * Always set this override if SNP is enabled. This makes it the
- * required method to start APs under SNP. If the hypervisor does
- * not support AP creation, then no APs will be started.
- */
- apic_update_callback(wakeup_secondary_cpu, wakeup_cpu_via_vmgexit);
-}
-
-int __init sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
-{
- u16 startup_cs, startup_ip;
- phys_addr_t jump_table_pa;
- u64 jump_table_addr;
- u16 __iomem *jump_table;
-
- jump_table_addr = get_jump_table_addr();
-
- /* On UP guests there is no jump table so this is not a failure */
- if (!jump_table_addr)
- return 0;
-
- /* Check if AP Jump Table is page-aligned */
- if (jump_table_addr & ~PAGE_MASK)
- return -EINVAL;
-
- jump_table_pa = jump_table_addr & PAGE_MASK;
-
- startup_cs = (u16)(rmh->trampoline_start >> 4);
- startup_ip = (u16)(rmh->sev_es_trampoline_start -
- rmh->trampoline_start);
-
- jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE);
- if (!jump_table)
- return -EIO;
-
- writew(startup_ip, &jump_table[0]);
- writew(startup_cs, &jump_table[1]);
-
- iounmap(jump_table);
-
- return 0;
-}
-
-/*
- * This is needed by the OVMF UEFI firmware which will use whatever it finds in
- * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
- * runtime GHCBs used by the kernel are also mapped in the EFI page-table.
- */
-int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
-{
- struct sev_es_runtime_data *data;
- unsigned long address, pflags;
- int cpu;
- u64 pfn;
-
- if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
- return 0;
-
- pflags = _PAGE_NX | _PAGE_RW;
-
- for_each_possible_cpu(cpu) {
- data = per_cpu(runtime_data, cpu);
-
- address = __pa(&data->ghcb_page);
- pfn = address >> PAGE_SHIFT;
-
- if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
- return 1;
- }
-
- return 0;
-}
-
-static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- struct pt_regs *regs = ctxt->regs;
- enum es_result ret;
- u64 exit_info_1;
-
- /* Is it a WRMSR? */
- exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0;
-
- ghcb_set_rcx(ghcb, regs->cx);
- if (exit_info_1) {
- ghcb_set_rax(ghcb, regs->ax);
- ghcb_set_rdx(ghcb, regs->dx);
- }
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0);
-
- if ((ret == ES_OK) && (!exit_info_1)) {
- regs->ax = ghcb->save.rax;
- regs->dx = ghcb->save.rdx;
- }
-
- return ret;
-}
-
-static void snp_register_per_cpu_ghcb(void)
-{
- struct sev_es_runtime_data *data;
- struct ghcb *ghcb;
-
- data = this_cpu_read(runtime_data);
- ghcb = &data->ghcb_page;
-
- snp_register_ghcb_early(__pa(ghcb));
-}
-
-void setup_ghcb(void)
-{
- if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
- return;
-
- /*
- * Check whether the runtime #VC exception handler is active. It uses
- * the per-CPU GHCB page which is set up by sev_es_init_vc_handling().
- *
- * If SNP is active, register the per-CPU GHCB page so that the runtime
- * exception handler can use it.
- */
- if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) {
- if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- snp_register_per_cpu_ghcb();
-
- sev_cfg.ghcbs_initialized = true;
-
- return;
- }
-
- /*
- * Make sure the hypervisor talks a supported protocol.
- * This gets called only in the BSP boot phase.
- */
- if (!sev_es_negotiate_protocol())
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-
- /*
- * Clear the boot_ghcb. The first exception comes in before the bss
- * section is cleared.
- */
- memset(&boot_ghcb_page, 0, PAGE_SIZE);
-
- /* Alright - Make the boot-ghcb public */
- boot_ghcb = &boot_ghcb_page;
-
-	/* SNP guests require the GHCB GPA to be registered. */
- if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- snp_register_ghcb_early(__pa(&boot_ghcb_page));
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static void sev_es_ap_hlt_loop(void)
-{
- struct ghcb_state state;
- struct ghcb *ghcb;
-
- ghcb = __sev_get_ghcb(&state);
-
- while (true) {
- vc_ghcb_invalidate(ghcb);
- ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP);
- ghcb_set_sw_exit_info_1(ghcb, 0);
- ghcb_set_sw_exit_info_2(ghcb, 0);
-
- sev_es_wr_ghcb_msr(__pa(ghcb));
- VMGEXIT();
-
- /* Wakeup signal? */
- if (ghcb_sw_exit_info_2_is_valid(ghcb) &&
- ghcb->save.sw_exit_info_2)
- break;
- }
-
- __sev_put_ghcb(&state);
-}
-
-/*
- * Play_dead handler when running under SEV-ES. This is needed because
- * the hypervisor can't deliver an SIPI request to restart the AP.
- * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the
- * hypervisor wakes it up again.
- */
-static void sev_es_play_dead(void)
-{
- play_dead_common();
-
- /* IRQs now disabled */
-
- sev_es_ap_hlt_loop();
-
- /*
- * If we get here, the VCPU was woken up again. Jump to CPU
- * startup code to get it back online.
- */
- soft_restart_cpu();
-}
-#else /* CONFIG_HOTPLUG_CPU */
-#define sev_es_play_dead native_play_dead
-#endif /* CONFIG_HOTPLUG_CPU */
-
-#ifdef CONFIG_SMP
-static void __init sev_es_setup_play_dead(void)
-{
- smp_ops.play_dead = sev_es_play_dead;
-}
-#else
-static inline void sev_es_setup_play_dead(void) { }
-#endif
-
-static void __init alloc_runtime_data(int cpu)
-{
- struct sev_es_runtime_data *data;
-
- data = memblock_alloc(sizeof(*data), PAGE_SIZE);
- if (!data)
- panic("Can't allocate SEV-ES runtime data");
-
- per_cpu(runtime_data, cpu) = data;
-}
-
-static void __init init_ghcb(int cpu)
-{
- struct sev_es_runtime_data *data;
- int err;
-
- data = per_cpu(runtime_data, cpu);
-
- err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
- sizeof(data->ghcb_page));
- if (err)
- panic("Can't map GHCBs unencrypted");
-
- memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
-
- data->ghcb_active = false;
- data->backup_ghcb_active = false;
-}
-
-void __init sev_es_init_vc_handling(void)
-{
- int cpu;
-
- BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
-
- if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
- return;
-
- if (!sev_es_check_cpu_features())
- panic("SEV-ES CPU Features missing");
-
- /*
- * SNP is supported in v2 of the GHCB spec which mandates support for HV
- * features.
- */
- if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
- sev_hv_features = get_hv_features();
-
- if (!(sev_hv_features & GHCB_HV_FT_SNP))
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
- }
-
- /* Initialize per-cpu GHCB pages */
- for_each_possible_cpu(cpu) {
- alloc_runtime_data(cpu);
- init_ghcb(cpu);
- }
-
- sev_es_setup_play_dead();
-
- /* Secondary CPUs use the runtime #VC handler */
- initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
-}
-
-static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
-{
- int trapnr = ctxt->fi.vector;
-
- if (trapnr == X86_TRAP_PF)
- native_write_cr2(ctxt->fi.cr2);
-
- ctxt->regs->orig_ax = ctxt->fi.error_code;
- do_early_exception(ctxt->regs, trapnr);
-}
-
-static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
-{
- long *reg_array;
- int offset;
-
- reg_array = (long *)ctxt->regs;
- offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs);
-
- if (offset < 0)
- return NULL;
-
- offset /= sizeof(long);
-
- return reg_array + offset;
-}
-static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
- unsigned int bytes, bool read)
-{
- u64 exit_code, exit_info_1, exit_info_2;
- unsigned long ghcb_pa = __pa(ghcb);
- enum es_result res;
- phys_addr_t paddr;
- void __user *ref;
-
- ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs);
- if (ref == (void __user *)-1L)
- return ES_UNSUPPORTED;
-
- exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
-
- res = vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr);
- if (res != ES_OK) {
- if (res == ES_EXCEPTION && !read)
- ctxt->fi.error_code |= X86_PF_WRITE;
-
- return res;
- }
-
- exit_info_1 = paddr;
- /* Can never be greater than 8 */
- exit_info_2 = bytes;
-
- ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
-
- return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
-}
-
-/*
- * The MOVS instruction has two memory operands, which raises the
- * problem that it is not known whether the access to the source or the
- * destination caused the #VC exception (and hence whether an MMIO read
- * or write operation needs to be emulated).
- *
- * Instead of playing games with walking page-tables and trying to guess
- * whether the source or destination is an MMIO range, split the move
- * into two operations, a read and a write with only one memory operand.
- * This will cause a nested #VC exception on the MMIO address which can
- * then be handled.
- *
- * This implementation has the benefit that it also supports MOVS where
- * source _and_ destination are MMIO regions.
- *
- * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
- * rare operation. If it turns out to be a performance problem the split
- * operations can be moved to memcpy_fromio() and memcpy_toio().
- */
-static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
- unsigned int bytes)
-{
- unsigned long ds_base, es_base;
- unsigned char *src, *dst;
- unsigned char buffer[8];
- enum es_result ret;
- bool rep;
- int off;
-
- ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS);
- es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
-
- if (ds_base == -1L || es_base == -1L) {
- ctxt->fi.vector = X86_TRAP_GP;
- ctxt->fi.error_code = 0;
- return ES_EXCEPTION;
- }
-
- src = ds_base + (unsigned char *)ctxt->regs->si;
- dst = es_base + (unsigned char *)ctxt->regs->di;
-
- ret = vc_read_mem(ctxt, src, buffer, bytes);
- if (ret != ES_OK)
- return ret;
-
- ret = vc_write_mem(ctxt, dst, buffer, bytes);
- if (ret != ES_OK)
- return ret;
-
- if (ctxt->regs->flags & X86_EFLAGS_DF)
- off = -bytes;
- else
- off = bytes;
-
- ctxt->regs->si += off;
- ctxt->regs->di += off;
-
- rep = insn_has_rep_prefix(&ctxt->insn);
- if (rep)
- ctxt->regs->cx -= 1;
-
- if (!rep || ctxt->regs->cx == 0)
- return ES_OK;
- else
- return ES_RETRY;
-}
-
-static enum es_result vc_handle_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- struct insn *insn = &ctxt->insn;
- enum insn_mmio_type mmio;
- unsigned int bytes = 0;
- enum es_result ret;
- u8 sign_byte;
- long *reg_data;
-
- mmio = insn_decode_mmio(insn, &bytes);
- if (mmio == INSN_MMIO_DECODE_FAILED)
- return ES_DECODE_FAILED;
-
- if (mmio != INSN_MMIO_WRITE_IMM && mmio != INSN_MMIO_MOVS) {
- reg_data = insn_get_modrm_reg_ptr(insn, ctxt->regs);
- if (!reg_data)
- return ES_DECODE_FAILED;
- }
-
- if (user_mode(ctxt->regs))
- return ES_UNSUPPORTED;
-
- switch (mmio) {
- case INSN_MMIO_WRITE:
- memcpy(ghcb->shared_buffer, reg_data, bytes);
- ret = vc_do_mmio(ghcb, ctxt, bytes, false);
- break;
- case INSN_MMIO_WRITE_IMM:
- memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
- ret = vc_do_mmio(ghcb, ctxt, bytes, false);
- break;
- case INSN_MMIO_READ:
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- /* Zero-extend for 32-bit operation */
- if (bytes == 4)
- *reg_data = 0;
-
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
- case INSN_MMIO_READ_ZERO_EXTEND:
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- /* Zero extend based on operand size */
- memset(reg_data, 0, insn->opnd_bytes);
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
- case INSN_MMIO_READ_SIGN_EXTEND:
- ret = vc_do_mmio(ghcb, ctxt, bytes, true);
- if (ret)
- break;
-
- if (bytes == 1) {
- u8 *val = (u8 *)ghcb->shared_buffer;
-
- sign_byte = (*val & 0x80) ? 0xff : 0x00;
- } else {
- u16 *val = (u16 *)ghcb->shared_buffer;
-
- sign_byte = (*val & 0x8000) ? 0xff : 0x00;
- }
-
- /* Sign extend based on operand size */
- memset(reg_data, sign_byte, insn->opnd_bytes);
- memcpy(reg_data, ghcb->shared_buffer, bytes);
- break;
- case INSN_MMIO_MOVS:
- ret = vc_handle_mmio_movs(ctxt, bytes);
- break;
- default:
- ret = ES_UNSUPPORTED;
- break;
- }
-
- return ret;
-}
-
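
The sign-extending read path above follows a simple pattern: fill the destination with the sign byte first, then copy the low bytes over it. A minimal standalone model of that pattern, assuming a little-endian machine as on x86; the values are made up for illustration.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint8_t mmio_val = 0xf0;	/* pretend result of an 8-bit MMIO read */
	long reg = 0;
	uint8_t sign = (mmio_val & 0x80) ? 0xff : 0x00;

	memset(&reg, sign, sizeof(reg));		/* sign byte across the register */
	memcpy(&reg, &mmio_val, sizeof(mmio_val));	/* low byte(s) on top */

	printf("reg = %ld\n", reg);	/* prints -16 */
	return 0;
}
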
-static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
- long val, *reg = vc_insn_get_rm(ctxt);
- enum es_result ret;
-
- if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
- return ES_VMM_ERROR;
-
- if (!reg)
- return ES_DECODE_FAILED;
-
- val = *reg;
-
- /* Upper 32 bits must be written as zeroes */
- if (val >> 32) {
- ctxt->fi.vector = X86_TRAP_GP;
- ctxt->fi.error_code = 0;
- return ES_EXCEPTION;
- }
-
- /* Clear out other reserved bits and set bit 10 */
- val = (val & 0xffff23ffL) | BIT(10);
-
- /* Early non-zero writes to DR7 are not supported */
- if (!data && (val & ~DR7_RESET_VALUE))
- return ES_UNSUPPORTED;
-
- /* Using a value of 0 for ExitInfo1 means RAX holds the value */
- ghcb_set_rax(ghcb, val);
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (data)
- data->dr7 = val;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
- long *reg = vc_insn_get_rm(ctxt);
-
- if (sev_status & MSR_AMD64_SNP_DEBUG_SWAP)
- return ES_VMM_ERROR;
-
- if (!reg)
- return ES_DECODE_FAILED;
-
- if (data)
- *reg = data->dr7;
- else
- *reg = DR7_RESET_VALUE;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
-}
-
-static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
-{
- enum es_result ret;
-
- ghcb_set_rcx(ghcb, ctxt->regs->cx);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
- return ES_VMM_ERROR;
-
- ctxt->regs->ax = ghcb->save.rax;
- ctxt->regs->dx = ghcb->save.rdx;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_monitor(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- /*
- * Treat it as a NOP and do not leak a physical address to the
- * hypervisor.
- */
- return ES_OK;
-}
-
-static enum es_result vc_handle_mwait(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- /* Treat the same as MONITOR/MONITORX */
- return ES_OK;
-}
-
-static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- enum es_result ret;
-
- ghcb_set_rax(ghcb, ctxt->regs->ax);
- ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0);
-
- if (x86_platform.hyper.sev_es_hcall_prepare)
- x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
-
- ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
- if (ret != ES_OK)
- return ret;
-
- if (!ghcb_rax_is_valid(ghcb))
- return ES_VMM_ERROR;
-
- ctxt->regs->ax = ghcb->save.rax;
-
- /*
- * Call sev_es_hcall_finish() after regs->ax is already set.
- * This allows the hypervisor handler to overwrite it again if
- * necessary.
- */
- if (x86_platform.hyper.sev_es_hcall_finish &&
- !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
- return ES_VMM_ERROR;
-
- return ES_OK;
-}
-
-static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
- struct es_em_ctxt *ctxt)
-{
- /*
-	 * Calling exc_alignment_check() directly does not work, because it
- * enables IRQs and the GHCB is active. Forward the exception and call
- * it later from vc_forward_exception().
- */
- ctxt->fi.vector = X86_TRAP_AC;
- ctxt->fi.error_code = 0;
- return ES_EXCEPTION;
-}
-
-static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
- struct ghcb *ghcb,
- unsigned long exit_code)
-{
- enum es_result result;
-
- switch (exit_code) {
- case SVM_EXIT_READ_DR7:
- result = vc_handle_dr7_read(ghcb, ctxt);
- break;
- case SVM_EXIT_WRITE_DR7:
- result = vc_handle_dr7_write(ghcb, ctxt);
- break;
- case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
- result = vc_handle_trap_ac(ghcb, ctxt);
- break;
- case SVM_EXIT_RDTSC:
- case SVM_EXIT_RDTSCP:
- result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
- break;
- case SVM_EXIT_RDPMC:
- result = vc_handle_rdpmc(ghcb, ctxt);
- break;
- case SVM_EXIT_INVD:
- pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
- result = ES_UNSUPPORTED;
- break;
- case SVM_EXIT_CPUID:
- result = vc_handle_cpuid(ghcb, ctxt);
- break;
- case SVM_EXIT_IOIO:
- result = vc_handle_ioio(ghcb, ctxt);
- break;
- case SVM_EXIT_MSR:
- result = vc_handle_msr(ghcb, ctxt);
- break;
- case SVM_EXIT_VMMCALL:
- result = vc_handle_vmmcall(ghcb, ctxt);
- break;
- case SVM_EXIT_WBINVD:
- result = vc_handle_wbinvd(ghcb, ctxt);
- break;
- case SVM_EXIT_MONITOR:
- result = vc_handle_monitor(ghcb, ctxt);
- break;
- case SVM_EXIT_MWAIT:
- result = vc_handle_mwait(ghcb, ctxt);
- break;
- case SVM_EXIT_NPF:
- result = vc_handle_mmio(ghcb, ctxt);
- break;
- default:
- /*
- * Unexpected #VC exception
- */
- result = ES_UNSUPPORTED;
- }
-
- return result;
-}
-
-static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
-{
- long error_code = ctxt->fi.error_code;
- int trapnr = ctxt->fi.vector;
-
- ctxt->regs->orig_ax = ctxt->fi.error_code;
-
- switch (trapnr) {
- case X86_TRAP_GP:
- exc_general_protection(ctxt->regs, error_code);
- break;
- case X86_TRAP_UD:
- exc_invalid_op(ctxt->regs);
- break;
- case X86_TRAP_PF:
- write_cr2(ctxt->fi.cr2);
- exc_page_fault(ctxt->regs, error_code);
- break;
- case X86_TRAP_AC:
- exc_alignment_check(ctxt->regs, error_code);
- break;
- default:
- pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
- BUG();
- }
-}
-
-static __always_inline bool is_vc2_stack(unsigned long sp)
-{
- return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
-}
-
-static __always_inline bool vc_from_invalid_context(struct pt_regs *regs)
-{
- unsigned long sp, prev_sp;
-
- sp = (unsigned long)regs;
- prev_sp = regs->sp;
-
- /*
- * If the code was already executing on the VC2 stack when the #VC
- * happened, let it proceed to the normal handling routine. This way the
- * code executing on the VC2 stack can cause #VC exceptions to get handled.
- */
- return is_vc2_stack(sp) && !is_vc2_stack(prev_sp);
-}
-
-static bool vc_raw_handle_exception(struct pt_regs *regs, unsigned long error_code)
-{
- struct ghcb_state state;
- struct es_em_ctxt ctxt;
- enum es_result result;
- struct ghcb *ghcb;
- bool ret = true;
-
- ghcb = __sev_get_ghcb(&state);
-
- vc_ghcb_invalidate(ghcb);
- result = vc_init_em_ctxt(&ctxt, regs, error_code);
-
- if (result == ES_OK)
- result = vc_handle_exitcode(&ctxt, ghcb, error_code);
-
- __sev_put_ghcb(&state);
-
- /* Done - now check the result */
- switch (result) {
- case ES_OK:
- vc_finish_insn(&ctxt);
- break;
- case ES_UNSUPPORTED:
- pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
- error_code, regs->ip);
- ret = false;
- break;
- case ES_VMM_ERROR:
- pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
- error_code, regs->ip);
- ret = false;
- break;
- case ES_DECODE_FAILED:
- pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
- error_code, regs->ip);
- ret = false;
- break;
- case ES_EXCEPTION:
- vc_forward_exception(&ctxt);
- break;
- case ES_RETRY:
- /* Nothing to do */
- break;
- default:
- pr_emerg("Unknown result in %s():%d\n", __func__, result);
- /*
- * Emulating the instruction which caused the #VC exception
- * failed - can't continue so print debug information
- */
- BUG();
- }
-
- return ret;
-}
-
-static __always_inline bool vc_is_db(unsigned long error_code)
-{
- return error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB;
-}
-
-/*
- * Runtime #VC exception handler when raised from kernel mode. Runs in NMI mode
- * and will panic when an error happens.
- */
-DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication)
-{
- irqentry_state_t irq_state;
-
- /*
- * With the current implementation it is always possible to switch to a
- * safe stack because #VC exceptions only happen at known places, like
- * intercepted instructions or accesses to MMIO areas/IO ports. They can
- * also happen with code instrumentation when the hypervisor intercepts
- * #DB, but the critical paths are forbidden to be instrumented, so #DB
- * exceptions currently also only happen in safe places.
- *
- * But keep this here in case the noinstr annotations are violated due
-	 * to a bug elsewhere.
- */
- if (unlikely(vc_from_invalid_context(regs))) {
- instrumentation_begin();
- panic("Can't handle #VC exception from unsupported context\n");
- instrumentation_end();
- }
-
- /*
- * Handle #DB before calling into !noinstr code to avoid recursive #DB.
- */
- if (vc_is_db(error_code)) {
- exc_debug(regs);
- return;
- }
-
- irq_state = irqentry_nmi_enter(regs);
-
- instrumentation_begin();
-
- if (!vc_raw_handle_exception(regs, error_code)) {
- /* Show some debug info */
- show_regs(regs);
-
- /* Ask hypervisor to sev_es_terminate */
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-
- /* If that fails and we get here - just panic */
- panic("Returned from Terminate-Request to Hypervisor\n");
- }
-
- instrumentation_end();
- irqentry_nmi_exit(regs, irq_state);
-}
-
-/*
- * Runtime #VC exception handler when raised from user mode. Runs in IRQ mode
- * and will kill the current task with SIGBUS when an error happens.
- */
-DEFINE_IDTENTRY_VC_USER(exc_vmm_communication)
-{
- /*
- * Handle #DB before calling into !noinstr code to avoid recursive #DB.
- */
- if (vc_is_db(error_code)) {
- noist_exc_debug(regs);
- return;
- }
-
- irqentry_enter_from_user_mode(regs);
- instrumentation_begin();
-
- if (!vc_raw_handle_exception(regs, error_code)) {
- /*
- * Do not kill the machine if user-space triggered the
- * exception. Send SIGBUS instead and let user-space deal with
- * it.
- */
- force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
- }
-
- instrumentation_end();
- irqentry_exit_to_user_mode(regs);
-}
-
-bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
-{
- unsigned long exit_code = regs->orig_ax;
- struct es_em_ctxt ctxt;
- enum es_result result;
-
- vc_ghcb_invalidate(boot_ghcb);
-
- result = vc_init_em_ctxt(&ctxt, regs, exit_code);
- if (result == ES_OK)
- result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
-
- /* Done - now check the result */
- switch (result) {
- case ES_OK:
- vc_finish_insn(&ctxt);
- break;
- case ES_UNSUPPORTED:
- early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
- exit_code, regs->ip);
- goto fail;
- case ES_VMM_ERROR:
- early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
- exit_code, regs->ip);
- goto fail;
- case ES_DECODE_FAILED:
- early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
- exit_code, regs->ip);
- goto fail;
- case ES_EXCEPTION:
- vc_early_forward_exception(&ctxt);
- break;
- case ES_RETRY:
- /* Nothing to do */
- break;
- default:
- BUG();
- }
-
- return true;
-
-fail:
- show_regs(regs);
-
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ);
-}
-
-/*
- * Initial set up of SNP relies on information provided by the
- * Confidential Computing blob, which can be passed to the kernel
- * in the following ways, depending on how it is booted:
- *
- * - when booted via the boot/decompress kernel:
- * - via boot_params
- *
- * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH):
- * - via a setup_data entry, as defined by the Linux Boot Protocol
- *
- * Scan for the blob in that order.
- */
-static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp)
-{
- struct cc_blob_sev_info *cc_info;
-
- /* Boot kernel would have passed the CC blob via boot_params. */
- if (bp->cc_blob_address) {
- cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address;
- goto found_cc_info;
- }
-
- /*
- * If kernel was booted directly, without the use of the
- * boot/decompression kernel, the CC blob may have been passed via
- * setup_data instead.
- */
- cc_info = find_cc_blob_setup_data(bp);
- if (!cc_info)
- return NULL;
-
-found_cc_info:
- if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC)
- snp_abort();
-
- return cc_info;
-}
-
-bool __init snp_init(struct boot_params *bp)
-{
- struct cc_blob_sev_info *cc_info;
-
- if (!bp)
- return false;
-
- cc_info = find_cc_blob(bp);
- if (!cc_info)
- return false;
-
- setup_cpuid_table(cc_info);
-
- /*
- * The CC blob will be used later to access the secrets page. Cache
- * it here like the boot kernel does.
- */
- bp->cc_blob_address = (u32)(unsigned long)cc_info;
-
- return true;
-}
-
-void __init __noreturn snp_abort(void)
-{
- sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
-}
-
-static void dump_cpuid_table(void)
-{
- const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
- int i = 0;
-
- pr_info("count=%d reserved=0x%x reserved2=0x%llx\n",
- cpuid_table->count, cpuid_table->__reserved1, cpuid_table->__reserved2);
-
- for (i = 0; i < SNP_CPUID_COUNT_MAX; i++) {
- const struct snp_cpuid_fn *fn = &cpuid_table->fn[i];
-
- pr_info("index=%3d fn=0x%08x subfn=0x%08x: eax=0x%08x ebx=0x%08x ecx=0x%08x edx=0x%08x xcr0_in=0x%016llx xss_in=0x%016llx reserved=0x%016llx\n",
- i, fn->eax_in, fn->ecx_in, fn->eax, fn->ebx, fn->ecx,
- fn->edx, fn->xcr0_in, fn->xss_in, fn->__reserved);
- }
-}
-
-/*
- * It is useful from an auditing/testing perspective to provide an easy way
- * for the guest owner to know that the CPUID table has been initialized as
- * expected, but that initialization happens too early in boot to print any
- * sort of indicator, and there's not really any other good place to do it,
- * so do it here.
- */
-static int __init report_cpuid_table(void)
-{
- const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
-
- if (!cpuid_table->count)
- return 0;
-
- pr_info("Using SNP CPUID table, %d entries present.\n",
- cpuid_table->count);
-
- if (sev_cfg.debug)
- dump_cpuid_table();
-
- return 0;
-}
-arch_initcall(report_cpuid_table);
-
-static int __init init_sev_config(char *str)
-{
- char *s;
-
- while ((s = strsep(&str, ","))) {
- if (!strcmp(s, "debug")) {
- sev_cfg.debug = true;
- continue;
- }
-
- pr_info("SEV command-line option '%s' was not recognized\n", s);
- }
-
- return 1;
-}
-__setup("sev=", init_sev_config);
-
-int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio)
-{
- struct ghcb_state state;
- struct es_em_ctxt ctxt;
- unsigned long flags;
- struct ghcb *ghcb;
- int ret;
-
- rio->exitinfo2 = SEV_RET_NO_FW_CALL;
-
- /*
- * __sev_get_ghcb() needs to run with IRQs disabled because it is using
- * a per-CPU GHCB.
- */
- local_irq_save(flags);
-
- ghcb = __sev_get_ghcb(&state);
- if (!ghcb) {
- ret = -EIO;
- goto e_restore_irq;
- }
-
- vc_ghcb_invalidate(ghcb);
-
- if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
- ghcb_set_rax(ghcb, input->data_gpa);
- ghcb_set_rbx(ghcb, input->data_npages);
- }
-
- ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa);
- if (ret)
- goto e_put;
-
- rio->exitinfo2 = ghcb->save.sw_exit_info_2;
- switch (rio->exitinfo2) {
- case 0:
- break;
-
- case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_BUSY):
- ret = -EAGAIN;
- break;
-
- case SNP_GUEST_VMM_ERR(SNP_GUEST_VMM_ERR_INVALID_LEN):
- /* Number of expected pages are returned in RBX */
- if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) {
- input->data_npages = ghcb_get_rbx(ghcb);
- ret = -ENOSPC;
- break;
- }
- fallthrough;
- default:
- ret = -EIO;
- break;
- }
-
-e_put:
- __sev_put_ghcb(&state);
-e_restore_irq:
- local_irq_restore(flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(snp_issue_guest_request);
-
-static struct platform_device sev_guest_device = {
- .name = "sev-guest",
- .id = -1,
-};
-
-static int __init snp_init_platform_device(void)
-{
- struct sev_guest_platform_data data;
- u64 gpa;
-
- if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
- return -ENODEV;
-
- gpa = get_secrets_page();
- if (!gpa)
- return -ENODEV;
-
- data.secrets_gpa = gpa;
- if (platform_device_add_data(&sev_guest_device, &data, sizeof(data)))
- return -ENODEV;
-
- if (platform_device_register(&sev_guest_device))
- return -ENODEV;
-
- pr_info("SNP guest platform device initialized.\n");
- return 0;
-}
-device_initcall(snp_init_platform_device);
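
The switch above maps the GHCB's sw_exit_info_2 onto distinct error codes so a caller can tell a busy hypervisor (-EAGAIN) apart from an undersized certificate buffer (-ENOSPC, with the required page count written back into input->data_npages). A minimal, hypothetical caller-side sketch of handling those two cases; grow_cert_buffer() is a placeholder, not a kernel API, and a real caller would also rate-limit the retries:

static int issue_ext_guest_request(struct snp_req_data *input,
				   struct snp_guest_request_ioctl *rio)
{
	int ret;

	do {
		ret = snp_issue_guest_request(SVM_VMGEXIT_EXT_GUEST_REQUEST,
					      input, rio);
		if (ret == -ENOSPC) {
			/*
			 * The hypervisor reported it needs input->data_npages
			 * pages for the certificate data; enlarge the buffer
			 * before retrying.
			 */
			if (grow_cert_buffer(input, input->data_npages))
				return -ENOMEM;
		}
	} while (ret == -EAGAIN || ret == -ENOSPC);

	return ret;
}
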
diff --git a/arch/x86/kernel/sev_verify_cbit.S b/arch/x86/kernel/sev_verify_cbit.S
index 3355e27c69eb..1ab65f6c6ae7 100644
--- a/arch/x86/kernel/sev_verify_cbit.S
+++ b/arch/x86/kernel/sev_verify_cbit.S
@@ -77,7 +77,7 @@ SYM_FUNC_START(sev_verify_cbit)
* The check failed, prevent any forward progress to prevent ROP
* attacks, invalidate the stack and go into a hlt loop.
*/
- xorq %rsp, %rsp
+ xorl %esp, %esp
subq $0x1000, %rsp
2: hlt
jmp 2b
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 59e15dd8d0f8..059685612362 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -163,8 +163,8 @@ static int shstk_setup(void)
if (features_enabled(ARCH_SHSTK_SHSTK))
return 0;
- /* Also not supported for 32 bit and x32 */
- if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_32bit_syscall())
+ /* Also not supported for 32 bit */
+ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
return -EOPNOTSUPP;
size = adjust_shstk_size(0);
@@ -577,3 +577,19 @@ long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
return wrss_control(true);
return -EINVAL;
}
+
+int shstk_update_last_frame(unsigned long val)
+{
+ unsigned long ssp;
+
+ if (!features_enabled(ARCH_SHSTK_SHSTK))
+ return 0;
+
+ ssp = get_user_shstk_addr();
+ return write_user_shstk_64((u64 __user *)ssp, (u64)val);
+}
+
+bool shstk_is_enabled(void)
+{
+ return features_enabled(ARCH_SHSTK_SHSTK);
+}
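
Both helpers exist for the uretprobe changes later in this series: once the return address on the regular stack has been rewritten, the copy on the shadow stack has to be rewritten too, otherwise the matching RET would compare the two addresses and raise a control-protection fault. A condensed, hypothetical sketch of the calling pattern; hijack_return_address() is a placeholder standing in for the arch_uretprobe_hijack_return_addr() hunk further down:

static int hijack_return_address(unsigned long trampoline_vaddr)
{
	/* ... the return address on the regular stack was just rewritten ... */

	if (shstk_update_last_frame(trampoline_vaddr)) {
		/* Shadow stack and regular stack now disagree: give up. */
		force_sig(SIGSEGV);
		return -1;
	}
	return 0;
}
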
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 65fe2094da59..5f441039b572 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -27,6 +27,7 @@
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/syscalls.h>
+#include <linux/rseq.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -60,6 +61,24 @@ static inline int is_x32_frame(struct ksignal *ksig)
}
/*
+ * Enable all pkeys temporarily, so as to ensure that both the current
+ * execution stack as well as the alternate signal stack are writeable.
+ * The application can use any of the available pkeys to protect the
+ * alternate signal stack, and we don't know which one it is, so enable
+ * all. The PKRU register will be reset to init_pkru later in the flow,
+ * in fpu__clear_user_states(), and it is the application's responsibility
+ * to enable the appropriate pkey as the first step in the signal handler
+ * so that the handler does not segfault.
+ */
+static inline u32 sig_prepare_pkru(void)
+{
+ u32 orig_pkru = read_pkru();
+
+ write_pkru(0);
+ return orig_pkru;
+}
+
+/*
* Set up a signal frame.
*/
@@ -83,6 +102,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
unsigned long math_size = 0;
unsigned long sp = regs->sp;
unsigned long buf_fx = 0;
+ u32 pkru;
/* redzone */
if (!ia32_frame)
@@ -137,9 +157,17 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
return (void __user *)-1L;
}
+ /* Update PKRU to enable access to the alternate signal stack. */
+ pkru = sig_prepare_pkru();
/* save i387 and extended state */
- if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size))
+ if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) {
+ /*
+ * Restore PKRU to the original, user-defined value; disable
+ * extra pkeys enabled for the alternate signal stack, if any.
+ */
+ write_pkru(pkru);
return (void __user *)-1L;
+ }
return (void __user *)sp;
}
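
The write_pkru(0) in sig_prepare_pkru() works because PKRU holds two bits per protection key, an access-disable bit and a write-disable bit, so an all-zero value grants full access for every key. A standalone sketch of that bit layout; the helper names are illustrative, not kernel API:

#include <stdint.h>
#include <stdio.h>

/* PKRU: bit 2k disables access for key k, bit 2k+1 disables writes. */
static uint32_t pkru_deny_access(uint32_t pkru, int pkey)
{
	return pkru | (1u << (2 * pkey));
}

static uint32_t pkru_deny_write(uint32_t pkru, int pkey)
{
	return pkru | (1u << (2 * pkey + 1));
}

int main(void)
{
	uint32_t pkru = 0;			/* 0 == every key fully accessible */

	pkru = pkru_deny_write(pkru, 1);	/* key 1: read-only */
	pkru = pkru_deny_access(pkru, 2);	/* key 2: no access */

	printf("pkru = %#010x\n", pkru);	/* 0x00000018 */
	return 0;
}
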
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index c12624bc82a3..ef654530bf5a 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -34,7 +34,7 @@
#include <asm/gsseg.h>
#ifdef CONFIG_IA32_EMULATION
-#include <asm/ia32_unistd.h>
+#include <asm/unistd_32_ia32.h>
static inline void reload_segments(struct sigcontext_32 *sc)
{
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 23d8aaf8d9fd..ee9453891901 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -260,13 +260,13 @@ SYSCALL_DEFINE0(rt_sigreturn)
set_current_blocked(&set);
- if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
+ if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
- if (restore_signal_shadow_stack())
+ if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
- if (restore_altstack(&frame->uc.uc_stack))
+ if (restore_signal_shadow_stack())
goto badframe;
return regs->ax;
@@ -315,6 +315,9 @@ int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
uc_flags = frame_uc_flags(regs);
+ if (setup_signal_shadow_stack(ksig))
+ return -EFAULT;
+
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
@@ -377,6 +380,9 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
if (!restore_sigcontext(regs, &frame->uc.uc_mcontext, uc_flags))
goto badframe;
+ if (restore_signal_shadow_stack())
+ goto badframe;
+
if (compat_restore_altstack(&frame->uc.uc_stack))
goto badframe;
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 96a771f9f930..18266cc3d98c 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -148,14 +148,16 @@ static int register_stop_handler(void)
static void native_stop_other_cpus(int wait)
{
- unsigned int cpu = smp_processor_id();
+ unsigned int old_cpu, this_cpu;
unsigned long flags, timeout;
if (reboot_force)
return;
/* Only proceed if this is the first CPU to reach this code */
- if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
+ old_cpu = -1;
+ this_cpu = smp_processor_id();
+ if (!atomic_try_cmpxchg(&stopping_cpu, &old_cpu, this_cpu))
return;
/* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
@@ -186,7 +188,7 @@ static void native_stop_other_cpus(int wait)
* NMIs.
*/
cpumask_copy(&cpus_stop_mask, cpu_online_mask);
- cpumask_clear_cpu(cpu, &cpus_stop_mask);
+ cpumask_clear_cpu(this_cpu, &cpus_stop_mask);
if (!cpumask_empty(&cpus_stop_mask)) {
apic_send_IPI_allbutself(REBOOT_VECTOR);
@@ -210,6 +212,8 @@ static void native_stop_other_cpus(int wait)
* CPUs to stop.
*/
if (!smp_no_nmi_ipi && !register_stop_handler()) {
+ unsigned int cpu;
+
pr_emerg("Shutting down cpus with NMI\n");
for_each_cpu(cpu, &cpus_stop_mask)
@@ -282,7 +286,7 @@ struct smp_ops smp_ops = {
.smp_cpus_done = native_smp_cpus_done,
.stop_other_cpus = native_stop_other_cpus,
-#if defined(CONFIG_KEXEC_CORE)
+#if defined(CONFIG_CRASH_DUMP)
.crash_stop_other_cpus = kdump_nmi_shootdown_cpus,
#endif
.smp_send_reschedule = native_smp_send_reschedule,
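
The stopping_cpu change above trades atomic_cmpxchg() for atomic_try_cmpxchg(), which returns a boolean and writes the value it actually observed back into the "expected" argument, so the caller no longer compares against -1 by hand. A standalone C11 sketch of the same first-caller-wins pattern; the names are illustrative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int stopping_cpu = -1;

/* True only for the first CPU that claims the stop operation. */
static bool claim_stop(int this_cpu)
{
	int old_cpu = -1;

	/* Analogous to atomic_try_cmpxchg(&stopping_cpu, &old_cpu, this_cpu). */
	return atomic_compare_exchange_strong(&stopping_cpu, &old_cpu, this_cpu);
}

int main(void)
{
	printf("cpu 3 claims: %d\n", claim_stop(3));	/* 1: first caller wins */
	printf("cpu 5 claims: %d\n", claim_stop(5));	/* 0: already claimed */
	return 0;
}
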
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2cc2aa120b4b..c10850ae6f09 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -60,9 +60,11 @@
#include <linux/stackprotector.h>
#include <linux/cpuhotplug.h>
#include <linux/mc146818rtc.h>
+#include <linux/acpi.h>
#include <asm/acpi.h>
#include <asm/cacheinfo.h>
+#include <asm/cpuid.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
@@ -101,10 +103,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map);
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
EXPORT_PER_CPU_SYMBOL(cpu_die_map);
-/* Per CPU bogomips and other parameters */
-DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
-EXPORT_PER_CPU_SYMBOL(cpu_info);
-
/* CPUs which are the primary SMT threads */
struct cpumask __cpu_primary_thread_mask __read_mostly;
@@ -125,25 +123,6 @@ struct mwait_cpu_dead {
*/
static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
-/* Logical package management. */
-struct logical_maps {
- u32 phys_pkg_id;
- u32 phys_die_id;
- u32 logical_pkg_id;
- u32 logical_die_id;
-};
-
-/* Temporary workaround until the full topology mechanics is in place */
-static DEFINE_PER_CPU_READ_MOSTLY(struct logical_maps, logical_maps) = {
- .phys_pkg_id = U32_MAX,
- .phys_die_id = U32_MAX,
-};
-
-unsigned int __max_logical_packages __read_mostly;
-EXPORT_SYMBOL(__max_logical_packages);
-static unsigned int logical_packages __read_mostly;
-static unsigned int logical_die __read_mostly;
-
/* Maximum number of SMT threads on any online core */
int __read_mostly __max_smt_threads = 1;
@@ -269,7 +248,7 @@ static void notrace start_secondary(void *unused)
__flush_tlb_all();
}
- cpu_init_exception_handling();
+ cpu_init_exception_handling(false);
/*
* Load the microcode before reaching the AP alive synchronization
@@ -336,106 +315,6 @@ static void notrace start_secondary(void *unused)
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
-/**
- * topology_phys_to_logical_pkg - Map a physical package id to a logical
- * @phys_pkg: The physical package id to map
- *
- * Returns logical package id or -1 if not found
- */
-int topology_phys_to_logical_pkg(unsigned int phys_pkg)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- if (per_cpu(logical_maps.phys_pkg_id, cpu) == phys_pkg)
- return per_cpu(logical_maps.logical_pkg_id, cpu);
- }
- return -1;
-}
-EXPORT_SYMBOL(topology_phys_to_logical_pkg);
-
-/**
- * topology_phys_to_logical_die - Map a physical die id to logical
- * @die_id: The physical die id to map
- * @cur_cpu: The CPU for which the mapping is done
- *
- * Returns logical die id or -1 if not found
- */
-static int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
-{
- int cpu, proc_id = cpu_data(cur_cpu).topo.pkg_id;
-
- for_each_possible_cpu(cpu) {
- if (per_cpu(logical_maps.phys_pkg_id, cpu) == proc_id &&
- per_cpu(logical_maps.phys_die_id, cpu) == die_id)
- return per_cpu(logical_maps.logical_die_id, cpu);
- }
- return -1;
-}
-
-/**
- * topology_update_package_map - Update the physical to logical package map
- * @pkg: The physical package id as retrieved via CPUID
- * @cpu: The cpu for which this is updated
- */
-int topology_update_package_map(unsigned int pkg, unsigned int cpu)
-{
- int new;
-
- /* Already available somewhere? */
- new = topology_phys_to_logical_pkg(pkg);
- if (new >= 0)
- goto found;
-
- new = logical_packages++;
- if (new != pkg) {
- pr_info("CPU %u Converting physical %u to logical package %u\n",
- cpu, pkg, new);
- }
-found:
- per_cpu(logical_maps.phys_pkg_id, cpu) = pkg;
- per_cpu(logical_maps.logical_pkg_id, cpu) = new;
- cpu_data(cpu).topo.logical_pkg_id = new;
- return 0;
-}
-/**
- * topology_update_die_map - Update the physical to logical die map
- * @die: The die id as retrieved via CPUID
- * @cpu: The cpu for which this is updated
- */
-int topology_update_die_map(unsigned int die, unsigned int cpu)
-{
- int new;
-
- /* Already available somewhere? */
- new = topology_phys_to_logical_die(die, cpu);
- if (new >= 0)
- goto found;
-
- new = logical_die++;
- if (new != die) {
- pr_info("CPU %u Converting physical %u to logical die %u\n",
- cpu, die, new);
- }
-found:
- per_cpu(logical_maps.phys_die_id, cpu) = die;
- per_cpu(logical_maps.logical_die_id, cpu) = new;
- cpu_data(cpu).topo.logical_die_id = new;
- return 0;
-}
-
-static void __init smp_store_boot_cpu_info(void)
-{
- int id = 0; /* CPU 0 */
- struct cpuinfo_x86 *c = &cpu_data(id);
-
- *c = boot_cpu_data;
- c->cpu_index = id;
- topology_update_package_map(c->topo.pkg_id, id);
- topology_update_die_map(c->topo.die_id, id);
- c->initialized = true;
-}
-
/*
* The bootstrap kernel entry code has set these up. Save them for
* a given CPU
@@ -488,6 +367,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
if (c->topo.pkg_id == o->topo.pkg_id &&
c->topo.die_id == o->topo.die_id &&
+ c->topo.amd_node_id == o->topo.amd_node_id &&
per_cpu_llc_id(cpu1) == per_cpu_llc_id(cpu2)) {
if (c->topo.core_id == o->topo.core_id)
return topology_sane(c, o, "smt");
@@ -509,10 +389,13 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- if (c->topo.pkg_id == o->topo.pkg_id &&
- c->topo.die_id == o->topo.die_id)
- return true;
- return false;
+ if (c->topo.pkg_id != o->topo.pkg_id || c->topo.die_id != o->topo.die_id)
+ return false;
+
+ if (cpu_feature_enabled(X86_FEATURE_TOPOEXT) && topology_amd_nodes_per_pkg() > 1)
+ return c->topo.amd_node_id == o->topo.amd_node_id;
+
+ return true;
}
static bool match_l2c(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
@@ -557,9 +440,9 @@ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
*/
static const struct x86_cpu_id intel_cod_cpu[] = {
- X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */
- X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */
- X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */
+ X86_MATCH_VFM(INTEL_HASWELL_X, 0), /* COD */
+ X86_MATCH_VFM(INTEL_BROADWELL_X, 0), /* COD */
+ X86_MATCH_VFM(INTEL_ANY, 1), /* SNC */
{}
};
@@ -600,12 +483,6 @@ static int x86_core_flags(void)
return cpu_core_flags() | x86_sched_itmt_flags();
}
#endif
-#ifdef CONFIG_SCHED_SMT
-static int x86_smt_flags(void)
-{
- return cpu_smt_flags();
-}
-#endif
#ifdef CONFIG_SCHED_CLUSTER
static int x86_cluster_flags(void)
{
@@ -613,14 +490,6 @@ static int x86_cluster_flags(void)
}
#endif
-static int x86_die_flags(void)
-{
- if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
- return x86_sched_itmt_flags();
-
- return 0;
-}
-
/*
* Set if a package/die has multiple NUMA nodes inside.
* AMD Magny-Cours, Intel Cluster-on-Die, and Intel
@@ -636,7 +505,7 @@ static void __init build_sched_topology(void)
#ifdef CONFIG_SCHED_SMT
x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
+ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)
};
#endif
#ifdef CONFIG_SCHED_CLUSTER
@@ -656,7 +525,7 @@ static void __init build_sched_topology(void)
*/
if (!x86_has_numa_in_package) {
x86_topology[i++] = (struct sched_domain_topology_level){
- cpu_cpu_mask, x86_die_flags, SD_INIT_NAME(PKG)
+ cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG)
};
}
@@ -670,8 +539,8 @@ static void __init build_sched_topology(void)
void set_cpu_sibling_map(int cpu)
{
- bool has_smt = smp_num_siblings > 1;
- bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1;
+ bool has_smt = __max_threads_per_core > 1;
+ bool has_mp = has_smt || topology_num_cores_per_package() > 1;
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct cpuinfo_x86 *o;
int i, threads;
@@ -757,6 +626,7 @@ const struct cpumask *cpu_clustergroup_mask(int cpu)
{
return cpu_l2c_shared_mask(cpu);
}
+EXPORT_SYMBOL_GPL(cpu_clustergroup_mask);
static void impress_friends(void)
{
@@ -1067,9 +937,13 @@ int native_kick_ap(unsigned int cpu, struct task_struct *tidle)
pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
- if (apicid == BAD_APICID || !physid_isset(apicid, phys_cpu_present_map) ||
- !apic_id_valid(apicid)) {
- pr_err("%s: bad cpu %d\n", __func__, cpu);
+ if (apicid == BAD_APICID || !apic_id_valid(apicid)) {
+ pr_err("CPU %u has invalid APIC ID %x. Aborting bringup\n", cpu, apicid);
+ return -EINVAL;
+ }
+
+ if (!test_bit(apicid, phys_cpu_present_map)) {
+ pr_err("CPU %u APIC ID %x is not present. Aborting bringup\n", cpu, apicid);
return -EINVAL;
}
@@ -1138,54 +1012,41 @@ static __init void disable_smp(void)
pr_info("SMP disabled\n");
disable_ioapic_support();
+ topology_reset_possible_cpus_up();
- init_cpu_present(cpumask_of(0));
- init_cpu_possible(cpumask_of(0));
-
- if (smp_found_config)
- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
- else
- physid_set_mask_of_physid(0, &phys_cpu_present_map);
cpumask_set_cpu(0, topology_sibling_cpumask(0));
cpumask_set_cpu(0, topology_core_cpumask(0));
cpumask_set_cpu(0, topology_die_cpumask(0));
}
-static void __init smp_cpu_index_default(void)
-{
- int i;
- struct cpuinfo_x86 *c;
-
- for_each_possible_cpu(i) {
- c = &cpu_data(i);
- /* mark all to hotplug */
- c->cpu_index = nr_cpu_ids;
- }
-}
-
void __init smp_prepare_cpus_common(void)
{
- unsigned int i;
+ unsigned int cpu, node;
- smp_cpu_index_default();
+ /* Mark all except the boot CPU as hotpluggable */
+ for_each_possible_cpu(cpu) {
+ if (cpu)
+ per_cpu(cpu_info.cpu_index, cpu) = nr_cpu_ids;
+ }
- /*
- * Setup boot CPU information
- */
- smp_store_boot_cpu_info(); /* Final full version of the data */
- mb();
+ for_each_possible_cpu(cpu) {
+ node = cpu_to_node(cpu);
- for_each_possible_cpu(i) {
- zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
- zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
+ zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_die_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_llc_shared_map, cpu), GFP_KERNEL, node);
+ zalloc_cpumask_var_node(&per_cpu(cpu_l2c_shared_map, cpu), GFP_KERNEL, node);
}
set_cpu_sibling_map(0);
}
+void __init smp_prepare_boot_cpu(void)
+{
+ smp_ops.smp_prepare_boot_cpu();
+}
+
#ifdef CONFIG_X86_64
/* Establish whether parallel bringup can be supported. */
bool __init arch_cpuhp_init_parallel_bringup(void)
@@ -1264,102 +1125,16 @@ void __init native_smp_prepare_boot_cpu(void)
native_pv_lock_init();
}
-void __init calculate_max_logical_packages(void)
-{
- int ncpus;
-
- /*
- * Today neither Intel nor AMD support heterogeneous systems so
- * extrapolate the boot cpu's data to all packages.
- */
- ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
- __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus);
- pr_info("Max logical packages: %u\n", __max_logical_packages);
-}
-
void __init native_smp_cpus_done(unsigned int max_cpus)
{
pr_debug("Boot done\n");
- calculate_max_logical_packages();
build_sched_topology();
nmi_selftest();
impress_friends();
cache_aps_init();
}
-static int __initdata setup_possible_cpus = -1;
-static int __init _setup_possible_cpus(char *str)
-{
- get_option(&str, &setup_possible_cpus);
- return 0;
-}
-early_param("possible_cpus", _setup_possible_cpus);
-
-
-/*
- * cpu_possible_mask should be static, it cannot change as cpu's
- * are onlined, or offlined. The reason is per-cpu data-structures
- * are allocated by some modules at init time, and don't expect to
- * do this dynamically on cpu arrival/departure.
- * cpu_present_mask on the other hand can change dynamically.
- * In case when cpu_hotplug is not compiled, then we resort to current
- * behaviour, which is cpu_possible == cpu_present.
- * - Ashok Raj
- *
- * Three ways to find out the number of additional hotplug CPUs:
- * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
- * - The user can overwrite it with possible_cpus=NUM
- * - Otherwise don't reserve additional CPUs.
- * We do this because additional CPUs waste a lot of memory.
- * -AK
- */
-__init void prefill_possible_map(void)
-{
- int i, possible;
-
- i = setup_max_cpus ?: 1;
- if (setup_possible_cpus == -1) {
- possible = num_processors;
-#ifdef CONFIG_HOTPLUG_CPU
- if (setup_max_cpus)
- possible += disabled_cpus;
-#else
- if (possible > i)
- possible = i;
-#endif
- } else
- possible = setup_possible_cpus;
-
- total_cpus = max_t(int, possible, num_processors + disabled_cpus);
-
- /* nr_cpu_ids could be reduced via nr_cpus= */
- if (possible > nr_cpu_ids) {
- pr_warn("%d Processors exceeds NR_CPUS limit of %u\n",
- possible, nr_cpu_ids);
- possible = nr_cpu_ids;
- }
-
-#ifdef CONFIG_HOTPLUG_CPU
- if (!setup_max_cpus)
-#endif
- if (possible > i) {
- pr_warn("%d Processors exceeds max_cpus limit of %u\n",
- possible, setup_max_cpus);
- possible = i;
- }
-
- set_nr_cpu_ids(possible);
-
- pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
- possible, max_t(int, possible - num_processors, 0));
-
- reset_cpu_possible_mask();
-
- for (i = 0; i < possible; i++)
- set_cpu_possible(i, true);
-}
-
/* correctly size the local cpu masks */
void __init setup_cpu_local_masks(void)
{
@@ -1502,10 +1277,8 @@ static inline void mwait_play_dead(void)
return;
if (!this_cpu_has(X86_FEATURE_CLFLUSH))
return;
- if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
- return;
- eax = CPUID_MWAIT_LEAF;
+ eax = CPUID_LEAF_MWAIT;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
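
The new match_die() above only consults the AMD node ID when X86_FEATURE_TOPOEXT is available and a package actually carries more than one node (topology_amd_nodes_per_pkg() > 1). A standalone sketch of that decision with the topology reduced to plain integers; all names here are illustrative:

#include <stdbool.h>
#include <stdio.h>

struct topo {
	int pkg_id;
	int die_id;
	int amd_node_id;
};

static bool match_die(const struct topo *c, const struct topo *o,
		      bool has_topoext, int nodes_per_pkg)
{
	if (c->pkg_id != o->pkg_id || c->die_id != o->die_id)
		return false;

	/* Multi-node AMD packages: the CPUs must also share the node. */
	if (has_topoext && nodes_per_pkg > 1)
		return c->amd_node_id == o->amd_node_id;

	return true;
}

int main(void)
{
	struct topo a = { 0, 0, 0 }, b = { 0, 0, 1 };

	printf("%d\n", match_die(&a, &b, true, 2));	/* 0: different nodes */
	printf("%d\n", match_die(&a, &b, false, 1));	/* 1: node id not used */
	return 0;
}
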
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 77a9316da435..9e51242ed125 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -172,7 +172,15 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail)
}
EXPORT_SYMBOL_GPL(arch_static_call_transform);
-#ifdef CONFIG_RETHUNK
+noinstr void __static_call_update_early(void *tramp, void *func)
+{
+ BUG_ON(system_state != SYSTEM_BOOTING);
+ BUG_ON(static_call_initialized);
+ __text_gen_insn(tramp, JMP32_INSN_OPCODE, tramp, func, JMP32_INSN_SIZE);
+ sync_core();
+}
+
+#ifdef CONFIG_MITIGATION_RETHUNK
/*
* This is called by apply_returns() to fix up static call trampolines,
* specifically ARCH_DEFINE_STATIC_CALL_NULL_TRAMP which is recorded as
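
__static_call_update_early() above patches the trampoline with a 5-byte JMP32, i.e. opcode 0xE9 followed by a 32-bit displacement that is relative to the end of the instruction. A standalone sketch of assembling such a jump; the addresses are made up:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define JMP32_INSN_SIZE 5

/* Emit "jmp rel32" at address 'ip' targeting 'dest'. */
static void gen_jmp32(uint8_t *buf, uint64_t ip, uint64_t dest)
{
	int32_t rel = (int32_t)(dest - (ip + JMP32_INSN_SIZE));

	buf[0] = 0xE9;				/* JMP rel32 opcode */
	memcpy(&buf[1], &rel, sizeof(rel));	/* little-endian displacement */
}

int main(void)
{
	uint8_t insn[JMP32_INSN_SIZE];

	gen_jmp32(insn, 0x1000, 0x1020);	/* rel32 = 0x1020 - 0x1005 = 0x1b */
	for (int i = 0; i < JMP32_INSN_SIZE; i++)
		printf("%02x ", insn[i]);	/* e9 1b 00 00 00 */
	printf("\n");
	return 0;
}
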
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 8e2b2552b5ee..3e2952679b88 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -6,7 +6,9 @@
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/ptrace.h>
+
#include <asm/desc.h>
+#include <asm/debugreg.h>
#include <asm/mmu_context.h>
unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index c783aeb37dce..776ae6fa7f2d 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -18,6 +18,7 @@
#include <linux/random.h>
#include <linux/uaccess.h>
#include <linux/elf.h>
+#include <linux/hugetlb.h>
#include <asm/elf.h>
#include <asm/ia32.h>
@@ -25,8 +26,10 @@
/*
* Align a virtual address to avoid aliasing in the I$ on AMD F15h.
*/
-static unsigned long get_align_mask(void)
+static unsigned long get_align_mask(struct file *filp)
{
+ if (filp && is_file_hugepages(filp))
+ return huge_page_mask_align(filp);
/* handle 32- and 64-bit case with a single conditional */
if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
return 0;
@@ -49,14 +52,7 @@ static unsigned long get_align_mask(void)
*/
static unsigned long get_align_bits(void)
{
- return va_align.bits & get_align_mask();
-}
-
-unsigned long align_vdso_addr(unsigned long addr)
-{
- unsigned long align_mask = get_align_mask();
- addr = (addr + align_mask) & ~align_mask;
- return addr | get_align_bits();
+ return va_align.bits & get_align_mask(NULL);
}
static int __init control_va_addr_alignment(char *str)
@@ -119,13 +115,21 @@ static void find_start_end(unsigned long addr, unsigned long flags,
*end = task_size_64bit(addr > DEFAULT_MAP_WINDOW);
}
+static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
+{
+ if (vm_flags & VM_SHADOW_STACK)
+ return PAGE_SIZE;
+
+ return 0;
+}
+
unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
unsigned long begin, end;
if (flags & MAP_FIXED)
@@ -144,28 +148,30 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
return addr;
}
- info.flags = 0;
info.length = len;
info.low_limit = begin;
info.high_limit = end;
- info.align_mask = 0;
- info.align_offset = pgoff << PAGE_SHIFT;
+ if (!(filp && is_file_hugepages(filp))) {
+ info.align_offset = pgoff << PAGE_SHIFT;
+ info.start_gap = stack_guard_placement(vm_flags);
+ }
if (filp) {
- info.align_mask = get_align_mask();
+ info.align_mask = get_align_mask(filp);
info.align_offset += get_align_bits();
}
+
return vm_unmapped_area(&info);
}
unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags, vm_flags_t vm_flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
- struct vm_unmapped_area_info info;
+ struct vm_unmapped_area_info info = {};
/* requested length too big for entire address space */
if (len > TASK_SIZE)
@@ -199,6 +205,10 @@ get_unmapped_area:
info.low_limit = PAGE_SIZE;
info.high_limit = get_mmap_base(0);
+ if (!(filp && is_file_hugepages(filp))) {
+ info.start_gap = stack_guard_placement(vm_flags);
+ info.align_offset = pgoff << PAGE_SHIFT;
+ }
/*
* If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area
@@ -210,10 +220,8 @@ get_unmapped_area:
if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall())
info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW;
- info.align_mask = 0;
- info.align_offset = pgoff << PAGE_SHIFT;
if (filp) {
- info.align_mask = get_align_mask();
+ info.align_mask = get_align_mask(filp);
info.align_offset += get_align_bits();
}
addr = vm_unmapped_area(&info);
@@ -228,5 +236,5 @@ bottomup:
* can happen with large stack limits and large mmap()
* allocations.
*/
- return arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+ return arch_get_unmapped_area(filp, addr0, len, pgoff, flags, 0);
}
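
The removed align_vdso_addr() is the simplest illustration of what the align mask and bits mean here: round the address up to the mask boundary, then OR in the chosen low bits. A standalone sketch of that arithmetic; the mask and bits values are made up:

#include <stdint.h>
#include <stdio.h>

/* Mirror of the removed align_vdso_addr(): align up, then apply the bits. */
static uint64_t align_addr(uint64_t addr, uint64_t align_mask, uint64_t align_bits)
{
	addr = (addr + align_mask) & ~align_mask;
	return addr | (align_bits & align_mask);
}

int main(void)
{
	uint64_t mask = 0x7fff, bits = 0x3000;	/* example values only */

	printf("%#llx\n",
	       (unsigned long long)align_addr(0x7f0000001234ULL, mask, bits));
	/* -> 0x7f000000b000: rounded up to the boundary, low bits forced to 0x3000 */
	return 0;
}
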
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index e42faa792c07..52e1f3f0b361 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -27,25 +27,7 @@
unsigned long profile_pc(struct pt_regs *regs)
{
- unsigned long pc = instruction_pointer(regs);
-
- if (!user_mode(regs) && in_lock_functions(pc)) {
-#ifdef CONFIG_FRAME_POINTER
- return *(unsigned long *)(regs->bp + sizeof(long));
-#else
- unsigned long *sp = (unsigned long *)regs->sp;
- /*
- * Return address is either directly at stack pointer
- * or above a saved flags. Eflags has bits 22-31 zero,
- * kernel addresses don't.
- */
- if (sp[0] >> 22)
- return sp[0];
- if (sp[1] >> 22)
- return sp[1];
-#endif
- }
- return pc;
+ return instruction_pointer(regs);
}
EXPORT_SYMBOL(profile_pc);
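
The heuristic removed above relied on a saved EFLAGS value fitting in the low 22 bits (the architecturally defined flags stop at bit 21, ID), while a kernel text address never does, so a non-zero result of value >> 22 marked the stack slot as a return address. A standalone illustration; the sample values are made up:

#include <stdint.h>
#include <stdio.h>

/* Non-zero for kernel-text-like values, zero for plausible saved EFLAGS. */
static int looks_like_return_address(uint64_t val)
{
	return (val >> 22) != 0;
}

int main(void)
{
	uint64_t eflags = 0x00000246;			/* IF | ZF | PF | reserved bit 1 */
	uint64_t kaddr  = 0xffffffff81234567ULL;	/* typical kernel text address */

	printf("%d %d\n", looks_like_return_address(eflags),
			  looks_like_return_address(kaddr));	/* 0 1 */
	return 0;
}
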
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
deleted file mode 100644
index 0bab03130033..000000000000
--- a/arch/x86/kernel/topology.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Populate sysfs with topology information
- *
- * Written by: Matthew Dobson, IBM Corporation
- * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
- *
- * Copyright (C) 2002, IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- */
-#include <linux/interrupt.h>
-#include <linux/nodemask.h>
-#include <linux/export.h>
-#include <linux/mmzone.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/irq.h>
-#include <asm/io_apic.h>
-#include <asm/cpu.h>
-
-static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
-
-#ifdef CONFIG_HOTPLUG_CPU
-int arch_register_cpu(int cpu)
-{
- struct x86_cpu *xc = per_cpu_ptr(&cpu_devices, cpu);
-
- xc->cpu.hotpluggable = cpu > 0;
- return register_cpu(&xc->cpu, cpu);
-}
-EXPORT_SYMBOL(arch_register_cpu);
-
-void arch_unregister_cpu(int num)
-{
- unregister_cpu(&per_cpu(cpu_devices, num).cpu);
-}
-EXPORT_SYMBOL(arch_unregister_cpu);
-#else /* CONFIG_HOTPLUG_CPU */
-
-int __init arch_register_cpu(int num)
-{
- return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
-}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-static int __init topology_init(void)
-{
- int i;
-
- for_each_present_cpu(i)
- arch_register_cpu(i);
-
- return 0;
-}
-subsys_initcall(topology_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c876f1d36a81..2dbadf347b5f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,10 +37,12 @@
#include <linux/nmi.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
#include <linux/iommu.h>
+#include <linux/ubsan.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
@@ -50,6 +52,7 @@
#include <asm/ftrace.h>
#include <asm/traps.h>
#include <asm/desc.h>
+#include <asm/fred.h>
#include <asm/fpu/api.h>
#include <asm/cpu.h>
#include <asm/cpu_entry_area.h>
@@ -89,6 +92,47 @@ __always_inline int is_valid_bugaddr(unsigned long addr)
return *(unsigned short *)addr == INSN_UD2;
}
+/*
+ * Check for UD1 or UD2, accounting for Address Size Override Prefixes.
+ * If it's a UD1, get the ModRM byte to pass along to UBSan.
+ */
+__always_inline int decode_bug(unsigned long addr, u32 *imm)
+{
+ u8 v;
+
+ if (addr < TASK_SIZE_MAX)
+ return BUG_NONE;
+
+ v = *(u8 *)(addr++);
+ if (v == INSN_ASOP)
+ v = *(u8 *)(addr++);
+ if (v != OPCODE_ESCAPE)
+ return BUG_NONE;
+
+ v = *(u8 *)(addr++);
+ if (v == SECOND_BYTE_OPCODE_UD2)
+ return BUG_UD2;
+
+ if (!IS_ENABLED(CONFIG_UBSAN_TRAP) || v != SECOND_BYTE_OPCODE_UD1)
+ return BUG_NONE;
+
+ /* Retrieve the immediate (type value) for the UBSAN UD1 */
+ v = *(u8 *)(addr++);
+ if (X86_MODRM_RM(v) == 4)
+ addr++;
+
+ *imm = 0;
+ if (X86_MODRM_MOD(v) == 1)
+ *imm = *(u8 *)addr;
+ else if (X86_MODRM_MOD(v) == 2)
+ *imm = *(u32 *)addr;
+ else
+ WARN_ONCE(1, "Unexpected MODRM_MOD: %u\n", X86_MODRM_MOD(v));
+
+ return BUG_UD1;
+}
+
+
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
struct pt_regs *regs, long error_code)
@@ -214,14 +258,11 @@ static inline void handle_invalid_op(struct pt_regs *regs)
static noinstr bool handle_bug(struct pt_regs *regs)
{
bool handled = false;
+ int ud_type;
+ u32 imm;
- /*
- * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
- * is a rare case that uses @regs without passing them to
- * irqentry_enter().
- */
- kmsan_unpoison_entry_regs(regs);
- if (!is_valid_bugaddr(regs->ip))
+ ud_type = decode_bug(regs->ip, &imm);
+ if (ud_type == BUG_NONE)
return handled;
/*
@@ -229,15 +270,25 @@ static noinstr bool handle_bug(struct pt_regs *regs)
*/
instrumentation_begin();
/*
+ * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
+ * is a rare case that uses @regs without passing them to
+ * irqentry_enter().
+ */
+ kmsan_unpoison_entry_regs(regs);
+ /*
* Since we're emulating a CALL with exceptions, restore the interrupt
* state to what it was at the exception site.
*/
if (regs->flags & X86_EFLAGS_IF)
raw_local_irq_enable();
- if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
- handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
- regs->ip += LEN_UD2;
- handled = true;
+ if (ud_type == BUG_UD2) {
+ if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
+ handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
+ regs->ip += LEN_UD2;
+ handled = true;
+ }
+ } else if (IS_ENABLED(CONFIG_UBSAN_TRAP)) {
+ pr_crit("%s at %pS\n", report_ubsan_failure(regs, imm), (void *)regs->ip);
}
if (regs->flags & X86_EFLAGS_IF)
raw_local_irq_disable();
@@ -565,7 +616,7 @@ static bool fixup_iopl_exception(struct pt_regs *regs)
*/
static bool try_fixup_enqcmd_gp(void)
{
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
u32 pasid;
/*
@@ -591,7 +642,7 @@ static bool try_fixup_enqcmd_gp(void)
if (!mm_valid_pasid(current->mm))
return false;
- pasid = current->mm->pasid;
+ pasid = mm_get_enqcmd_pasid(current->mm);
/*
* Did this thread already have its PASID activated?
@@ -772,7 +823,7 @@ DEFINE_IDTENTRY_RAW(exc_int3)
*/
asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
{
- struct pt_regs *regs = (struct pt_regs *)this_cpu_read(pcpu_hot.top_of_stack) - 1;
+ struct pt_regs *regs = (struct pt_regs *)current_top_of_stack() - 1;
if (regs != eregs)
*regs = *eregs;
return regs;
@@ -790,7 +841,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r
* trust it and switch to the current kernel stack
*/
if (ip_within_syscall_gap(regs)) {
- sp = this_cpu_read(pcpu_hot.top_of_stack);
+ sp = current_top_of_stack();
goto sync;
}
@@ -934,8 +985,7 @@ static bool notify_debug(struct pt_regs *regs, unsigned long *dr6)
return false;
}
-static __always_inline void exc_debug_kernel(struct pt_regs *regs,
- unsigned long dr6)
+static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6)
{
/*
* Disable breakpoints during exception handling; recursive exceptions
@@ -947,6 +997,11 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
*
* Entry text is excluded for HW_BP_X and cpu_entry_area, which
* includes the entry stack is excluded for everything.
+ *
+ * For FRED, nested #DB should just work fine. But when a watchpoint or
+ * breakpoint is set in a code path executed by the #DB handler,
+ * it results in an endless recursion and stack overflow. Thus we stay
+ * with the IDT approach, i.e., save DR7 and disable #DB.
*/
unsigned long dr7 = local_db_save();
irqentry_state_t irq_state = irqentry_nmi_enter(regs);
@@ -976,7 +1031,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
* Catch SYSENTER with TF set and clear DR_STEP. If this hit a
* watchpoint at the same time then that will still be handled.
*/
- if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
+ if (!cpu_feature_enabled(X86_FEATURE_FRED) &&
+ (dr6 & DR_STEP) && is_sysenter_singlestep(regs))
dr6 &= ~DR_STEP;
/*
@@ -1008,8 +1064,7 @@ out:
local_db_restore(dr7);
}
-static __always_inline void exc_debug_user(struct pt_regs *regs,
- unsigned long dr6)
+static noinstr void exc_debug_user(struct pt_regs *regs, unsigned long dr6)
{
bool icebp;
@@ -1093,6 +1148,34 @@ DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{
exc_debug_user(regs, debug_read_clear_dr6());
}
+
+#ifdef CONFIG_X86_FRED
+/*
+ * Depending on whether it occurred from user or kernel context, #DB
+ * needs to be handled on a different stack: a user #DB on the current
+ * task stack, a kernel #DB on a dedicated stack.
+ *
+ * This is exactly how FRED event delivery invokes an exception
+ * handler: ring 3 event on level 0 stack, i.e., current task stack;
+ * ring 0 event on the #DB dedicated stack specified in the
+ * IA32_FRED_STKLVLS MSR. So unlike IDT, the FRED debug exception
+ * entry stub doesn't do stack switch.
+ */
+DEFINE_FREDENTRY_DEBUG(exc_debug)
+{
+ /*
+ * FRED #DB stores DR6 on the stack in the format which
+ * debug_read_clear_dr6() returns for the IDT entry points.
+ */
+ unsigned long dr6 = fred_event_data(regs);
+
+ if (user_mode(regs))
+ exc_debug_user(regs, dr6);
+ else
+ exc_debug_kernel(regs, dr6);
+}
+#endif /* CONFIG_X86_FRED */
+
#else
/* 32 bit does not have separate entry points. */
DEFINE_IDTENTRY_RAW(exc_debug)
@@ -1377,8 +1460,11 @@ void __init trap_init(void)
sev_es_init_vc_handling();
/* Initialize TSS before setting up traps so ISTs work */
- cpu_init_exception_handling();
+ cpu_init_exception_handling(true);
+
/* Setup traps as cpu_init() might #GP */
- idt_setup_traps();
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ idt_setup_traps();
+
cpu_init();
}
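
decode_bug() above tells UD2 (0f 0b) apart from the UBSAN UD1 (0f b9 /r) and, for UD1, recovers the type value from the displacement that follows the ModRM byte. A standalone sketch of the same decode over a plain byte buffer; the constants follow the x86 encoding, the sample instruction is made up:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define INSN_ASOP		0x67	/* address-size override prefix */
#define OPCODE_ESCAPE		0x0f
#define SECOND_BYTE_OPCODE_UD1	0xb9
#define SECOND_BYTE_OPCODE_UD2	0x0b

#define MODRM_MOD(m)	(((m) >> 6) & 3)
#define MODRM_RM(m)	((m) & 7)

enum bug_type { BUG_NONE, BUG_UD1, BUG_UD2 };

static enum bug_type decode_bug(const uint8_t *p, uint32_t *imm)
{
	uint8_t v = *p++;

	if (v == INSN_ASOP)
		v = *p++;
	if (v != OPCODE_ESCAPE)
		return BUG_NONE;

	v = *p++;
	if (v == SECOND_BYTE_OPCODE_UD2)
		return BUG_UD2;
	if (v != SECOND_BYTE_OPCODE_UD1)
		return BUG_NONE;

	v = *p++;				/* ModRM byte */
	if (MODRM_RM(v) == 4)
		p++;				/* skip the SIB byte */

	*imm = 0;
	if (MODRM_MOD(v) == 1)
		*imm = *p;			/* disp8 carries the UBSAN type */
	else if (MODRM_MOD(v) == 2)
		memcpy(imm, p, sizeof(*imm));	/* disp32 */

	return BUG_UD1;
}

int main(void)
{
	/* "ud1 0x2a(%eax),%eax": 67 0f b9 40 2a, UBSAN type value 42 */
	const uint8_t buf[] = { 0x67, 0x0f, 0xb9, 0x40, 0x2a };
	uint32_t imm = 0;

	printf("type=%d imm=%u\n", decode_bug(buf, &imm), imm);	/* type=1 imm=42 */
	return 0;
}
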
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 15f97c0abc9d..34dec0b72ea8 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -16,6 +16,7 @@
#include <linux/static_key.h>
#include <linux/static_call.h>
+#include <asm/cpuid.h>
#include <asm/hpet.h>
#include <asm/timer.h>
#include <asm/vgtod.h>
@@ -26,9 +27,11 @@
#include <asm/x86_init.h>
#include <asm/geode.h>
#include <asm/apic.h>
-#include <asm/intel-family.h>
+#include <asm/cpu_device_id.h>
#include <asm/i8259.h>
+#include <asm/topology.h>
#include <asm/uv/uv.h>
+#include <asm/sev.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -44,16 +47,16 @@ EXPORT_SYMBOL(tsc_khz);
static int __read_mostly tsc_unstable;
static unsigned int __initdata tsc_early_khz;
-static DEFINE_STATIC_KEY_FALSE(__use_tsc);
+static DEFINE_STATIC_KEY_FALSE_RO(__use_tsc);
int tsc_clocksource_reliable;
static int __read_mostly tsc_force_recalibrate;
-static u32 art_to_tsc_numerator;
-static u32 art_to_tsc_denominator;
-static u64 art_to_tsc_offset;
-static struct clocksource *art_related_clocksource;
+static struct clocksource_base art_base_clk = {
+ .id = CSID_X86_ART,
+};
+static bool have_art;
struct cyc2ns {
struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
@@ -173,10 +176,11 @@ static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long ts
c2n = per_cpu_ptr(&cyc2ns, cpu);
- raw_write_seqcount_latch(&c2n->seq);
+ write_seqcount_latch_begin(&c2n->seq);
c2n->data[0] = data;
- raw_write_seqcount_latch(&c2n->seq);
+ write_seqcount_latch(&c2n->seq);
c2n->data[1] = data;
+ write_seqcount_latch_end(&c2n->seq);
}
static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
@@ -652,7 +656,7 @@ success:
}
/**
- * native_calibrate_tsc
+ * native_calibrate_tsc - determine TSC frequency
* Determine TSC frequency via CPUID, else return 0.
*/
unsigned long native_calibrate_tsc(void)
@@ -663,13 +667,13 @@ unsigned long native_calibrate_tsc(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
- if (boot_cpu_data.cpuid_level < 0x15)
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
return 0;
eax_denominator = ebx_numerator = ecx_hz = edx = 0;
/* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
- cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
+ cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
if (ebx_numerator == 0 || eax_denominator == 0)
return 0;
@@ -678,11 +682,11 @@ unsigned long native_calibrate_tsc(void)
/*
* Denverton SoCs don't report crystal clock, and also don't support
- * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal
- * clock.
+ * CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz
+ * crystal clock.
*/
if (crystal_khz == 0 &&
- boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_D)
+ boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
crystal_khz = 25000;
/*
@@ -698,10 +702,10 @@ unsigned long native_calibrate_tsc(void)
* clock, but we can easily calculate it to a high degree of accuracy
* by considering the crystal ratio and the CPU speed.
*/
- if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) {
+ if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
unsigned int eax_base_mhz, ebx, ecx, edx;
- cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx);
+ cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
crystal_khz = eax_base_mhz * 1000 *
eax_denominator / ebx_numerator;
}
@@ -713,7 +717,7 @@ unsigned long native_calibrate_tsc(void)
* For Atom SoCs TSC is the only reliable clocksource.
* Mark TSC reliable so no watchdog on it.
*/
- if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
+ if (boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT)
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
#ifdef CONFIG_X86_LOCAL_APIC
@@ -736,12 +740,12 @@ static unsigned long cpu_khz_from_cpuid(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return 0;
- if (boot_cpu_data.cpuid_level < 0x16)
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ)
return 0;
eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;
- cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
+ cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
return eax_base_mhz * 1000;
}
@@ -1065,18 +1069,16 @@ core_initcall(cpufreq_register_tsc_scaling);
#endif /* CONFIG_CPU_FREQ */
-#define ART_CPUID_LEAF (0x15)
#define ART_MIN_DENOMINATOR (1)
-
/*
* If ART is present detect the numerator:denominator to convert to TSC
*/
static void __init detect_art(void)
{
- unsigned int unused[2];
+ unsigned int unused;
- if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
+ if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
return;
/*
@@ -1089,13 +1091,14 @@ static void __init detect_art(void)
tsc_async_resets)
return;
- cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
- &art_to_tsc_numerator, unused, unused+1);
+ cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator,
+ &art_base_clk.numerator, &art_base_clk.freq_khz, &unused);
- if (art_to_tsc_denominator < ART_MIN_DENOMINATOR)
+ art_base_clk.freq_khz /= KHZ;
+ if (art_base_clk.denominator < ART_MIN_DENOMINATOR)
return;
- rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset);
+ rdmsrl(MSR_IA32_TSC_ADJUST, art_base_clk.offset);
/* Make this sticky over multiple CPU init calls */
setup_force_cpu_cap(X86_FEATURE_ART);
@@ -1168,6 +1171,7 @@ static struct clocksource clocksource_tsc_early = {
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
+ .id = CSID_X86_TSC_EARLY,
.vdso_clock_mode = VDSO_CLOCKMODE_TSC,
.enable = tsc_cs_enable,
.resume = tsc_resume,
@@ -1190,6 +1194,7 @@ static struct clocksource clocksource_tsc = {
CLOCK_SOURCE_VALID_FOR_HRES |
CLOCK_SOURCE_MUST_VERIFY |
CLOCK_SOURCE_VERIFY_PERCPU,
+ .id = CSID_X86_TSC,
.vdso_clock_mode = VDSO_CLOCKMODE_TSC,
.enable = tsc_cs_enable,
.resume = tsc_resume,
@@ -1250,15 +1255,12 @@ static void __init check_system_tsc_reliable(void)
* - TSC which does not stop in C-States
* - the TSC_ADJUST register which allows to detect even minimal
* modifications
- * - not more than two sockets. As the number of sockets cannot be
- * evaluated at the early boot stage where this has to be
- * invoked, check the number of online memory nodes as a
- * fallback solution which is an reasonable estimate.
+ * - not more than four packages
*/
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
- nr_online_nodes <= 4)
+ topology_max_packages() <= 4)
tsc_disable_clocksource_watchdog();
}
@@ -1287,77 +1289,18 @@ int unsynchronized_tsc(void)
*/
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
/* assume multi socket systems are not synchronized: */
- if (num_possible_cpus() > 1)
+ if (topology_max_packages() > 1)
return 1;
}
return 0;
}
-/*
- * Convert ART to TSC given numerator/denominator found in detect_art()
- */
-struct system_counterval_t convert_art_to_tsc(u64 art)
-{
- u64 tmp, res, rem;
-
- rem = do_div(art, art_to_tsc_denominator);
-
- res = art * art_to_tsc_numerator;
- tmp = rem * art_to_tsc_numerator;
-
- do_div(tmp, art_to_tsc_denominator);
- res += tmp + art_to_tsc_offset;
-
- return (struct system_counterval_t) {.cs = art_related_clocksource,
- .cycles = res};
-}
-EXPORT_SYMBOL(convert_art_to_tsc);
-
-/**
- * convert_art_ns_to_tsc() - Convert ART in nanoseconds to TSC.
- * @art_ns: ART (Always Running Timer) in unit of nanoseconds
- *
- * PTM requires all timestamps to be in units of nanoseconds. When user
- * software requests a cross-timestamp, this function converts system timestamp
- * to TSC.
- *
- * This is valid when CPU feature flag X86_FEATURE_TSC_KNOWN_FREQ is set
- * indicating the tsc_khz is derived from CPUID[15H]. Drivers should check
- * that this flag is set before conversion to TSC is attempted.
- *
- * Return:
- * struct system_counterval_t - system counter value with the pointer to the
- * corresponding clocksource
- * @cycles: System counter value
- * @cs: Clocksource corresponding to system counter value. Used
- * by timekeeping code to verify comparability of two cycle
- * values.
- */
-
-struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns)
-{
- u64 tmp, res, rem;
-
- rem = do_div(art_ns, USEC_PER_SEC);
-
- res = art_ns * tsc_khz;
- tmp = rem * tsc_khz;
-
- do_div(tmp, USEC_PER_SEC);
- res += tmp;
-
- return (struct system_counterval_t) { .cs = art_related_clocksource,
- .cycles = res};
-}
-EXPORT_SYMBOL(convert_art_ns_to_tsc);
-
-
static void tsc_refine_calibration_work(struct work_struct *work);
static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
/**
* tsc_refine_calibration_work - Further refine tsc freq calibration
- * @work - ignored.
+ * @work: ignored.
*
* This functions uses delayed work over a period of a
* second to further refine the TSC freq value. Since this is
@@ -1454,8 +1397,10 @@ out:
if (tsc_unstable)
goto unreg;
- if (boot_cpu_has(X86_FEATURE_ART))
- art_related_clocksource = &clocksource_tsc;
+ if (boot_cpu_has(X86_FEATURE_ART)) {
+ have_art = true;
+ clocksource_tsc.base = &art_base_clk;
+ }
clocksource_register_khz(&clocksource_tsc, tsc_khz);
unreg:
clocksource_unregister(&clocksource_tsc_early);
@@ -1480,8 +1425,10 @@ static int __init init_tsc_clocksource(void)
* the refined calibration and directly register it as a clocksource.
*/
if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
- if (boot_cpu_has(X86_FEATURE_ART))
- art_related_clocksource = &clocksource_tsc;
+ if (boot_cpu_has(X86_FEATURE_ART)) {
+ have_art = true;
+ clocksource_tsc.base = &art_base_clk;
+ }
clocksource_register_khz(&clocksource_tsc, tsc_khz);
clocksource_unregister(&clocksource_tsc_early);
@@ -1505,10 +1452,12 @@ static bool __init determine_cpu_tsc_frequencies(bool early)
if (early) {
cpu_khz = x86_platform.calibrate_cpu();
- if (tsc_early_khz)
+ if (tsc_early_khz) {
tsc_khz = tsc_early_khz;
- else
+ } else {
tsc_khz = x86_platform.calibrate_tsc();
+ clocksource_tsc.freq_khz = tsc_khz;
+ }
} else {
/* We should not be here with non-native cpu calibration */
WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);
@@ -1566,6 +1515,9 @@ void __init tsc_early_init(void)
/* Don't change UV TSC multi-chassis synchronization */
if (is_early_uv_system())
return;
+
+ snp_secure_tsc_init();
+
if (!determine_cpu_tsc_frequencies(true))
return;
tsc_enable_sched_clock();
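
The removed convert_art_to_tsc() above splits the scaling into quotient and remainder so that art * numerator cannot overflow 64 bits before the division. A standalone sketch of that arithmetic; the ratio and offset values are made up:

#include <stdint.h>
#include <stdio.h>

/* tsc = art / den * num + (art % den) * num / den + offset */
static uint64_t art_to_tsc(uint64_t art, uint32_t num, uint32_t den, uint64_t offset)
{
	uint64_t rem = art % den;
	uint64_t res = (art / den) * num;

	res += rem * num / den;
	return res + offset;
}

int main(void)
{
	/* e.g. a 2:1 TSC:ART ratio and a small TSC_ADJUST-style offset */
	printf("%llu\n", (unsigned long long)art_to_tsc(1000000, 2, 1, 5));
	/* -> 2000005 */
	return 0;
}
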
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 6555a857a1e6..deeb02825670 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -147,13 +147,13 @@ static const struct freq_desc freq_desc_lgm = {
};
static const struct x86_cpu_id tsc_msr_cpu_ids[] = {
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_MID, &freq_desc_pnw),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SALTWELL_TABLET,&freq_desc_clv),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &freq_desc_byt),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_MID, &freq_desc_tng),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &freq_desc_cht),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_MID, &freq_desc_ann),
- X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT_NP, &freq_desc_lgm),
+ X86_MATCH_VFM(INTEL_ATOM_SALTWELL_MID, &freq_desc_pnw),
+ X86_MATCH_VFM(INTEL_ATOM_SALTWELL_TABLET, &freq_desc_clv),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt),
+ X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &freq_desc_ann),
+ X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm),
{}
};
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 1123ef3ccf90..4334033658ed 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -193,11 +193,9 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu)
cur->warned = false;
/*
- * If a non-zero TSC value for socket 0 may be valid then the default
- * adjusted value cannot assumed to be zero either.
+ * The default adjust value cannot be assumed to be zero on any socket.
*/
- if (tsc_async_resets)
- cur->adjusted = bootval;
+ cur->adjusted = bootval;
/*
* Check whether this CPU is the first in a package to come up. In
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index d00c28aaa5be..d4705a348a80 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -723,7 +723,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
state->sp = task->thread.sp + sizeof(*frame);
state->bp = READ_ONCE_NOCHECK(frame->bp);
state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
- state->signal = (void *)state->ip == ret_from_fork;
+ state->signal = (void *)state->ip == ret_from_fork_asm;
}
if (get_stack_info((unsigned long *)state->sp, state->task,
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c07f6daaa22..5a952c5ea66b 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -12,6 +12,7 @@
#include <linux/ptrace.h>
#include <linux/uprobes.h>
#include <linux/uaccess.h>
+#include <linux/syscalls.h>
#include <linux/kdebug.h>
#include <asm/processor.h>
@@ -308,6 +309,122 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
}
#ifdef CONFIG_X86_64
+
+asm (
+ ".pushsection .rodata\n"
+ ".global uretprobe_trampoline_entry\n"
+ "uretprobe_trampoline_entry:\n"
+ "pushq %rax\n"
+ "pushq %rcx\n"
+ "pushq %r11\n"
+ "movq $" __stringify(__NR_uretprobe) ", %rax\n"
+ "syscall\n"
+ ".global uretprobe_syscall_check\n"
+ "uretprobe_syscall_check:\n"
+ "popq %r11\n"
+ "popq %rcx\n"
+
+ /*
+ * The uretprobe syscall replaces the stored %rax value with the
+ * final return address, so we don't restore %rax here and just
+ * call ret.
+ */
+ "retq\n"
+ ".global uretprobe_trampoline_end\n"
+ "uretprobe_trampoline_end:\n"
+ ".popsection\n"
+);
+
+extern u8 uretprobe_trampoline_entry[];
+extern u8 uretprobe_trampoline_end[];
+extern u8 uretprobe_syscall_check[];
+
+void *arch_uprobe_trampoline(unsigned long *psize)
+{
+ static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+ struct pt_regs *regs = task_pt_regs(current);
+
+ /*
+ * At the moment the uretprobe syscall trampoline is supported
+ * only for native 64-bit processes; compat processes still use
+ * the standard breakpoint.
+ */
+ if (user_64bit_mode(regs)) {
+ *psize = uretprobe_trampoline_end - uretprobe_trampoline_entry;
+ return uretprobe_trampoline_entry;
+ }
+
+ *psize = UPROBE_SWBP_INSN_SIZE;
+ return &insn;
+}
+
+static unsigned long trampoline_check_ip(void)
+{
+ unsigned long tramp = uprobe_get_trampoline_vaddr();
+
+ return tramp + (uretprobe_syscall_check - uretprobe_trampoline_entry);
+}
+
+SYSCALL_DEFINE0(uretprobe)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+ unsigned long err, ip, sp, r11_cx_ax[3];
+
+ if (regs->ip != trampoline_check_ip())
+ goto sigill;
+
+ err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax));
+ if (err)
+ goto sigill;
+
+ /* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */
+ regs->r11 = r11_cx_ax[0];
+ regs->cx = r11_cx_ax[1];
+ regs->ax = r11_cx_ax[2];
+ regs->sp += sizeof(r11_cx_ax);
+ regs->orig_ax = -1;
+
+ ip = regs->ip;
+ sp = regs->sp;
+
+ uprobe_handle_trampoline(regs);
+
+ /*
+ * If one of the uprobe consumers changed sp, we can do nothing and
+ * just return via iret.
+ * The same applies when shadow stack is enabled, in which case we
+ * need to skip the return through the user space stack address.
+ */
+ if (regs->sp != sp || shstk_is_enabled())
+ return regs->ax;
+ regs->sp -= sizeof(r11_cx_ax);
+
+ /* for the case a uprobe consumer has changed r11/cx */
+ r11_cx_ax[0] = regs->r11;
+ r11_cx_ax[1] = regs->cx;
+
+ /*
+ * The ax register is passed through as the return value, so we can
+ * use its slot on the stack for the ip value and jump to it through
+ * the trampoline's ret instruction.
+ */
+ r11_cx_ax[2] = regs->ip;
+ regs->ip = ip;
+
+ err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax));
+ if (err)
+ goto sigill;
+
+ /* ensure sysret, see do_syscall_64() */
+ regs->r11 = regs->flags;
+ regs->cx = regs->ip;
+
+ return regs->ax;
+
+sigill:
+ force_sig(SIGILL);
+ return -1;
+}
+
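For context, the trampoline's three pushes and the r11_cx_ax[] array in sys_uretprobe() mirror each other: after pushq %rax, %rcx, %r11 the saved values sit at the user stack pointer in r11/cx/ax order, which is exactly the layout copy_from_user() reads back. A minimal user-space model of that layout (arbitrary register values, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stack[3];
	uint64_t rax = 0x1111, rcx = 0x2222, r11 = 0x3333;
	uint64_t *sp = &stack[3];	/* the stack grows towards lower addresses */

	*--sp = rax;			/* pushq %rax */
	*--sp = rcx;			/* pushq %rcx */
	*--sp = r11;			/* pushq %r11 */

	/* sp now matches regs->sp at syscall entry: [0]=r11, [1]=cx, [2]=ax */
	printf("r11=%#llx cx=%#llx ax=%#llx\n",
	       (unsigned long long)sp[0],
	       (unsigned long long)sp[1],
	       (unsigned long long)sp[2]);
	return 0;
}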
/*
* If arch_uprobe->insn doesn't use rip-relative addressing, return
* immediately. Otherwise, rewrite the instruction so that it accesses
@@ -1076,8 +1193,13 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
return orig_ret_vaddr;
nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
- if (likely(!nleft))
+ if (likely(!nleft)) {
+ if (shstk_update_last_frame(trampoline_vaddr)) {
+ force_sig(SIGSEGV);
+ return -1;
+ }
return orig_ret_vaddr;
+ }
if (nleft != rasize) {
pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n",
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e9e803a4d44c..e6cc84143f3e 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -246,9 +246,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
/* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */
if (v.flags & VM86_SCREEN_BITMAP) {
- char comm[TASK_COMM_LEN];
-
- pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current));
+ pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n",
+ current->comm);
return -EINVAL;
}
diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/vmcore_info_32.c
index 8a89c109e20a..5995a749288a 100644
--- a/arch/x86/kernel/crash_core_32.c
+++ b/arch/x86/kernel/vmcore_info_32.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
#include <linux/pgtable.h>
#include <asm/setup.h>
diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/vmcore_info_64.c
index 7d255f882afe..0dec7d868754 100644
--- a/arch/x86/kernel/crash_core_64.c
+++ b/arch/x86/kernel/vmcore_info_64.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
#include <linux/pgtable.h>
#include <asm/setup.h>
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 54a5596adaa6..0deb4887d6e9 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -15,11 +15,7 @@
* put it inside the section definition.
*/
-#ifdef CONFIG_X86_32
-#define LOAD_OFFSET __PAGE_OFFSET
-#else
#define LOAD_OFFSET __START_KERNEL_map
-#endif
#define RUNTIME_DISCARD_EXIT
#define EMITS_PT_NOTE
@@ -32,6 +28,7 @@
#include <asm/orc_lookup.h>
#include <asm/cache.h>
#include <asm/boot.h>
+#include <asm/kexec.h>
#undef i386 /* in case the preprocessor is a 32bit one */
@@ -46,6 +43,7 @@ ENTRY(phys_startup_64)
#endif
jiffies = jiffies_64;
+const_pcpu_hot = pcpu_hot;
#if defined(CONFIG_X86_64)
/*
@@ -98,7 +96,19 @@ jiffies = jiffies_64;
#define BSS_DECRYPTED
#endif
-
+#if defined(CONFIG_X86_64) && defined(CONFIG_KEXEC_CORE)
+#define KEXEC_RELOCATE_KERNEL \
+ . = ALIGN(0x100); \
+ __relocate_kernel_start = .; \
+ *(.text..relocate_kernel); \
+ *(.data..relocate_kernel); \
+ __relocate_kernel_end = .;
+
+ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ "relocate_kernel code too large!")
+#else
+#define KEXEC_RELOCATE_KERNEL
+#endif
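For context, __relocate_kernel_start and __relocate_kernel_end defined above are ordinary linker-script boundary symbols; C code conventionally declares such symbols as extern character arrays and takes their difference to get the size of the .text..relocate_kernel/.data..relocate_kernel blob that the ASSERT bounds. A minimal sketch under that assumption (the relocate_kernel_size() helper is illustrative, not from this diff):

extern char __relocate_kernel_start[], __relocate_kernel_end[];

/* Sketch only: size of the relocate_kernel code/data placed by the macro above. */
static inline unsigned long relocate_kernel_size(void)
{
	return __relocate_kernel_end - __relocate_kernel_start;
}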
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(6); /* RW_ */
@@ -113,11 +123,10 @@ PHDRS {
SECTIONS
{
+ . = __START_KERNEL;
#ifdef CONFIG_X86_32
- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET);
#else
- . = __START_KERNEL;
phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET);
#endif
@@ -125,24 +134,11 @@ SECTIONS
.text : AT(ADDR(.text) - LOAD_OFFSET) {
_text = .;
_stext = .;
- /* bootstrapping code */
- HEAD_TEXT
- TEXT_TEXT
- SCHED_TEXT
- LOCK_TEXT
- KPROBES_TEXT
- SOFTIRQENTRY_TEXT
-#ifdef CONFIG_RETPOLINE
- *(.text..__x86.indirect_thunk)
- *(.text..__x86.return_thunk)
-#endif
- STATIC_CALL_TEXT
-
ALIGN_ENTRY_TEXT_BEGIN
*(.text..__x86.rethunk_untrain)
ENTRY_TEXT
-#ifdef CONFIG_CPU_SRSO
+#ifdef CONFIG_MITIGATION_SRSO
/*
* See the comment above srso_alias_untrain_ret()'s
* definition.
@@ -151,10 +147,26 @@ SECTIONS
*(.text..__x86.rethunk_safe)
#endif
ALIGN_ENTRY_TEXT_END
+
+ TEXT_TEXT
+ SCHED_TEXT
+ LOCK_TEXT
+ KPROBES_TEXT
+ SOFTIRQENTRY_TEXT
+#ifdef CONFIG_MITIGATION_RETPOLINE
+ *(.text..__x86.indirect_thunk)
+ *(.text..__x86.return_thunk)
+#endif
+ STATIC_CALL_TEXT
*(.gnu.warning)
} :text = 0xcccccccc
+ /* bootstrapping code */
+ .head.text : AT(ADDR(.head.text) - LOAD_OFFSET) {
+ HEAD_TEXT
+ } :text = 0xcccccccc
+
/* End of text section, which should occupy whole number of pages */
_etext = .;
. = ALIGN(PAGE_SIZE);
@@ -171,6 +183,9 @@ SECTIONS
/* init_task */
INIT_TASK_DATA(THREAD_SIZE)
+ /* equivalent to task_pt_regs(&init_task) */
+ __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE;
+
#ifdef CONFIG_X86_32
/* 32 bit has nosave before _edata */
NOSAVE_DATA
@@ -182,6 +197,7 @@ SECTIONS
DATA_DATA
CONSTRUCTORS
+ KEXEC_RELOCATE_KERNEL
/* rarely changed data like cpu maps */
READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
@@ -194,29 +210,6 @@ SECTIONS
ORC_UNWIND_TABLE
- . = ALIGN(PAGE_SIZE);
- __vvar_page = .;
-
- .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
- /* work around gold bug 13023 */
- __vvar_beginning_hack = .;
-
- /* Place all vvars at the offsets in asm/vvar.h. */
-#define EMIT_VVAR(name, offset) \
- . = __vvar_beginning_hack + offset; \
- *(.vvar_ ## name)
-#include <asm/vvar.h>
-#undef EMIT_VVAR
-
- /*
- * Pad the rest of the page with zeros. Otherwise the loader
- * can leave garbage here.
- */
- . = __vvar_beginning_hack + PAGE_SIZE;
- } :data
-
- . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
-
/* Init code and data - will be freed after init */
. = ALIGN(PAGE_SIZE);
.init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
@@ -267,20 +260,7 @@ SECTIONS
}
#endif
- /*
- * start address and size of operations which during runtime
- * can be patched with virtualization friendly instructions or
- * baremetal native ones. Think page table operations.
- * Details in paravirt_types.h
- */
- . = ALIGN(8);
- .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
- __parainstructions = .;
- *(.parainstructions)
- __parainstructions_end = .;
- }
-
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
/*
* List of instructions that call/jmp/jcc to retpoline thunks
* __x86_indirect_thunk_*(). These instructions can be patched along
@@ -371,6 +351,9 @@ SECTIONS
PERCPU_SECTION(INTERNODE_CACHE_BYTES)
#endif
+ RUNTIME_CONST_VARIABLES
+ RUNTIME_CONST(ptr, USER_PTR_MAX)
+
. = ALIGN(PAGE_SIZE);
/* freed after init ends here */
@@ -454,6 +437,10 @@ SECTIONS
STABS_DEBUG
DWARF_DEBUG
+#ifdef CONFIG_PROPELLER_CLANG
+ .llvm_bb_addr_map : { *(.llvm_bb_addr_map) }
+#endif
+
ELF_DETAILS
DISCARDS
@@ -502,6 +489,9 @@ SECTIONS
. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
"kernel image bigger than KERNEL_IMAGE_SIZE");
+/* needed for Clang - see arch/x86/entry/entry.S */
+PROVIDE(__ref_stack_chk_guard = __stack_chk_guard);
+
#ifdef CONFIG_X86_64
/*
* Per-cpu symbols which need to be offset from __per_cpu_load
@@ -517,11 +507,11 @@ INIT_PER_CPU(irq_stack_backing_store);
"fixed_percpu_data is not at start of per-cpu area");
#endif
-#ifdef CONFIG_CPU_UNRET_ENTRY
+#ifdef CONFIG_MITIGATION_UNRET_ENTRY
. = ASSERT((retbleed_return_thunk & 0x3f) == 0, "retbleed_return_thunk not cacheline-aligned");
#endif
-#ifdef CONFIG_CPU_SRSO
+#ifdef CONFIG_MITIGATION_SRSO
. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned");
/*
* GNU ld cannot do XOR until 2.41.
@@ -539,3 +529,18 @@ INIT_PER_CPU(irq_stack_backing_store);
#endif
#endif /* CONFIG_X86_64 */
+
+/*
+ * The symbols below are referenced using relative relocations in the
+ * respective ELF notes. This produces build time constants that the
+ * linker will never mark as relocatable. (Using just ABSOLUTE() is not
+ * sufficient for that).
+ */
+#ifdef CONFIG_XEN_PV
+xen_elfnote_entry_value =
+ ABSOLUTE(xen_elfnote_entry) + ABSOLUTE(startup_xen);
+#endif
+#ifdef CONFIG_PVH
+xen_elfnote_phys32_entry_value =
+ ABSOLUTE(xen_elfnote_phys32_entry) + ABSOLUTE(pvh_start_xen - LOAD_OFFSET);
+#endif
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index d3fc01770558..73511332bb67 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -127,25 +127,12 @@ static void __init vsmp_cap_cpus(void)
#endif
}
-static u32 apicid_phys_pkg_id(u32 initial_apic_id, int index_msb)
-{
- return read_apic_id() >> index_msb;
-}
-
-static void vsmp_apic_post_init(void)
-{
- /* need to update phys_pkg_id */
- apic->phys_pkg_id = apicid_phys_pkg_id;
-}
-
void __init vsmp_init(void)
{
detect_vsmp_box();
if (!is_vsmp_box())
return;
- x86_platform.apic_post_init = vsmp_apic_post_init;
-
vsmp_cap_cpus();
set_vsmp_ctl();
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a37ebd3b4773..0a2bbd674a6d 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -3,10 +3,12 @@
*
* For licencing details see kernel-base/COPYING
*/
+#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/export.h>
#include <linux/pci.h>
+#include <linux/acpi.h>
#include <asm/acpi.h>
#include <asm/bios_ebda.h>
@@ -66,12 +68,14 @@ struct x86_init_ops x86_init __initdata = {
.probe_roms = probe_roms,
.reserve_resources = reserve_standard_io_resources,
.memory_setup = e820__memory_setup_default,
+ .dmi_setup = dmi_setup,
},
.mpparse = {
.setup_ioapic_ids = x86_init_noop,
- .find_smp_config = default_find_smp_config,
- .get_smp_config = default_get_smp_config,
+ .find_mptable = mpparse_find_mptable,
+ .early_parse_smp_cfg = mpparse_parse_early_smp_config,
+ .parse_smp_cfg = mpparse_parse_smp_config,
},
.irqs = {
@@ -131,10 +135,12 @@ struct x86_cpuinit_ops x86_cpuinit = {
static void default_nmi_init(void) { };
-static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; }
-static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; }
+static int enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
+static int enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return 0; }
static bool enc_tlb_flush_required_noop(bool enc) { return false; }
static bool enc_cache_flush_required_noop(void) { return false; }
+static void enc_kexec_begin_noop(void) {}
+static void enc_kexec_finish_noop(void) {}
static bool is_private_mmio_noop(u64 addr) {return false; }
struct x86_platform_ops x86_platform __ro_after_init = {
@@ -158,6 +164,8 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.enc_status_change_finish = enc_status_change_finish_noop,
.enc_tlb_flush_required = enc_tlb_flush_required_noop,
.enc_cache_flush_required = enc_cache_flush_required_noop,
+ .enc_kexec_begin = enc_kexec_begin_noop,
+ .enc_kexec_finish = enc_kexec_finish_noop,
},
};