Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/events/amd/uncore.c | 5
-rw-r--r--  arch/x86/events/intel/core.c | 3
-rw-r--r--  arch/x86/hyperv/.gitignore | 1
-rw-r--r--  arch/x86/hyperv/Makefile | 16
-rw-r--r--  arch/x86/hyperv/hv_apic.c | 8
-rw-r--r--  arch/x86/hyperv/hv_crash.c | 642
-rw-r--r--  arch/x86/hyperv/hv_init.c | 9
-rw-r--r--  arch/x86/hyperv/hv_trampoline.S | 101
-rw-r--r--  arch/x86/hyperv/hv_vtl.c | 30
-rw-r--r--  arch/x86/hyperv/mshv-asm-offsets.c | 37
-rw-r--r--  arch/x86/hyperv/mshv_vtl_asm.S | 116
-rw-r--r--  arch/x86/include/asm/alternative.h | 9
-rw-r--r--  arch/x86/include/asm/asm.h | 25
-rw-r--r--  arch/x86/include/asm/bug.h | 6
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 2
-rw-r--r--  arch/x86/include/asm/cpumask.h | 2
-rw-r--r--  arch/x86/include/asm/div64.h | 39
-rw-r--r--  arch/x86/include/asm/irq_remapping.h | 7
-rw-r--r--  arch/x86/include/asm/irq_stack.h | 2
-rw-r--r--  arch/x86/include/asm/irqflags.h | 2
-rw-r--r--  arch/x86/include/asm/jump_label.h | 2
-rw-r--r--  arch/x86/include/asm/mshyperv.h | 45
-rw-r--r--  arch/x86/include/asm/nospec-branch.h | 4
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 2
-rw-r--r--  arch/x86/include/asm/smap.h | 8
-rw-r--r--  arch/x86/include/asm/static_call.h | 2
-rw-r--r--  arch/x86/include/asm/string_64.h | 6
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 2
-rw-r--r--  arch/x86/include/asm/uv/bios.h | 2
-rw-r--r--  arch/x86/include/asm/x86_init.h | 28
-rw-r--r--  arch/x86/kernel/alternative.c | 4
-rw-r--r--  arch/x86/kernel/amd_gart_64.c | 19
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mce/core.c | 4
-rw-r--r--  arch/x86/kernel/cpu/microcode/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 88
-rw-r--r--  arch/x86/kernel/cpu/sgx/ioctl.c | 2
-rw-r--r--  arch/x86/kernel/fpu/xstate.c | 4
-rw-r--r--  arch/x86/kernel/irq.c | 23
-rw-r--r--  arch/x86/kernel/rethook.c | 2
-rw-r--r--  arch/x86/kernel/static_call.c | 4
-rw-r--r--  arch/x86/kernel/unwind_orc.c | 39
-rw-r--r--  arch/x86/kvm/cpuid.c | 11
-rw-r--r--  arch/x86/kvm/svm/nested.c | 4
-rw-r--r--  arch/x86/kvm/svm/svm.c | 2
-rw-r--r--  arch/x86/kvm/svm/svm.h | 7
-rw-r--r--  arch/x86/kvm/vmx/nested.c | 3
-rw-r--r--  arch/x86/kvm/vmx/vmx.c | 9
-rw-r--r--  arch/x86/kvm/x86.c | 7
-rw-r--r--  arch/x86/lib/error-inject.c | 2
-rw-r--r--  arch/x86/net/bpf_jit_comp.c | 12
-rw-r--r--  arch/x86/xen/enlighten_pv.c | 2
52 files changed, 1286 insertions, 130 deletions
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index e8b6af199c73..9293ce50574d 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -656,14 +656,11 @@ static int amd_uncore_df_event_init(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
int ret = amd_uncore_event_init(event);
- if (ret || pmu_version < 2)
- return ret;
-
hwc->config = event->attr.config &
(pmu_version >= 2 ? AMD64_PERFMON_V2_RAW_EVENT_MASK_NB :
AMD64_RAW_EVENT_MASK_NB);
- return 0;
+ return ret;
}
static int amd_uncore_df_add(struct perf_event *event, int flags)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 853fe073bab3..bdf3f0d0fe21 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3378,6 +3378,9 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
if (!test_bit(bit, cpuc->active_mask))
continue;
+ /* Event may have already been cleared: */
+ if (!event)
+ continue;
/*
* There may be unprocessed PEBS records in the PEBS buffer,
diff --git a/arch/x86/hyperv/.gitignore b/arch/x86/hyperv/.gitignore
new file mode 100644
index 000000000000..333615d993b5
--- /dev/null
+++ b/arch/x86/hyperv/.gitignore
@@ -0,0 +1 @@
+mshv-asm-offsets.h
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index d55f494f471d..56292102af62 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,8 +1,22 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o
obj-$(CONFIG_X86_64) += hv_apic.o
-obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o
+obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o mshv_vtl_asm.o
+
+$(obj)/mshv_vtl_asm.o: $(obj)/mshv-asm-offsets.h
+
+$(obj)/mshv-asm-offsets.h: $(obj)/mshv-asm-offsets.s FORCE
+ $(call filechk,offsets,__MSHV_ASM_OFFSETS_H__)
ifdef CONFIG_X86_64
obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o
+
+ ifdef CONFIG_MSHV_ROOT
+ CFLAGS_REMOVE_hv_trampoline.o += -pg
+ CFLAGS_hv_trampoline.o += -fno-stack-protector
+ obj-$(CONFIG_CRASH_DUMP) += hv_crash.o hv_trampoline.o
+ endif
endif
+
+targets += mshv-asm-offsets.s
+clean-files += mshv-asm-offsets.h
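
For reference, the generated mshv-asm-offsets.h is just a set of #define lines derived from the OFFSET() markers in mshv-asm-offsets.c. A sketch of what the filechk,offsets output should look like, assuming the standard Kbuild format (the numeric values are illustrative of the struct layout in <asm/mshyperv.h>, not authoritative):

#ifndef __MSHV_ASM_OFFSETS_H__
#define __MSHV_ASM_OFFSETS_H__
/*
 * Sketch of the generated header; values are illustrative.
 */
#define MSHV_VTL_CPU_CONTEXT_rax 0 /* offsetof(struct mshv_vtl_cpu_context, rax) */
#define MSHV_VTL_CPU_CONTEXT_rcx 8 /* offsetof(struct mshv_vtl_cpu_context, rcx) */
#define MSHV_VTL_CPU_CONTEXT_cr2 32 /* offsetof(struct mshv_vtl_cpu_context, cr2) */
#endif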
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index bfde0a3498b9..a8de503def37 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -53,6 +53,11 @@ static void hv_apic_icr_write(u32 low, u32 id)
wrmsrq(HV_X64_MSR_ICR, reg_val);
}
+void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set)
+{
+ apic_update_vector(cpu, vector, set);
+}
+
static u32 hv_apic_read(u32 reg)
{
u32 reg_val, hi;
@@ -293,6 +298,9 @@ static void hv_send_ipi_self(int vector)
void __init hv_apic_init(void)
{
+ if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ return;
+
if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) {
pr_info("Hyper-V: Using IPI hypercalls\n");
/*
diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c
new file mode 100644
index 000000000000..c0e22921ace1
--- /dev/null
+++ b/arch/x86/hyperv/hv_crash.c
@@ -0,0 +1,642 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * X86 specific Hyper-V root partition kdump/crash support module
+ *
+ * Copyright (C) 2025, Microsoft, Inc.
+ *
+ * This module implements hypervisor RAM collection into vmcore for both
+ * the hypervisor-crash and the Linux-root-crash cases. Hyper-V implements
+ * a disable hypercall with a 32bit protected mode ABI callback; this
+ * mechanism must be used to unlock hypervisor RAM. Since the hypervisor RAM
+ * is already mapped in Linux, it is automatically collected into the Linux
+ * vmcore and can be examined with the crash command (raw RAM dump) or windbg.
+ *
+ * At a high level:
+ *
+ * Hypervisor Crash:
+ * Upon crash, the hypervisor enters an emergency minimal dispatch loop, a
+ * restrictive mode with very limited hypercall and MSR support, and
+ * injects NMIs into the root vcpus. In the NMI handler, Linux checks a
+ * shared page to see whether the hypervisor has crashed. This shared
+ * page is set up in hv_root_crash_init during boot.
+ *
+ * Linux Crash:
+ * In case of a Linux crash, the callback hv_crash_stop_other_cpus sends
+ * NMIs to all cpus, then proceeds to crash_nmi_callback, where it waits
+ * for all cpus to enter the NMI handler.
+ *
+ * NMI Handler (upon quorum):
+ * Eventually, in both cases, all cpus end up in the NMI handler. Hyper-V
+ * requires that the disable hypercall be made from the BSP, so the BSP
+ * NMI handler saves the current context, does some fixups and makes the
+ * hypercall to disable the hypervisor, ie, devirtualize. At that point the
+ * hypervisor suspends all vcpus (except the BSP), unlocks all its RAM,
+ * and returns to Linux at the 32bit mode entry RIP.
+ *
+ * The Linux 32bit entry trampoline then restores long mode and calls the C
+ * function here to restore context and continue execution to crash kexec.
+ */
+
+#include <linux/delay.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <linux/panic.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/mshyperv.h>
+#include <asm/nmi.h>
+#include <asm/idtentry.h>
+#include <asm/reboot.h>
+#include <asm/intel_pt.h>
+
+bool hv_crash_enabled;
+EXPORT_SYMBOL_GPL(hv_crash_enabled);
+
+struct hv_crash_ctxt {
+ ulong rsp;
+ ulong cr0;
+ ulong cr2;
+ ulong cr4;
+ ulong cr8;
+
+ u16 cs;
+ u16 ss;
+ u16 ds;
+ u16 es;
+ u16 fs;
+ u16 gs;
+
+ u16 gdt_fill;
+ struct desc_ptr gdtr;
+ char idt_fill[6];
+ struct desc_ptr idtr;
+
+ u64 gsbase;
+ u64 efer;
+ u64 pat;
+};
+static struct hv_crash_ctxt hv_crash_ctxt;
+
+/* Shared hypervisor page that contains crash dump area we peek into.
+ * NB: windbg looks for "hv_cda" symbol so don't change it.
+ */
+static struct hv_crashdump_area *hv_cda;
+
+static u32 trampoline_pa, devirt_arg;
+static atomic_t crash_cpus_wait;
+static void *hv_crash_ptpgs[4];
+static bool hv_has_crashed, lx_has_crashed;
+
+static void __noreturn hv_panic_timeout_reboot(void)
+{
+ #define PANIC_TIMER_STEP 100
+
+ if (panic_timeout > 0) {
+ int i;
+
+ for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP)
+ mdelay(PANIC_TIMER_STEP);
+ }
+
+ if (panic_timeout)
+ native_wrmsrq(HV_X64_MSR_RESET, 1); /* get hyp to reboot */
+
+ for (;;)
+ cpu_relax();
+}
+
+/* This cannot be inlined as it needs stack */
+static noinline __noclone void hv_crash_restore_tss(void)
+{
+ load_TR_desc();
+}
+
+/* This cannot be inlined as it needs stack */
+static noinline void hv_crash_clear_kernpt(void)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ /* Clear entry so it's not confusing to someone looking at the core */
+ pgd = pgd_offset_k(trampoline_pa);
+ p4d = p4d_offset(pgd, trampoline_pa);
+ native_p4d_clear(p4d);
+}
+
+/*
+ * This is the C entry point from the asm glue code after the disable hypercall.
+ * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
+ * page tables with our below 4G page identity mapped, but using a temporary
+ * GDT. ds/fs/gs/es are null. ss is not usable. bp is null. stack is not
+ * available. We restore kernel GDT, and rest of the context, and continue
+ * to kexec.
+ */
+static asmlinkage void __noreturn hv_crash_c_entry(void)
+{
+ struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
+
+ /* first thing, restore kernel gdt */
+ native_load_gdt(&ctxt->gdtr);
+
+ asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
+ asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
+
+ asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
+ asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
+ asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
+ asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
+
+ native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
+ asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
+
+ asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
+ asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
+ asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr2));
+
+ native_load_idt(&ctxt->idtr);
+ native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
+ native_wrmsrq(MSR_EFER, ctxt->efer);
+
+ /* restore the original kernel CS now via far return */
+ asm volatile("movzwq %0, %%rax\n\t"
+ "pushq %%rax\n\t"
+ "pushq $1f\n\t"
+ "lretq\n\t"
+ "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
+
+ /* We are in asmlinkage without a stack frame, hence make C function
+ * calls, which will set up stack frames.
+ */
+ hv_crash_restore_tss();
+ hv_crash_clear_kernpt();
+
+ /* we are now fully in devirtualized normal kernel mode */
+ __crash_kexec(NULL);
+
+ hv_panic_timeout_reboot();
+}
+/* Tell objtool the lretq long jump in the above function is intentional */
+STACK_FRAME_NON_STANDARD(hv_crash_c_entry);
+
+static void hv_mark_tss_not_busy(void)
+{
+ struct desc_struct *desc = get_current_gdt_rw();
+ tss_desc tss;
+
+ memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
+ tss.type = 0x9; /* available 64-bit TSS. 0xB is busy TSS */
+ write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);
+}
+
+/* Save essential context */
+static void hv_hvcrash_ctxt_save(void)
+{
+ struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
+
+ asm volatile("movq %%rsp,%0" : "=m"(ctxt->rsp));
+
+ ctxt->cr0 = native_read_cr0();
+ ctxt->cr4 = native_read_cr4();
+
+ asm volatile("movq %%cr2, %0" : "=a"(ctxt->cr2));
+ asm volatile("movq %%cr8, %0" : "=a"(ctxt->cr8));
+
+ asm volatile("movl %%cs, %%eax" : "=a"(ctxt->cs));
+ asm volatile("movl %%ss, %%eax" : "=a"(ctxt->ss));
+ asm volatile("movl %%ds, %%eax" : "=a"(ctxt->ds));
+ asm volatile("movl %%es, %%eax" : "=a"(ctxt->es));
+ asm volatile("movl %%fs, %%eax" : "=a"(ctxt->fs));
+ asm volatile("movl %%gs, %%eax" : "=a"(ctxt->gs));
+
+ native_store_gdt(&ctxt->gdtr);
+ store_idt(&ctxt->idtr);
+
+ ctxt->gsbase = __rdmsr(MSR_GS_BASE);
+ ctxt->efer = __rdmsr(MSR_EFER);
+ ctxt->pat = __rdmsr(MSR_IA32_CR_PAT);
+}
+
+/* Add trampoline page to the kernel pagetable for transition to kernel PT */
+static void hv_crash_fixup_kernpt(void)
+{
+ pgd_t *pgd;
+ p4d_t *p4d;
+
+ pgd = pgd_offset_k(trampoline_pa);
+ p4d = p4d_offset(pgd, trampoline_pa);
+
+ /* trampoline_pa is below 4G, so no pre-existing entry to clobber */
+ p4d_populate(&init_mm, p4d, (pud_t *)hv_crash_ptpgs[1]);
+ p4d->p4d = p4d->p4d & ~(_PAGE_NX); /* enable execute */
+}
+
+/*
+ * Notify the hyp that Linux has crashed. This will cause the hyp to quiesce
+ * and suspend all guest VPs.
+ */
+static void hv_notify_prepare_hyp(void)
+{
+ u64 status;
+ struct hv_input_notify_partition_event *input;
+ struct hv_partition_event_root_crashdump_input *cda;
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ cda = &input->input.crashdump_input;
+ memset(input, 0, sizeof(*input));
+ input->event = HV_PARTITION_EVENT_ROOT_CRASHDUMP;
+
+ cda->crashdump_action = HV_CRASHDUMP_ENTRY;
+ status = hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
+ if (!hv_result_success(status))
+ return;
+
+ cda->crashdump_action = HV_CRASHDUMP_SUSPEND_ALL_VPS;
+ hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
+}
+
+/*
+ * Common function for all cpus before devirtualization.
+ *
+ * Hypervisor crash: all cpus get here in NMI context.
+ * Linux crash: the panicking cpu gets here at base level, all others in NMI
+ * context. Note, the panicking cpu may not be the BSP.
+ *
+ * The function is not inlined so it will show up on the stack. It is named
+ * crash_nmi_callback because the crash command looks for certain well-known
+ * function names on the stack before looking into the cpu notes saved in the
+ * elf section, and that work is currently incomplete.
+ *
+ * Notes:
+ * Hypervisor crash:
+ * - the hypervisor is in a very restrictive mode at this point and any
+ * vmexit it cannot handle would result in reboot. So, no mumbo jumbo,
+ * just get to kexec as quickly as possible.
+ *
+ * Devirtualization is supported from the BSP only at present.
+ */
+static noinline __noclone void crash_nmi_callback(struct pt_regs *regs)
+{
+ struct hv_input_disable_hyp_ex *input;
+ u64 status;
+ int msecs = 1000, ccpu = smp_processor_id();
+
+ if (ccpu == 0) {
+ /* crash_save_cpu() will be done in the kexec path */
+ cpu_emergency_stop_pt(); /* disable performance trace */
+ atomic_inc(&crash_cpus_wait);
+ } else {
+ crash_save_cpu(regs, ccpu);
+ cpu_emergency_stop_pt(); /* disable performance trace */
+ atomic_inc(&crash_cpus_wait);
+ for (;;)
+ cpu_relax();
+ }
+
+ while (atomic_read(&crash_cpus_wait) < num_online_cpus() && msecs--)
+ mdelay(1);
+
+ stop_nmi();
+ if (!hv_has_crashed)
+ hv_notify_prepare_hyp();
+
+ if (crashing_cpu == -1)
+ crashing_cpu = ccpu; /* crash cmd uses this */
+
+ hv_hvcrash_ctxt_save();
+ hv_mark_tss_not_busy();
+ hv_crash_fixup_kernpt();
+
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ memset(input, 0, sizeof(*input));
+ input->rip = trampoline_pa;
+ input->arg = devirt_arg;
+
+ status = hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL);
+
+ hv_panic_timeout_reboot();
+}
+
+
+static DEFINE_SPINLOCK(hv_crash_reboot_lk);
+
+/*
+ * Generic NMI callback handler: could also be called without any crash.
+ * hv crash: the hypervisor injects NMIs into all cpus
+ * lx crash: the panicking cpu sends an NMI to all but self via crash_stop_other_cpus
+ */
+static int hv_crash_nmi_local(unsigned int cmd, struct pt_regs *regs)
+{
+ if (!hv_has_crashed && hv_cda && hv_cda->cda_valid)
+ hv_has_crashed = true;
+
+ if (!hv_has_crashed && !lx_has_crashed)
+ return NMI_DONE; /* ignore the NMI */
+
+ if (hv_has_crashed && !kexec_crash_loaded()) {
+ if (spin_trylock(&hv_crash_reboot_lk))
+ hv_panic_timeout_reboot();
+ else
+ for (;;)
+ cpu_relax();
+ }
+
+ crash_nmi_callback(regs);
+
+ return NMI_DONE;
+}
+
+/*
+ * hv_crash_stop_other_cpus() == smp_ops.crash_stop_other_cpus
+ *
+ * On normal Linux panic, this is called twice: first from panic and then again
+ * from native_machine_crash_shutdown.
+ *
+ * In case of hyperv, 3 ways to get here:
+ * 1. hv crash (only BSP will get here):
+ * BSP : NMI callback -> DisableHv -> hv_crash_asm32 -> hv_crash_c_entry
+ * -> __crash_kexec -> native_machine_crash_shutdown
+ * -> crash_smp_send_stop -> smp_ops.crash_stop_other_cpus
+ * Linux panic:
+ * 2. panic cpu x: panic() -> crash_smp_send_stop
+ * -> smp_ops.crash_stop_other_cpus
+ * 3. BSP: native_machine_crash_shutdown -> crash_smp_send_stop
+ *
+ * NB: noclone and non standard stack because of call to crash_setup_regs().
+ */
+static void __noclone hv_crash_stop_other_cpus(void)
+{
+ static bool crash_stop_done;
+ struct pt_regs lregs;
+ int ccpu = smp_processor_id();
+
+ if (hv_has_crashed)
+ return; /* all cpus already in NMI handler path */
+
+ if (!kexec_crash_loaded()) {
+ hv_notify_prepare_hyp();
+ hv_panic_timeout_reboot(); /* no return */
+ }
+
+ /* If the hv crashes also, we could come here again before cpus_stopped
+ * is set in crash_smp_send_stop(). So use our own check.
+ */
+ if (crash_stop_done)
+ return;
+ crash_stop_done = true;
+
+ /* Linux has crashed: hv is healthy, we can IPI safely */
+ lx_has_crashed = true;
+ wmb(); /* NMI handlers look at lx_has_crashed */
+
+ apic->send_IPI_allbutself(NMI_VECTOR);
+
+ if (crashing_cpu == -1)
+ crashing_cpu = ccpu; /* crash cmd uses this */
+
+ /* crash_setup_regs() happens in kexec also, but for the kexec cpu, which
+ * is the BSP. We could be here on a non-BSP cpu; collect regs if so.
+ */
+ if (ccpu)
+ crash_setup_regs(&lregs, NULL);
+
+ crash_nmi_callback(&lregs);
+}
+STACK_FRAME_NON_STANDARD(hv_crash_stop_other_cpus);
+
+/* This GDT is accessed in IA32-e compat mode, which uses 32bit addresses */
+struct hv_gdtreg_32 {
+ u16 fill;
+ u16 limit;
+ u32 address;
+} __packed;
+
+/* We need a CS with the L bit set to enter IA32-e long mode from 32bit compat mode */
+struct hv_crash_tramp_gdt {
+ u64 null; /* index 0, selector 0, null selector */
+ u64 cs64; /* index 1, selector 8, cs64 selector */
+} __packed;
+
+/* No stack, so jump via far ptr in memory to load the 64bit CS */
+struct hv_cs_jmptgt {
+ u32 address;
+ u16 csval;
+ u16 fill;
+} __packed;
+
+/* Linux use only, hypervisor doesn't look at this struct */
+struct hv_crash_tramp_data {
+ u64 tramp32_cr3;
+ u64 kernel_cr3;
+ struct hv_gdtreg_32 gdtr32;
+ struct hv_crash_tramp_gdt tramp_gdt;
+ struct hv_cs_jmptgt cs_jmptgt;
+ u64 c_entry_addr;
+} __packed;
+
+/*
+ * Set up a temporary gdt to allow the asm code to switch to long mode.
+ * Since the asm code is relocated/copied to a below 4G page, it cannot use rip
+ * relative addressing, hence we must use trampoline_pa here. Also, save other
+ * info like the jmp and C entry targets for the same reasons.
+ *
+ * Returns: 0 on success, -1 on error
+ */
+static int hv_crash_setup_trampdata(u64 trampoline_va)
+{
+ int size, offs;
+ void *dest;
+ struct hv_crash_tramp_data *tramp;
+
+ /* These must match exactly the ones in the corresponding asm file */
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, tramp32_cr3) != 0);
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, kernel_cr3) != 8);
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, gdtr32.limit) != 18);
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data,
+ cs_jmptgt.address) != 40);
+ BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, c_entry_addr) != 48);
+
+ /* hv_crash_asm_end is beyond last byte by 1 */
+ size = &hv_crash_asm_end - &hv_crash_asm32;
+ if (size + sizeof(struct hv_crash_tramp_data) > PAGE_SIZE) {
+ pr_err("%s: trampoline page overflow\n", __func__);
+ return -1;
+ }
+
+ dest = (void *)trampoline_va;
+ memcpy(dest, &hv_crash_asm32, size);
+
+ dest += size;
+ dest = (void *)round_up((ulong)dest, 16);
+ tramp = (struct hv_crash_tramp_data *)dest;
+
+ /* see MAX_ASID_AVAILABLE in tlb.c: "PCID 0 is reserved for use by
+ * non-PCID-aware users". Build cr3 with pcid 0
+ */
+ tramp->tramp32_cr3 = __sme_pa(hv_crash_ptpgs[0]);
+
+ /* Note, when restoring X86_CR4_PCIDE, cr3[11:0] must be zero */
+ tramp->kernel_cr3 = __sme_pa(init_mm.pgd);
+
+ tramp->gdtr32.limit = sizeof(struct hv_crash_tramp_gdt);
+ tramp->gdtr32.address = trampoline_pa +
+ (ulong)&tramp->tramp_gdt - trampoline_va;
+
+ /* base:0 limit:0xfffff type:a dpl:0 P:1 L:1 D:0 avl:0 G:1 */
+ tramp->tramp_gdt.cs64 = 0x00af9a000000ffff;
+
+ tramp->cs_jmptgt.csval = 0x8;
+ offs = (ulong)&hv_crash_asm64 - (ulong)&hv_crash_asm32;
+ tramp->cs_jmptgt.address = trampoline_pa + offs;
+
+ tramp->c_entry_addr = (u64)&hv_crash_c_entry;
+
+ devirt_arg = trampoline_pa + (ulong)dest - trampoline_va;
+
+ return 0;
+}
+
+/*
+ * Build 32bit trampoline page table for transition from protected mode
+ * non-paging to long-mode paging. This transition needs pagetables below 4G.
+ */
+static void hv_crash_build_tramp_pt(void)
+{
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ u64 pa, addr = trampoline_pa;
+
+ p4d = hv_crash_ptpgs[0] + pgd_index(addr) * sizeof(p4d);
+ pa = virt_to_phys(hv_crash_ptpgs[1]);
+ set_p4d(p4d, __p4d(_PAGE_TABLE | pa));
+ p4d->p4d &= ~(_PAGE_NX); /* enable execute */
+
+ pud = hv_crash_ptpgs[1] + pud_index(addr) * sizeof(pud);
+ pa = virt_to_phys(hv_crash_ptpgs[2]);
+ set_pud(pud, __pud(_PAGE_TABLE | pa));
+
+ pmd = hv_crash_ptpgs[2] + pmd_index(addr) * sizeof(pmd);
+ pa = virt_to_phys(hv_crash_ptpgs[3]);
+ set_pmd(pmd, __pmd(_PAGE_TABLE | pa));
+
+ pte = hv_crash_ptpgs[3] + pte_index(addr) * sizeof(pte);
+ set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+}
+
+/*
+ * Setup trampoline for devirtualization:
+ * - a page below 4G, ie at a 32bit addr, containing the asm glue code that
+ * the hyp jmps to in protected mode.
+ * - 4 pages for a temporary page table that the asm code uses to turn paging on
+ * - a temporary gdt to use in compat mode.
+ *
+ * Returns: 0 on success
+ */
+static int hv_crash_trampoline_setup(void)
+{
+ int i, rc, order;
+ struct page *page;
+ u64 trampoline_va;
+ gfp_t flags32 = GFP_KERNEL | GFP_DMA32 | __GFP_ZERO;
+
+ /* page for 32bit trampoline assembly code + hv_crash_tramp_data */
+ page = alloc_page(flags32);
+ if (page == NULL) {
+ pr_err("%s: failed to alloc asm stub page\n", __func__);
+ return -1;
+ }
+
+ trampoline_va = (u64)page_to_virt(page);
+ trampoline_pa = (u32)page_to_phys(page);
+
+ order = 2; /* alloc 2^2 pages */
+ page = alloc_pages(flags32, order);
+ if (page == NULL) {
+ pr_err("%s: failed to alloc pt pages\n", __func__);
+ free_page(trampoline_va);
+ return -1;
+ }
+
+ for (i = 0; i < 4; i++, page++)
+ hv_crash_ptpgs[i] = page_to_virt(page);
+
+ hv_crash_build_tramp_pt();
+
+ rc = hv_crash_setup_trampdata(trampoline_va);
+ if (rc)
+ goto errout;
+
+ return 0;
+
+errout:
+ free_page(trampoline_va);
+ free_pages((ulong)hv_crash_ptpgs[0], order);
+
+ return rc;
+}
+
+/* Setup for kdump kexec to collect hypervisor RAM when running as root */
+void hv_root_crash_init(void)
+{
+ int rc;
+ struct hv_input_get_system_property *input;
+ struct hv_output_get_system_property *output;
+ unsigned long flags;
+ u64 status;
+ union hv_pfn_range cda_info;
+
+ if (pgtable_l5_enabled()) {
+ pr_err("Hyper-V: crash dump not yet supported on 5level PTs\n");
+ return;
+ }
+
+ rc = register_nmi_handler(NMI_LOCAL, hv_crash_nmi_local, NMI_FLAG_FIRST,
+ "hv_crash_nmi");
+ if (rc) {
+ pr_err("Hyper-V: failed to register crash nmi handler\n");
+ return;
+ }
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+ memset(input, 0, sizeof(*input));
+ input->property_id = HV_SYSTEM_PROPERTY_CRASHDUMPAREA;
+
+ status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
+ cda_info.as_uint64 = output->hv_cda_info.as_uint64;
+ local_irq_restore(flags);
+
+ if (!hv_result_success(status)) {
+ pr_err("Hyper-V: %s: property:%d %s\n", __func__,
+ input->property_id, hv_result_to_string(status));
+ goto err_out;
+ }
+
+ if (cda_info.base_pfn == 0) {
+ pr_err("Hyper-V: hypervisor crash dump area pfn is 0\n");
+ goto err_out;
+ }
+
+ hv_cda = phys_to_virt(cda_info.base_pfn << HV_HYP_PAGE_SHIFT);
+
+ rc = hv_crash_trampoline_setup();
+ if (rc)
+ goto err_out;
+
+ smp_ops.crash_stop_other_cpus = hv_crash_stop_other_cpus;
+
+ crash_kexec_post_notifiers = true;
+ hv_crash_enabled = true;
+ pr_info("Hyper-V: both Linux and hypervisor kdump support enabled\n");
+
+ return;
+
+err_out:
+ unregister_nmi_handler(NMI_LOCAL, "hv_crash_nmi");
+ pr_err("Hyper-V: only Linux root kdump support enabled\n");
+}
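
As a cross-check of the tramp_gdt.cs64 value above (0x00af9a000000ffff), here is a small standalone sketch that decodes the descriptor fields; it is illustrative only and not part of the patch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned long long d = 0x00af9a000000ffffULL;

	/* limit[15:0] at bits 0-15, limit[19:16] at bits 48-51 */
	printf("limit: 0x%05llx\n", (d & 0xffff) | ((d >> 32) & 0xf0000));
	printf("type:  0x%llx\n", (d >> 40) & 0xf);	/* 0xa: code, execute/read */
	printf("dpl:   %llu\n", (d >> 45) & 3);		/* 0 */
	printf("L bit: %llu\n", (d >> 53) & 1);		/* 1: 64bit code segment */
	printf("D bit: %llu\n", (d >> 54) & 1);		/* 0: must be 0 when L=1 */
	printf("G bit: %llu\n", (d >> 55) & 1);		/* 1: 4K granularity */
	return 0;
}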
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 085ef4f2e73a..14de43f4bc6c 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -170,6 +170,10 @@ static int hv_cpu_init(unsigned int cpu)
wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
}
+ /* Allow Hyper-V stimer vector to be injected from Hypervisor. */
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+ apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, true);
+
return hyperv_init_ghcb();
}
@@ -277,6 +281,9 @@ static int hv_cpu_die(unsigned int cpu)
*ghcb_va = NULL;
}
+ if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+ apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, false);
+
hv_common_cpu_die(cpu);
if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
@@ -551,6 +558,8 @@ void __init hyperv_init(void)
memunmap(src);
hv_remap_tsc_clocksource();
+ hv_root_crash_init();
+ hv_sleep_notifiers_register();
} else {
hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
diff --git a/arch/x86/hyperv/hv_trampoline.S b/arch/x86/hyperv/hv_trampoline.S
new file mode 100644
index 000000000000..25f02ff12286
--- /dev/null
+++ b/arch/x86/hyperv/hv_trampoline.S
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * X86 specific Hyper-V kdump/crash related code.
+ *
+ * Copyright (C) 2025, Microsoft, Inc.
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/alternative.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * void noreturn hv_crash_asm32(arg1)
+ * arg1 == edi == 32bit PA of struct hv_crash_tramp_data
+ *
+ * The hypervisor jumps here upon devirtualization in protected mode. This
+ * code gets copied to a page in the low 4G ie, 32bit space so it can run
+ * in the protected mode. Hence we cannot use any compile/link time offsets or
+ * addresses. It restores long mode via temporary gdt and page tables and
+ * eventually jumps to kernel code entry at HV_CRASHDATA_OFFS_C_entry.
+ *
+ * PreCondition (ie, Hypervisor call back ABI):
+ * o CR0 is set to 0x0021: PE(prot mode) and NE are set, paging is disabled
+ * o CR4 is set to 0x0
+ * o IA32_EFER is set to 0x901 (SCE and NXE are set)
+ * o EDI is set to the Arg passed to HVCALL_DISABLE_HYP_EX.
+ * o CS, DS, ES, FS, GS are all initialized with a base of 0 and limit 0xFFFF
+ * o IDTR, TR and GDTR are initialized with a base of 0 and limit of 0xFFFF
+ * o LDTR is initialized as invalid (limit of 0)
+ * o MSR PAT is power on default.
+ * o Other state/registers are cleared. All TLBs flushed.
+ */
+
+#define HV_CRASHDATA_OFFS_TRAMPCR3 0x0 /* 0 */
+#define HV_CRASHDATA_OFFS_KERNCR3 0x8 /* 8 */
+#define HV_CRASHDATA_OFFS_GDTRLIMIT 0x12 /* 18 */
+#define HV_CRASHDATA_OFFS_CS_JMPTGT 0x28 /* 40 */
+#define HV_CRASHDATA_OFFS_C_entry 0x30 /* 48 */
+
+ .text
+ .code32
+
+SYM_CODE_START(hv_crash_asm32)
+ UNWIND_HINT_UNDEFINED
+ ENDBR
+ movl $X86_CR4_PAE, %ecx
+ movl %ecx, %cr4
+
+ movl %edi, %ebx
+ add $HV_CRASHDATA_OFFS_TRAMPCR3, %ebx
+ movl %cs:(%ebx), %eax
+ movl %eax, %cr3
+
+ /* Setup EFER for long mode now */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ /* Turn paging on using the temp 32bit trampoline page table */
+ movl %cr0, %eax
+ orl $(X86_CR0_PG), %eax
+ movl %eax, %cr0
+
+ /* Since the kernel cr3 could be above 4G, we need to be in long mode
+ * before we can load all 64 bits of the kernel cr3. We use a temp gdt
+ * for that, with CS.L=1 and CS.D=0.
+ */
+ mov %edi, %eax
+ add $HV_CRASHDATA_OFFS_GDTRLIMIT, %eax
+ lgdtl %cs:(%eax)
+
+ /* not done yet, restore CS now to switch to CS.L=1 */
+ mov %edi, %eax
+ add $HV_CRASHDATA_OFFS_CS_JMPTGT, %eax
+ ljmp %cs:*(%eax)
+SYM_CODE_END(hv_crash_asm32)
+
+ /* we now run in full 64bit IA32-e long mode, CS.L=1 and CS.D=0 */
+ .code64
+ .balign 8
+SYM_CODE_START(hv_crash_asm64)
+ UNWIND_HINT_UNDEFINED
+ ENDBR
+ /* restore kernel page tables so we can jump to kernel code */
+ mov %edi, %eax
+ add $HV_CRASHDATA_OFFS_KERNCR3, %eax
+ movq %cs:(%eax), %rbx
+ movq %rbx, %cr3
+
+ mov %edi, %eax
+ add $HV_CRASHDATA_OFFS_C_entry, %eax
+ movq %cs:(%eax), %rbx
+ ANNOTATE_RETPOLINE_SAFE
+ jmp *%rbx
+
+ int $3
+
+SYM_INNER_LABEL(hv_crash_asm_end, SYM_L_GLOBAL)
+SYM_CODE_END(hv_crash_asm64)
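
The HV_CRASHDATA_OFFS_* constants above must stay in sync with struct hv_crash_tramp_data in hv_crash.c; that is what the BUILD_BUG_ON()s there enforce. A standalone sketch of the same layout checks (types redeclared here purely for illustration):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct hv_gdtreg_32 { uint16_t fill, limit; uint32_t address; } __attribute__((packed));
struct hv_crash_tramp_gdt { uint64_t null, cs64; } __attribute__((packed));
struct hv_cs_jmptgt { uint32_t address; uint16_t csval, fill; } __attribute__((packed));

struct hv_crash_tramp_data {
	uint64_t tramp32_cr3;			/* HV_CRASHDATA_OFFS_TRAMPCR3 0x00 */
	uint64_t kernel_cr3;			/* HV_CRASHDATA_OFFS_KERNCR3  0x08 */
	struct hv_gdtreg_32 gdtr32;
	struct hv_crash_tramp_gdt tramp_gdt;
	struct hv_cs_jmptgt cs_jmptgt;
	uint64_t c_entry_addr;			/* HV_CRASHDATA_OFFS_C_entry  0x30 */
} __attribute__((packed));

static_assert(offsetof(struct hv_crash_tramp_data, gdtr32.limit) == 0x12,
	      "HV_CRASHDATA_OFFS_GDTRLIMIT");
static_assert(offsetof(struct hv_crash_tramp_data, cs_jmptgt.address) == 0x28,
	      "HV_CRASHDATA_OFFS_CS_JMPTGT");
static_assert(offsetof(struct hv_crash_tramp_data, c_entry_addr) == 0x30,
	      "HV_CRASHDATA_OFFS_C_entry");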
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 042e8712d8de..c0edaed0efb3 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -9,12 +9,17 @@
#include <asm/apic.h>
#include <asm/boot.h>
#include <asm/desc.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/types.h>
#include <asm/i8259.h>
#include <asm/mshyperv.h>
#include <asm/msr.h>
#include <asm/realmode.h>
#include <asm/reboot.h>
+#include <asm/smap.h>
+#include <linux/export.h>
#include <../kernel/smpboot.h>
+#include "../../kernel/fpu/legacy.h"
extern struct boot_params boot_params;
static struct real_mode_header hv_vtl_real_mode_header;
@@ -249,3 +254,28 @@ int __init hv_vtl_early_init(void)
return 0;
}
+
+DEFINE_STATIC_CALL_NULL(__mshv_vtl_return_hypercall, void (*)(void));
+
+void mshv_vtl_return_call_init(u64 vtl_return_offset)
+{
+ static_call_update(__mshv_vtl_return_hypercall,
+ (void *)((u8 *)hv_hypercall_pg + vtl_return_offset));
+}
+EXPORT_SYMBOL(mshv_vtl_return_call_init);
+
+void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0)
+{
+ struct hv_vp_assist_page *hvp;
+
+ hvp = hv_vp_assist_page[smp_processor_id()];
+ hvp->vtl_ret_x64rax = vtl0->rax;
+ hvp->vtl_ret_x64rcx = vtl0->rcx;
+
+ kernel_fpu_begin_mask(0);
+ fxrstor(&vtl0->fx_state);
+ __mshv_vtl_return_call(vtl0);
+ fxsave(&vtl0->fx_state);
+ kernel_fpu_end();
+}
+EXPORT_SYMBOL(mshv_vtl_return_call);
diff --git a/arch/x86/hyperv/mshv-asm-offsets.c b/arch/x86/hyperv/mshv-asm-offsets.c
new file mode 100644
index 000000000000..882c1db6df16
--- /dev/null
+++ b/arch/x86/hyperv/mshv-asm-offsets.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ *
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Author:
+ * Naman Jain <namjain@microsoft.com>
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/kbuild.h>
+#include <asm/mshyperv.h>
+
+static void __used common(void)
+{
+ if (IS_ENABLED(CONFIG_HYPERV_VTL_MODE)) {
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rax, mshv_vtl_cpu_context, rax);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rcx, mshv_vtl_cpu_context, rcx);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rdx, mshv_vtl_cpu_context, rdx);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rbx, mshv_vtl_cpu_context, rbx);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rbp, mshv_vtl_cpu_context, rbp);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rsi, mshv_vtl_cpu_context, rsi);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_rdi, mshv_vtl_cpu_context, rdi);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r8, mshv_vtl_cpu_context, r8);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r9, mshv_vtl_cpu_context, r9);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r10, mshv_vtl_cpu_context, r10);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r11, mshv_vtl_cpu_context, r11);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r12, mshv_vtl_cpu_context, r12);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r13, mshv_vtl_cpu_context, r13);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r14, mshv_vtl_cpu_context, r14);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_r15, mshv_vtl_cpu_context, r15);
+ OFFSET(MSHV_VTL_CPU_CONTEXT_cr2, mshv_vtl_cpu_context, cr2);
+ }
+}
diff --git a/arch/x86/hyperv/mshv_vtl_asm.S b/arch/x86/hyperv/mshv_vtl_asm.S
new file mode 100644
index 000000000000..f595eefad9ab
--- /dev/null
+++ b/arch/x86/hyperv/mshv_vtl_asm.S
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Assembly level code for mshv_vtl VTL transition
+ *
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Author:
+ * Naman Jain <namjain@microsoft.com>
+ */
+
+#include <linux/linkage.h>
+#include <linux/static_call_types.h>
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/frame.h>
+#include "mshv-asm-offsets.h"
+
+ .text
+ .section .noinstr.text, "ax"
+/*
+ * void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0)
+ *
+ * This function is used to context switch between different Virtual Trust Levels.
+ * It is marked as 'noinstr' to prevent against instrumentation and debugging facilities.
+ * NMIs aren't a problem because the NMI handler saves/restores CR2 specifically to guard
+ * against #PFs in NMI context clobbering the guest state.
+ */
+SYM_FUNC_START(__mshv_vtl_return_call)
+ /* Push callee save registers */
+ pushq %rbp
+ mov %rsp, %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rbx
+
+ /* register switch to VTL0 clobbers all registers except rax/rcx */
+ mov %_ASM_ARG1, %rax
+
+ /* grab rbx/rbp/rsi/rdi/r8-r15 */
+ mov MSHV_VTL_CPU_CONTEXT_rbx(%rax), %rbx
+ mov MSHV_VTL_CPU_CONTEXT_rbp(%rax), %rbp
+ mov MSHV_VTL_CPU_CONTEXT_rsi(%rax), %rsi
+ mov MSHV_VTL_CPU_CONTEXT_rdi(%rax), %rdi
+ mov MSHV_VTL_CPU_CONTEXT_r8(%rax), %r8
+ mov MSHV_VTL_CPU_CONTEXT_r9(%rax), %r9
+ mov MSHV_VTL_CPU_CONTEXT_r10(%rax), %r10
+ mov MSHV_VTL_CPU_CONTEXT_r11(%rax), %r11
+ mov MSHV_VTL_CPU_CONTEXT_r12(%rax), %r12
+ mov MSHV_VTL_CPU_CONTEXT_r13(%rax), %r13
+ mov MSHV_VTL_CPU_CONTEXT_r14(%rax), %r14
+ mov MSHV_VTL_CPU_CONTEXT_r15(%rax), %r15
+
+ mov MSHV_VTL_CPU_CONTEXT_cr2(%rax), %rdx
+ mov %rdx, %cr2
+ mov MSHV_VTL_CPU_CONTEXT_rdx(%rax), %rdx
+
+ /* stash host registers on stack */
+ pushq %rax
+ pushq %rcx
+
+ xor %ecx, %ecx
+
+ /* make a hypercall to switch VTL */
+ call STATIC_CALL_TRAMP_STR(__mshv_vtl_return_hypercall)
+
+ /* stash guest registers on stack, restore saved host copies */
+ pushq %rax
+ pushq %rcx
+ mov 16(%rsp), %rcx
+ mov 24(%rsp), %rax
+
+ mov %rdx, MSHV_VTL_CPU_CONTEXT_rdx(%rax)
+ mov %cr2, %rdx
+ mov %rdx, MSHV_VTL_CPU_CONTEXT_cr2(%rax)
+ pop MSHV_VTL_CPU_CONTEXT_rcx(%rax)
+ pop MSHV_VTL_CPU_CONTEXT_rax(%rax)
+ add $16, %rsp
+
+ /* save rbx/rbp/rsi/rdi/r8-r15 */
+ mov %rbx, MSHV_VTL_CPU_CONTEXT_rbx(%rax)
+ mov %rbp, MSHV_VTL_CPU_CONTEXT_rbp(%rax)
+ mov %rsi, MSHV_VTL_CPU_CONTEXT_rsi(%rax)
+ mov %rdi, MSHV_VTL_CPU_CONTEXT_rdi(%rax)
+ mov %r8, MSHV_VTL_CPU_CONTEXT_r8(%rax)
+ mov %r9, MSHV_VTL_CPU_CONTEXT_r9(%rax)
+ mov %r10, MSHV_VTL_CPU_CONTEXT_r10(%rax)
+ mov %r11, MSHV_VTL_CPU_CONTEXT_r11(%rax)
+ mov %r12, MSHV_VTL_CPU_CONTEXT_r12(%rax)
+ mov %r13, MSHV_VTL_CPU_CONTEXT_r13(%rax)
+ mov %r14, MSHV_VTL_CPU_CONTEXT_r14(%rax)
+ mov %r15, MSHV_VTL_CPU_CONTEXT_r15(%rax)
+
+ /* pop callee-save registers r12-r15, rbx */
+ pop %rbx
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+
+ pop %rbp
+ RET
+SYM_FUNC_END(__mshv_vtl_return_call)
+/*
+ * Make sure the static_call_key symbol __SCK____mshv_vtl_return_hypercall is accessible here.
+ * The code below is inspired by the __ADDRESSABLE(sym) macro. The symbol name is kept simple to avoid
+ * naming it something like "__UNIQUE_ID_addressable___SCK____mshv_vtl_return_hypercall_662.0"
+ * which would otherwise have been generated by the macro.
+ */
+ .section .discard.addressable,"aw"
+ .align 8
+ .type mshv_vtl_return_sym, @object
+ .size mshv_vtl_return_sym, 8
+mshv_vtl_return_sym:
+ .quad __SCK____mshv_vtl_return_hypercall
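
A C-level equivalent of the hand-rolled .discard.addressable entry above, sketched for illustration only (this is roughly what __ADDRESSABLE(sym) expands to, minus the unique name mangling):

/* illustrative sketch, not part of the patch */
static void * __attribute__((__used__, __section__(".discard.addressable")))
mshv_vtl_return_sym = (void *)&__SCK____mshv_vtl_return_hypercall;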
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index b14c045679e1..03364510d5fe 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -197,8 +197,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
"773:\n"
#define ALTINSTR_ENTRY(ft_flags) \
- ".pushsection .altinstructions,\"a\"\n" \
- ANNOTATE_DATA_SPECIAL \
+ ".pushsection .altinstructions, \"aM\", @progbits, " \
+ __stringify(ALT_INSTR_SIZE) "\n" \
" .long 771b - .\n" /* label */ \
" .long 774f - .\n" /* new instruction */ \
" .4byte " __stringify(ft_flags) "\n" /* feature + flags */ \
@@ -208,7 +208,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define ALTINSTR_REPLACEMENT(newinstr) /* replacement */ \
".pushsection .altinstr_replacement, \"ax\"\n" \
- ANNOTATE_DATA_SPECIAL \
+ ANNOTATE_DATA_SPECIAL "\n" \
"# ALT: replacement\n" \
"774:\n\t" newinstr "\n775:\n" \
".popsection\n"
@@ -339,7 +339,6 @@ void nop_func(void);
* instruction. See apply_alternatives().
*/
.macro altinstr_entry orig alt ft_flags orig_len alt_len
- ANNOTATE_DATA_SPECIAL
.long \orig - .
.long \alt - .
.4byte \ft_flags
@@ -363,7 +362,7 @@ void nop_func(void);
741: \
.skip -(((744f-743f)-(741b-740b)) > 0) * ((744f-743f)-(741b-740b)),0x90 ;\
742: \
- .pushsection .altinstructions,"a" ; \
+ .pushsection .altinstructions, "aM", @progbits, ALT_INSTR_SIZE ;\
altinstr_entry 740b,743f,flag,742b-740b,744f-743f ; \
.popsection ; \
.pushsection .altinstr_replacement,"ax" ; \
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index bd62bd87a841..0e8c611bc9e2 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -126,18 +126,21 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
#ifdef __KERNEL__
+#ifndef COMPILE_OFFSETS
+#include <asm/asm-offsets.h>
+#endif
+
# include <asm/extable_fixup_types.h>
/* Exception table entry */
#ifdef __ASSEMBLER__
-# define _ASM_EXTABLE_TYPE(from, to, type) \
- .pushsection "__ex_table","a" ; \
- .balign 4 ; \
- ANNOTATE_DATA_SPECIAL ; \
- .long (from) - . ; \
- .long (to) - . ; \
- .long type ; \
+# define _ASM_EXTABLE_TYPE(from, to, type) \
+ .pushsection "__ex_table", "aM", @progbits, EXTABLE_SIZE ; \
+ .balign 4 ; \
+ .long (from) - . ; \
+ .long (to) - . ; \
+ .long type ; \
.popsection
# ifdef CONFIG_KPROBES
@@ -180,18 +183,18 @@ static __always_inline __pure void *rip_rel_ptr(void *p)
".purgem extable_type_reg\n"
# define _ASM_EXTABLE_TYPE(from, to, type) \
- " .pushsection \"__ex_table\",\"a\"\n" \
+ " .pushsection __ex_table, \"aM\", @progbits, " \
+ __stringify(EXTABLE_SIZE) "\n" \
" .balign 4\n" \
- ANNOTATE_DATA_SPECIAL \
" .long (" #from ") - .\n" \
" .long (" #to ") - .\n" \
" .long " __stringify(type) " \n" \
" .popsection\n"
# define _ASM_EXTABLE_TYPE_REG(from, to, type, reg) \
- " .pushsection \"__ex_table\",\"a\"\n" \
+ " .pushsection __ex_table, \"aM\", @progbits, " \
+ __stringify(EXTABLE_SIZE) "\n" \
" .balign 4\n" \
- ANNOTATE_DATA_SPECIAL \
" .long (" #from ") - .\n" \
" .long (" #to ") - .\n" \
DEFINE_EXTABLE_TYPE_REG \
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index ab5bba6cf7f5..9b4e04690e1a 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -15,7 +15,7 @@ extern void __WARN_trap(struct bug_entry *bug, ...);
/*
* Despite that some emulators terminate on UD2, we use it for WARN().
*/
-#define ASM_UD2 _ASM_BYTES(0x0f, 0x0b)
+#define ASM_UD2 __ASM_FORM(ud2)
#define INSN_UD2 0x0b0f
#define LEN_UD2 2
@@ -70,7 +70,7 @@ extern void __WARN_trap(struct bug_entry *bug, ...);
#define _BUG_FLAGS_ASM(format, file, line, flags, size, extra) \
".pushsection __bug_table,\"aw\"\n\t" \
- ANNOTATE_DATA_SPECIAL \
+ ANNOTATE_DATA_SPECIAL "\n\t" \
"2:\n\t" \
__BUG_ENTRY(format, file, line, flags) \
"\t.org 2b + " size "\n" \
@@ -129,7 +129,7 @@ do { \
#define __WARN_FLAGS(cond_str, flags) \
do { \
- __auto_type __flags = BUGFLAG_WARNING|(flags); \
+ auto __flags = BUGFLAG_WARNING|(flags); \
instrumentation_begin(); \
_BUG_FLAGS(cond_str, ASM_UD2, __flags, ARCH_WARN_REACHABLE); \
instrumentation_end(); \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 4b1a6ade1700..3ddc1d33399b 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -101,7 +101,7 @@ static __always_inline bool _static_cpu_has(u16 bit)
asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]")
".pushsection .altinstr_aux,\"ax\"\n"
"6:\n"
- ANNOTATE_DATA_SPECIAL
+ ANNOTATE_DATA_SPECIAL "\n"
" testb %[bitnum], %a[cap_byte]\n"
" jnz %l[t_yes]\n"
" jmp %l[t_no]\n"
diff --git a/arch/x86/include/asm/cpumask.h b/arch/x86/include/asm/cpumask.h
index 70f6b60ad67b..9df9e9cde670 100644
--- a/arch/x86/include/asm/cpumask.h
+++ b/arch/x86/include/asm/cpumask.h
@@ -2,6 +2,8 @@
#ifndef _ASM_X86_CPUMASK_H
#define _ASM_X86_CPUMASK_H
#ifndef __ASSEMBLER__
+
+#include <linux/compiler.h>
#include <linux/cpumask.h>
extern void setup_cpu_local_masks(void);
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9931e4c7d73f..30fd06ede751 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -60,6 +60,12 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
}
#define div_u64_rem div_u64_rem
+/*
+ * gcc tends to zero extend 32bit values and do full 64bit maths.
+ * Define asm functions that avoid this.
+ * (clang generates better code for the C versions.)
+ */
+#ifndef __clang__
static inline u64 mul_u32_u32(u32 a, u32 b)
{
u32 high, low;
@@ -71,6 +77,19 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
}
#define mul_u32_u32 mul_u32_u32
+static inline u64 add_u64_u32(u64 a, u32 b)
+{
+ u32 high = a >> 32, low = a;
+
+ asm ("addl %[b], %[low]; adcl $0, %[high]"
+ : [low] "+r" (low), [high] "+r" (high)
+ : [b] "rm" (b) );
+
+ return low | (u64)high << 32;
+}
+#define add_u64_u32 add_u64_u32
+#endif
+
/*
* __div64_32() is never called on x86, so prevent the
* generic definition from getting built.
@@ -84,21 +103,25 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
* Will generate an #DE when the result doesn't fit u64, could fix with an
* __ex_table[] entry when it becomes an issue.
*/
-static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
+static inline u64 mul_u64_add_u64_div_u64(u64 rax, u64 mul, u64 add, u64 div)
{
- u64 q;
+ u64 rdx;
+
+ asm ("mulq %[mul]" : "+a" (rax), "=d" (rdx) : [mul] "rm" (mul));
+
+ if (!statically_true(!add))
+ asm ("addq %[add], %[lo]; adcq $0, %[hi]" :
+ [lo] "+r" (rax), [hi] "+r" (rdx) : [add] "irm" (add));
- asm ("mulq %2; divq %3" : "=a" (q)
- : "a" (a), "rm" (mul), "rm" (div)
- : "rdx");
+ asm ("divq %[div]" : "+a" (rax), "+d" (rdx) : [div] "rm" (div));
- return q;
+ return rax;
}
-#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
+#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
{
- return mul_u64_u64_div_u64(a, mul, div);
+ return mul_u64_add_u64_div_u64(a, mul, 0, div);
}
#define mul_u64_u32_div mul_u64_u32_div
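
For readers checking the asm, the intended semantics of mul_u64_add_u64_div_u64() can be modeled portably with 128-bit arithmetic. A behavioral sketch (one difference: the mulq/divq version raises #DE when div is 0 or when the quotient does not fit in a u64, while this model truncates silently):

#include <stdint.h>

static inline uint64_t mul_u64_add_u64_div_u64_model(uint64_t a, uint64_t mul,
						     uint64_t add, uint64_t div)
{
	/* 128-bit intermediate: no overflow on the multiply-accumulate */
	unsigned __int128 t = (unsigned __int128)a * mul + add;

	return (uint64_t)(t / div);
}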
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 5a0d42464d44..4e55d1755846 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -87,4 +87,11 @@ static inline void panic_if_irq_remap(const char *msg)
}
#endif /* CONFIG_IRQ_REMAP */
+
+#ifdef CONFIG_X86_POSTED_MSI
+void intel_ack_posted_msi_irq(struct irq_data *irqd);
+#else
+#define intel_ack_posted_msi_irq NULL
+#endif
+
#endif /* __X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 735c3a491f60..8325b79f2ac6 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -101,7 +101,7 @@
#define ASM_CALL_ARG0 \
"1: call %c[__func] \n" \
- ANNOTATE_REACHABLE(1b)
+ ANNOTATE_REACHABLE(1b) " \n"
#define ASM_CALL_ARG1 \
"movq %[arg1], %%rdi \n" \
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index b30e5474c18e..a1193e9d65f2 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -25,7 +25,7 @@ extern __always_inline unsigned long native_save_fl(void)
*/
asm volatile("# __raw_save_flags\n\t"
"pushf ; pop %0"
- : "=rm" (flags)
+ : ASM_OUTPUT_RM (flags)
: /* no input */
: "memory");
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index e0a6930a4029..05b16299588d 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -15,7 +15,7 @@
#define JUMP_TABLE_ENTRY(key, label) \
".pushsection __jump_table, \"aw\" \n\t" \
_ASM_ALIGN "\n\t" \
- ANNOTATE_DATA_SPECIAL \
+ ANNOTATE_DATA_SPECIAL "\n" \
".long 1b - . \n\t" \
".long " label " - . \n\t" \
_ASM_PTR " " key " - . \n\t" \
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 605abd02158d..eef4c3a5ba28 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -11,6 +11,7 @@
#include <asm/paravirt.h>
#include <asm/msr.h>
#include <hyperv/hvhdk.h>
+#include <asm/fpu/types.h>
/*
* Hyper-V always provides a single IO-APIC at this MMIO address.
@@ -176,6 +177,8 @@ int hyperv_flush_guest_mapping_range(u64 as,
int hyperv_fill_flush_guest_mapping_list(
struct hv_guest_mapping_flush_list *flush,
u64 start_gfn, u64 end_gfn);
+void hv_sleep_notifiers_register(void);
+void hv_machine_power_off(void);
#ifdef CONFIG_X86_64
void hv_apic_init(void);
@@ -237,6 +240,15 @@ static __always_inline u64 hv_raw_get_msr(unsigned int reg)
}
int hv_apicid_to_vp_index(u32 apic_id);
+#if IS_ENABLED(CONFIG_MSHV_ROOT) && IS_ENABLED(CONFIG_CRASH_DUMP)
+void hv_root_crash_init(void);
+void hv_crash_asm32(void);
+void hv_crash_asm64(void);
+void hv_crash_asm_end(void);
+#else /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */
+static inline void hv_root_crash_init(void) {}
+#endif /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */
+
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
static inline void hyperv_setup_mmu_ops(void) {}
@@ -260,13 +272,46 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; }
static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; }
#endif /* CONFIG_HYPERV */
+struct mshv_vtl_cpu_context {
+ union {
+ struct {
+ u64 rax;
+ u64 rcx;
+ u64 rdx;
+ u64 rbx;
+ u64 cr2;
+ u64 rbp;
+ u64 rsi;
+ u64 rdi;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+ };
+ u64 gp_regs[16];
+ };
+
+ struct fxregs_state fx_state;
+};
#ifdef CONFIG_HYPERV_VTL_MODE
void __init hv_vtl_init_platform(void);
int __init hv_vtl_early_init(void);
+void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0);
+void mshv_vtl_return_call_init(u64 vtl_return_offset);
+void mshv_vtl_return_hypercall(void);
+void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0);
#else
static inline void __init hv_vtl_init_platform(void) {}
static inline int __init hv_vtl_early_init(void) { return 0; }
+static inline void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
+static inline void mshv_vtl_return_call_init(u64 vtl_return_offset) {}
+static inline void mshv_vtl_return_hypercall(void) {}
+static inline void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
#endif
#include <asm-generic/mshyperv.h>
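
The union in struct mshv_vtl_cpu_context lets the 16 named registers also be addressed as gp_regs[0..15]. A quick offsetof sketch of the aliasing (illustrative, compilable only in kernel context against the struct above); note that cr2 occupies slot 4, where rsp would sit in the usual x86 GPR encoding order:

#include <stddef.h>
#include <asm/mshyperv.h>

/* illustrative compile-time checks of the union aliasing */
static_assert(offsetof(struct mshv_vtl_cpu_context, cr2) ==
	      offsetof(struct mshv_vtl_cpu_context, gp_regs[4]),
	      "cr2 sits in gp_regs slot 4");
static_assert(offsetof(struct mshv_vtl_cpu_context, r15) ==
	      offsetof(struct mshv_vtl_cpu_context, gp_regs[15]),
	      "named GPRs and gp_regs[] stay in sync");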
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index a6526c5be5ca..4f4b5e8a1574 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -466,7 +466,7 @@ static inline void call_depth_return_thunk(void) {}
*/
# define CALL_NOSPEC \
ALTERNATIVE_2( \
- ANNOTATE_RETPOLINE_SAFE \
+ ANNOTATE_RETPOLINE_SAFE "\n" \
"call *%[thunk_target]\n", \
" jmp 904f;\n" \
" .align 16\n" \
@@ -482,7 +482,7 @@ static inline void call_depth_return_thunk(void) {}
"904: call 901b;\n", \
X86_FEATURE_RETPOLINE, \
"lfence;\n" \
- ANNOTATE_RETPOLINE_SAFE \
+ ANNOTATE_RETPOLINE_SAFE "\n" \
"call *%[thunk_target]\n", \
X86_FEATURE_RETPOLINE_LFENCE)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 37a8627d8277..3502939415ad 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -249,7 +249,7 @@ extern struct paravirt_patch_template pv_ops;
* don't need to bother with CFI prefixes.
*/
#define PARAVIRT_CALL \
- ANNOTATE_RETPOLINE_SAFE \
+ ANNOTATE_RETPOLINE_SAFE "\n\t" \
"call *%[paravirt_opptr];"
/*
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 20a3baae9568..977bef14a0ab 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -77,7 +77,7 @@ static __always_inline unsigned long smap_save(void)
unsigned long flags;
asm volatile ("# smap_save\n\t"
- ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE
+ ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE "\n\t"
"", "pushf; pop %0; clac",
X86_FEATURE_SMAP)
: "=rm" (flags) : : "memory", "cc");
@@ -88,7 +88,7 @@ static __always_inline unsigned long smap_save(void)
static __always_inline void smap_restore(unsigned long flags)
{
asm volatile ("# smap_restore\n\t"
- ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE
+ ALTERNATIVE(ANNOTATE_IGNORE_ALTERNATIVE "\n\t"
"", "push %0; popf",
X86_FEATURE_SMAP)
: : "g" (flags) : "memory", "cc");
@@ -101,9 +101,9 @@ static __always_inline void smap_restore(unsigned long flags)
ALTERNATIVE("", "stac", X86_FEATURE_SMAP)
#define ASM_CLAC_UNSAFE \
- ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "clac", X86_FEATURE_SMAP)
+ ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "\n\t" "clac", X86_FEATURE_SMAP)
#define ASM_STAC_UNSAFE \
- ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "stac", X86_FEATURE_SMAP)
+ ALTERNATIVE("", ANNOTATE_IGNORE_ALTERNATIVE "\n\t" "stac", X86_FEATURE_SMAP)
#endif /* __ASSEMBLER__ */
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index 41502bd2afd6..4cd725a8fe91 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -36,7 +36,7 @@
".align 4 \n" \
".globl " STATIC_CALL_TRAMP_STR(name) " \n" \
STATIC_CALL_TRAMP_STR(name) ": \n" \
- ANNOTATE_NOENDBR \
+ ANNOTATE_NOENDBR " \n" \
insns " \n" \
".byte 0x0f, 0xb9, 0xcc \n" \
".type " STATIC_CALL_TRAMP_STR(name) ", @function \n" \
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 79e9695dc13e..4635616863f5 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -31,7 +31,7 @@ KCFI_REFERENCE(__memset);
#define __HAVE_ARCH_MEMSET16
static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
{
- const __auto_type s0 = s;
+ const auto s0 = s;
asm volatile (
"rep stosw"
: "+D" (s), "+c" (n)
@@ -44,7 +44,7 @@ static inline void *memset16(uint16_t *s, uint16_t v, size_t n)
#define __HAVE_ARCH_MEMSET32
static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
{
- const __auto_type s0 = s;
+ const auto s0 = s;
asm volatile (
"rep stosl"
: "+D" (s), "+c" (n)
@@ -57,7 +57,7 @@ static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
#define __HAVE_ARCH_MEMSET64
static inline void *memset64(uint64_t *s, uint64_t v, size_t n)
{
- const __auto_type s0 = s;
+ const auto s0 = s;
asm volatile (
"rep stosq"
: "+D" (s), "+c" (n)
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 641f45c22f9d..915124011c27 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -72,7 +72,7 @@ static inline void __user *mask_user_address(const void __user *ptr)
return ret;
}
#define masked_user_access_begin(x) ({ \
- __auto_type __masked_ptr = (x); \
+ auto __masked_ptr = (x); \
__masked_ptr = mask_user_address(__masked_ptr); \
__uaccess_begin(); __masked_ptr; })
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 6989b824fd32..d0b62e255290 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -122,7 +122,7 @@ struct uv_systab {
struct {
u32 type:8; /* type of entry */
u32 offset:24; /* byte offset from struct start to entry */
- } entry[1]; /* additional entries follow */
+ } entry[]; /* additional entries follow */
};
extern struct uv_systab *uv_systab;
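
The entry[1] to entry[] change is the standard flexible-array-member conversion: sizeof(struct uv_systab) no longer includes a phantom first element, and indexing past the header becomes well-defined for bounds-checking tools. A reduced standalone sketch (field names illustrative):

#include <assert.h>
#include <stdint.h>

struct systab_like {
	uint32_t revision;
	struct {
		uint32_t type:8;
		uint32_t offset:24;
	} entry[];			/* flexible array member */
};

/* the header size no longer includes a bogus entry[0] */
static_assert(sizeof(struct systab_like) == sizeof(uint32_t),
	      "sizeof() covers only the fixed header");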
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 36698cc9fb44..6c8a6ead84f6 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -79,7 +79,7 @@ struct x86_init_paging {
/**
* struct x86_init_timers - platform specific timer setup
- * @setup_perpcu_clockev: set up the per cpu clock event device for the
+ * @setup_percpu_clockev: set up the per cpu clock event device for the
* boot cpu
* @timer_init: initialize the platform timer (default PIT/HPET)
* @wallclock_init: init the wallclock device
@@ -132,7 +132,7 @@ struct x86_hyper_init {
/**
* struct x86_init_acpi - x86 ACPI init functions
- * @set_root_poitner: set RSDP address
+ * @set_root_pointer: set RSDP address
* @get_root_pointer: get RSDP address
* @reduced_hw_early_init: hardware reduced platform early init
*/
@@ -145,14 +145,14 @@ struct x86_init_acpi {
/**
* struct x86_guest - Functions used by misc guest incarnations like SEV, TDX, etc.
*
- * @enc_status_change_prepare Notify HV before the encryption status of a range is changed
- * @enc_status_change_finish Notify HV after the encryption status of a range is changed
- * @enc_tlb_flush_required Returns true if a TLB flush is needed before changing page encryption status
- * @enc_cache_flush_required Returns true if a cache flush is needed before changing page encryption status
- * @enc_kexec_begin Begin the two-step process of converting shared memory back
+ * @enc_status_change_prepare: Notify HV before the encryption status of a range is changed
+ * @enc_status_change_finish: Notify HV after the encryption status of a range is changed
+ * @enc_tlb_flush_required: Returns true if a TLB flush is needed before changing page encryption status
+ * @enc_cache_flush_required: Returns true if a cache flush is needed before changing page encryption status
+ * @enc_kexec_begin: Begin the two-step process of converting shared memory back
* to private. It stops the new conversions from being started
* and waits in-flight conversions to finish, if possible.
- * @enc_kexec_finish Finish the two-step process of converting shared memory to
+ * @enc_kexec_finish: Finish the two-step process of converting shared memory to
* private. All memory is private after the call when
* the function returns.
* It is called on only one CPU while the others are shut down
@@ -229,7 +229,7 @@ struct x86_legacy_devices {
* given platform/subarch.
* @X86_LEGACY_I8042_FIRMWARE_ABSENT: firmware reports that the controller
* is absent.
- * @X86_LEGACY_i8042_EXPECTED_PRESENT: the controller is likely to be
+ * @X86_LEGACY_I8042_EXPECTED_PRESENT: the controller is likely to be
* present, the i8042 driver should probe for controller existence.
*/
enum x86_legacy_i8042_state {
@@ -244,6 +244,8 @@ enum x86_legacy_i8042_state {
* @i8042: indicated if we expect the device to have i8042 controller
* present.
* @rtc: this device has a CMOS real-time clock present
+ * @warm_reset: 1 if platform allows warm reset, else 0
+ * @no_vga: 1 if (FADT.boot_flags & ACPI_FADT_NO_VGA) is set, else 0
* @reserve_bios_regions: boot code will search for the EBDA address and the
* start of the 640k - 1M BIOS region. If false, the platform must
* ensure that its memory map correctly reserves sub-1MB regions as needed.
@@ -290,9 +292,10 @@ struct x86_hyper_runtime {
* @calibrate_tsc: calibrate TSC, if different from CPU
* @get_wallclock: get time from HW clock like RTC etc.
* @set_wallclock: set time back to HW clock
- * @is_untracked_pat_range exclude from PAT logic
- * @nmi_init enable NMI on cpus
- * @get_nmi_reason get the reason an NMI was received
+ * @iommu_shutdown: set by an IOMMU driver for shutdown if necessary
+ * @is_untracked_pat_range: exclude from PAT logic
+ * @nmi_init: enable NMI on cpus
+ * @get_nmi_reason: get the reason an NMI was received
* @save_sched_clock_state: save state for sched_clock() on suspend
* @restore_sched_clock_state: restore state for sched_clock() on resume
* @apic_post_init: adjust apic if needed
@@ -307,6 +310,7 @@ struct x86_hyper_runtime {
* @realmode_reserve: reserve memory for realmode trampoline
* @realmode_init: initialize realmode trampoline
* @hyper: x86 hypervisor specific runtime callbacks
+ * @guest: guest incarnations callbacks
*/
struct x86_platform_ops {
unsigned long (*calibrate_cpu)(void);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 74f4c659f9c9..28518371d8bf 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2229,7 +2229,7 @@ asm (
" .pushsection .init.text, \"ax\", @progbits\n"
" .type int3_selftest_asm, @function\n"
"int3_selftest_asm:\n"
- ANNOTATE_NOENDBR
+ ANNOTATE_NOENDBR "\n"
/*
* INT3 padded with NOP to CALL_INSN_SIZE. The INT3 triggers an
* exception, then the int3_exception_nb notifier emulates a call to
@@ -2247,7 +2247,7 @@ asm (
" .pushsection .init.text, \"ax\", @progbits\n"
" .type int3_selftest_callee, @function\n"
"int3_selftest_callee:\n"
- ANNOTATE_NOENDBR
+ ANNOTATE_NOENDBR "\n"
" movl $0x1234, (%" _ASM_ARG1 ")\n"
ASM_RET
" .size int3_selftest_callee, . - int3_selftest_callee\n"
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index e6e68a31634c..e8000a56732e 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -222,13 +222,14 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
}
/* Map a single area into the IOMMU */
-static dma_addr_t gart_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
+static dma_addr_t gart_map_phys(struct device *dev, phys_addr_t paddr,
+ size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
unsigned long bus;
- phys_addr_t paddr = page_to_phys(page) + offset;
+
+ if (unlikely(attrs & DMA_ATTR_MMIO))
+ return DMA_MAPPING_ERROR;
if (!need_iommu(dev, paddr, size))
return paddr;
@@ -242,7 +243,7 @@ static dma_addr_t gart_map_page(struct device *dev, struct page *page,
/*
* Free a DMA mapping.
*/
-static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
+static void gart_unmap_phys(struct device *dev, dma_addr_t dma_addr,
size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@@ -282,7 +283,7 @@ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
for_each_sg(sg, s, nents, i) {
if (!s->dma_length || !s->length)
break;
- gart_unmap_page(dev, s->dma_address, s->dma_length, dir, 0);
+ gart_unmap_phys(dev, s->dma_address, s->dma_length, dir, 0);
}
}
@@ -487,7 +488,7 @@ static void
gart_free_coherent(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_addr, unsigned long attrs)
{
- gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0);
+ gart_unmap_phys(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0);
dma_direct_free(dev, size, vaddr, dma_addr, attrs);
}
@@ -672,8 +673,8 @@ static __init int init_amd_gatt(struct agp_kern_info *info)
static const struct dma_map_ops gart_dma_ops = {
.map_sg = gart_map_sg,
.unmap_sg = gart_unmap_sg,
- .map_page = gart_map_page,
- .unmap_page = gart_unmap_page,
+ .map_phys = gart_map_phys,
+ .unmap_phys = gart_unmap_phys,
.alloc = gart_alloc_coherent,
.free = gart_free_coherent,
.mmap = dma_common_mmap,
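For comparison, the retired page-based entry point is expressible as a thin wrapper over the new phys-based one; this wrapper is illustrative only and does not exist in the file:

	static dma_addr_t old_style_map_page(struct device *dev, struct page *page,
					     unsigned long offset, size_t size,
					     enum dma_data_direction dir,
					     unsigned long attrs)
	{
		/* the same paddr computation the old gart_map_page() performed */
		return gart_map_phys(dev, page_to_phys(page) + offset,
				     size, dir, attrs);
	}

The new DMA_ATTR_MMIO check rejects MMIO ranges up front with DMA_MAPPING_ERROR, since the GART aperture only remaps system memory.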
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 32ba599a51f8..25fcde525c68 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -124,4 +124,7 @@ static void __used common(void)
OFFSET(ARIA_CTX_rounds, aria_ctx, rounds);
#endif
+ BLANK();
+ DEFINE(ALT_INSTR_SIZE, sizeof(struct alt_instr));
+ DEFINE(EXTABLE_SIZE, sizeof(struct exception_table_entry));
}
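These DEFINEs export the two struct sizes to assembly through the generated asm-offsets.h, presumably so the new Hyper-V crash/trampoline code can walk the alternatives and exception tables. A hypothetical assembly consumer:

	/* illustrative use only, via the generated header */
	#include <asm/asm-offsets.h>

	add	$ALT_INSTR_SIZE, %rbx	/* step to the next struct alt_instr */
	add	$EXTABLE_SIZE, %rcx	/* step to the next exception_table_entry */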
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 6297416647ed..34440021e8cf 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -45,6 +45,7 @@
#include <linux/task_work.h>
#include <linux/hardirq.h>
#include <linux/kexec.h>
+#include <linux/vmcore_info.h>
#include <asm/fred.h>
#include <asm/cpu_device_id.h>
@@ -1729,6 +1730,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
out:
+ /* The machine check did not panic, so mark the error as recoverable */
+ hwerr_log_error_type(HWERR_RECOV_OTHERS);
+
instrumentation_end();
clear:
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 3821a985f4ff..46673530bc6f 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -258,7 +258,7 @@ static bool cpu_has_entrysign(void)
if (fam == 0x1a) {
if (model <= 0x2f ||
(0x40 <= model && model <= 0x4f) ||
- (0x60 <= model && model <= 0x6f))
+ (0x60 <= model && model <= 0x7f))
return true;
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index c4febdbcfe4d..579fb2c64cfd 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -28,9 +28,9 @@
#include <asm/apic.h>
#include <asm/timer.h>
#include <asm/reboot.h>
+#include <asm/msr.h>
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
-#include <asm/msr.h>
#include <asm/numa.h>
#include <asm/svm.h>
@@ -39,6 +39,12 @@ bool hv_nested;
struct ms_hyperv_info ms_hyperv;
#if IS_ENABLED(CONFIG_HYPERV)
+/*
+ * When running with a paravisor, controls whether synthetic interrupts
+ * from the host are proxied
+ */
+static bool hv_para_sint_proxy;
+
static inline unsigned int hv_get_nested_msr(unsigned int reg)
{
if (hv_is_sint_msr(reg))
@@ -75,17 +81,51 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_msr);
void hv_set_non_nested_msr(unsigned int reg, u64 value)
{
if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
+ /* The hypervisor will get the intercept. */
hv_ivm_msr_write(reg, value);
- /* Write proxy bit via wrmsl instruction */
- if (hv_is_sint_msr(reg))
- wrmsrq(reg, value | 1 << 20);
+ /* Use native_wrmsrq() so the write below goes to the paravisor. */
+ if (hv_is_sint_msr(reg)) {
+ union hv_synic_sint sint = { .as_uint64 = value };
+
+ sint.proxy = hv_para_sint_proxy;
+ native_wrmsrq(reg, sint.as_uint64);
+ }
} else {
- wrmsrq(reg, value);
+ native_wrmsrq(reg, value);
}
}
EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
+/* Enable or disable proxying synthetic interrupts to the paravisor. */
+void hv_para_set_sint_proxy(bool enable)
+{
+ hv_para_sint_proxy = enable;
+}
+
+/*
+ * Get the SynIC register value from the paravisor.
+ */
+u64 hv_para_get_synic_register(unsigned int reg)
+{
+ if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+ return ~0ULL;
+ return native_read_msr(reg);
+}
+
+/*
+ * Set the SynIC register value via the paravisor.
+ */
+void hv_para_set_synic_register(unsigned int reg, u64 val)
+{
+ if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+ return;
+ native_write_msr(reg, val);
+}
+
u64 hv_get_msr(unsigned int reg)
{
if (hv_nested)
@@ -215,7 +255,7 @@ static void hv_machine_shutdown(void)
#endif /* CONFIG_KEXEC_CORE */
#ifdef CONFIG_CRASH_DUMP
-static void hv_machine_crash_shutdown(struct pt_regs *regs)
+static void hv_guest_crash_shutdown(struct pt_regs *regs)
{
if (hv_crash_handler)
hv_crash_handler(regs);
@@ -440,7 +480,7 @@ EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
static void __init ms_hyperv_init_platform(void)
{
- int hv_max_functions_eax;
+ int hv_max_functions_eax, eax;
#ifdef CONFIG_PARAVIRT
pv_info.name = "Hyper-V";
@@ -470,11 +510,27 @@ static void __init ms_hyperv_init_platform(void)
hv_identify_partition_type();
+ if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+ ms_hyperv.hints |= HV_DEPRECATING_AEOI_RECOMMENDED;
+
if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
hv_nested = true;
pr_info("Hyper-V: running on a nested hypervisor\n");
}
+ /*
+ * The HYPERV_CPUID_VIRT_STACK_* CPUID leaves are not validated against the
+ * max function because the hypervisor doesn't handle them. Even a nested
+ * root partition (L2
+ * root) will not get them because the nested (L1) hypervisor filters them out.
+ * These are handled through intercept processing by the Windows Hyper-V stack
+ * or the paravisor.
+ */
+ eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
+ ms_hyperv.confidential_vmbus_available =
+ eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE;
+ ms_hyperv.msi_ext_dest_id =
+ eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
x86_platform.calibrate_tsc = hv_get_tsc_khz;
@@ -565,11 +621,14 @@ static void __init ms_hyperv_init_platform(void)
#endif
#if IS_ENABLED(CONFIG_HYPERV)
+ if (hv_root_partition())
+ machine_ops.power_off = hv_machine_power_off;
#if defined(CONFIG_KEXEC_CORE)
machine_ops.shutdown = hv_machine_shutdown;
#endif
#if defined(CONFIG_CRASH_DUMP)
- machine_ops.crash_shutdown = hv_machine_crash_shutdown;
+ if (!hv_root_partition())
+ machine_ops.crash_shutdown = hv_guest_crash_shutdown;
#endif
#endif
/*
@@ -675,21 +734,10 @@ static bool __init ms_hyperv_x2apic_available(void)
* pci-hyperv host bridge.
*
* Note: for a Hyper-V root partition, this will always return false.
- * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by
- * default, they are implemented as intercepts by the Windows Hyper-V stack.
- * Even a nested root partition (L2 root) will not get them because the
- * nested (L1) hypervisor filters them out.
*/
static bool __init ms_hyperv_msi_ext_dest_id(void)
{
- u32 eax;
-
- eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE);
- if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE)
- return false;
-
- eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
- return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+ return ms_hyperv.msi_ext_dest_id;
}
#ifdef CONFIG_AMD_MEM_ENCRYPT
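The proxy field set above is bit 20 of the SINT register, matching the removed open-coded "value | 1 << 20". A sketch of the union layout this code assumes (approximate, not a verbatim copy of the header):

	union hv_synic_sint {
		u64 as_uint64;
		struct {
			u64 vector    : 8;
			u64 reserved1 : 8;
			u64 masked    : 1;	/* bit 16 */
			u64 auto_eoi  : 1;	/* bit 17 */
			u64 polling   : 1;	/* bit 18 */
			u64 reserved2 : 1;
			u64 proxy     : 1;	/* bit 20: proxy delivery */
			u64 reserved3 : 43;
		} __packed;
	};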
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 66f1efa16fbb..9322a9287dc7 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -242,7 +242,7 @@ static int __sgx_encl_add_page(struct sgx_encl *encl,
/*
* If the caller requires measurement of the page as a proof for the content,
* use EEXTEND to add a measurement for 256 bytes of the page. Repeat this
- * operation until the entire page is measured."
+ * operation until the entire page is measured.
*/
static int __sgx_encl_extend(struct sgx_encl *encl,
struct sgx_epc_page *epc_page)
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 48113c5193aa..76153dfb58c9 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1946,7 +1946,7 @@ static int dump_xsave_layout_desc(struct coredump_params *cprm)
};
if (!dump_emit(cprm, &xc, sizeof(xc)))
- return 0;
+ return -1;
num_records++;
}
@@ -1984,7 +1984,7 @@ int elf_coredump_extra_notes_write(struct coredump_params *cprm)
return 1;
num_records = dump_xsave_layout_desc(cprm);
- if (!num_records)
+ if (num_records < 0)
return 1;
/* Total size should be equal to the number of records */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 86f4e574de02..b2fe6181960c 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -397,6 +397,7 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi)
/* Posted Interrupt Descriptors for coalesced MSIs to be posted */
DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
+static DEFINE_PER_CPU_CACHE_HOT(bool, posted_msi_handler_active);
void intel_posted_msi_init(void)
{
@@ -414,6 +415,25 @@ void intel_posted_msi_init(void)
this_cpu_write(posted_msi_pi_desc.ndst, destination);
}
+void intel_ack_posted_msi_irq(struct irq_data *irqd)
+{
+ irq_move_irq(irqd);
+
+ /*
+ * Handle the rare case that irq_retrigger() raised the actual
+ * assigned vector on the target CPU, which means that it was not
+ * invoked via the posted MSI handler below. In that case APIC EOI
+ * is required as otherwise the ISR entry becomes stale and lower
+ * priority interrupts are never going to be delivered after that.
+ *
+ * If the posted handler invoked the device interrupt handler then
+ * the EOI would be premature because it would acknowledge the
+ * posted vector.
+ */
+ if (unlikely(!__this_cpu_read(posted_msi_handler_active)))
+ apic_eoi();
+}
+
static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs)
{
unsigned long pir_copy[NR_PIR_WORDS];
@@ -446,6 +466,8 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
pid = this_cpu_ptr(&posted_msi_pi_desc);
+ /* Mark the handler active for intel_ack_posted_msi_irq() */
+ __this_cpu_write(posted_msi_handler_active, true);
inc_irq_stat(posted_msi_notification_count);
irq_enter();
@@ -474,6 +496,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
apic_eoi();
irq_exit();
+ __this_cpu_write(posted_msi_handler_active, false);
set_irq_regs(old_regs);
}
#endif /* X86_POSTED_MSI */
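Stripped to its essentials, the per-CPU flag implements an "inside the demultiplexing handler?" test so that exactly one EOI is issued per notification (names shortened for illustration):

	static DEFINE_PER_CPU(bool, handler_active);

	static void notification_handler(void)	/* the sysvec entry above */
	{
		__this_cpu_write(handler_active, true);
		/* ... run the device handlers found in the PIR ... */
		apic_eoi();			/* one EOI covers the whole batch */
		__this_cpu_write(handler_active, false);
	}

	static void ack_posted_msi(struct irq_data *irqd)
	{
		/* EOI only when the vector fired outside the batch handler */
		if (unlikely(!__this_cpu_read(handler_active)))
			apic_eoi();
	}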
diff --git a/arch/x86/kernel/rethook.c b/arch/x86/kernel/rethook.c
index 8a1c0111ae79..85e2f2d16a90 100644
--- a/arch/x86/kernel/rethook.c
+++ b/arch/x86/kernel/rethook.c
@@ -25,7 +25,7 @@ asm(
".type arch_rethook_trampoline, @function\n"
"arch_rethook_trampoline:\n"
#ifdef CONFIG_X86_64
- ANNOTATE_NOENDBR /* This is only jumped from ret instruction */
+ ANNOTATE_NOENDBR "\n" /* This is only jumped from ret instruction */
/* Push a fake return address to tell the unwinder it's a rethook. */
" pushq $arch_rethook_trampoline\n"
UNWIND_HINT_FUNC
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 2892cdb14563..61592e41a6b1 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -50,8 +50,8 @@ asm (".global __static_call_return\n\t"
".type __static_call_return, @function\n\t"
ASM_FUNC_ALIGN "\n\t"
"__static_call_return:\n\t"
- ANNOTATE_NOENDBR
- ANNOTATE_RETPOLINE_SAFE
+ ANNOTATE_NOENDBR "\n\t"
+ ANNOTATE_RETPOLINE_SAFE "\n\t"
"ret; int3\n\t"
".size __static_call_return, . - __static_call_return \n\t");
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 977ee75e047c..f610fde2d5c4 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -2,6 +2,7 @@
#include <linux/objtool.h>
#include <linux/module.h>
#include <linux/sort.h>
+#include <linux/bpf.h>
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>
@@ -172,6 +173,25 @@ static struct orc_entry *orc_ftrace_find(unsigned long ip)
}
#endif
+/* Fake frame pointer entry -- used as a fallback for generated code */
+static struct orc_entry orc_fp_entry = {
+ .type = ORC_TYPE_CALL,
+ .sp_reg = ORC_REG_BP,
+ .sp_offset = 16,
+ .bp_reg = ORC_REG_PREV_SP,
+ .bp_offset = -16,
+};
+
+static struct orc_entry *orc_bpf_find(unsigned long ip)
+{
+#ifdef CONFIG_BPF_JIT
+ if (bpf_has_frame_pointer(ip))
+ return &orc_fp_entry;
+#endif
+
+ return NULL;
+}
+
/*
* If we crash with IP==0, the last successfully executed instruction
* was probably an indirect function call with a NULL function pointer,
@@ -186,15 +206,6 @@ static struct orc_entry null_orc_entry = {
.type = ORC_TYPE_CALL
};
-/* Fake frame pointer entry -- used as a fallback for generated code */
-static struct orc_entry orc_fp_entry = {
- .type = ORC_TYPE_CALL,
- .sp_reg = ORC_REG_BP,
- .sp_offset = 16,
- .bp_reg = ORC_REG_PREV_SP,
- .bp_offset = -16,
-};
-
static struct orc_entry *orc_find(unsigned long ip)
{
static struct orc_entry *orc;
@@ -238,6 +249,11 @@ static struct orc_entry *orc_find(unsigned long ip)
if (orc)
return orc;
+ /* BPF lookup: */
+ orc = orc_bpf_find(ip);
+ if (orc)
+ return orc;
+
return orc_ftrace_find(ip);
}
@@ -495,9 +511,8 @@ bool unwind_next_frame(struct unwind_state *state)
if (!orc) {
/*
* As a fallback, try to assume this code uses a frame pointer.
- * This is useful for generated code, like BPF, which ORC
- * doesn't know about. This is just a guess, so the rest of
- * the unwind is no longer considered reliable.
+ * This is just a guess, so the rest of the unwind is no longer
+ * considered reliable.
*/
orc = &orc_fp_entry;
state->error = true;
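The constants in orc_fp_entry describe the standard frame-pointer prologue. After "push %rbp; mov %rsp, %rbp" the stack is:

	/*
	 *   [caller frame]
	 *   [return addr]   = prev SP - 8
	 *   [saved rbp]     = prev SP - 16  <- %rbp points here
	 *
	 * so sp_reg = ORC_REG_BP with sp_offset = 16 recovers the caller's
	 * stack pointer, and bp_reg = ORC_REG_PREV_SP with bp_offset = -16
	 * locates the saved %rbp.
	 */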
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index d563a948318b..88a5426674a1 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -510,10 +510,17 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
int r;
/*
+ * Apply pending runtime CPUID updates to the current CPUID entries to
+ * avoid false positives due to mismatches on KVM-owned feature flags.
+ */
+ if (vcpu->arch.cpuid_dynamic_bits_dirty)
+ kvm_update_cpuid_runtime(vcpu);
+
+ /*
* Swap the existing (old) entries with the incoming (new) entries in
* order to massage the new entries, e.g. to account for dynamic bits
- * that KVM controls, without clobbering the current guest CPUID, which
- * KVM needs to preserve in order to unwind on failure.
+ * that KVM controls, without losing the current guest CPUID, which KVM
+ * needs to preserve in order to unwind on failure.
*
* Similarly, save the vCPU's current cpu_caps so that the capabilities
* can be updated alongside the CPUID entries when performing runtime
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index c81005b24522..ba0f11c68372 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -985,7 +985,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
if (!nested_vmcb_check_save(vcpu) ||
!nested_vmcb_check_controls(vcpu)) {
vmcb12->control.exit_code = SVM_EXIT_ERR;
- vmcb12->control.exit_code_hi = 0;
+ vmcb12->control.exit_code_hi = -1u;
vmcb12->control.exit_info_1 = 0;
vmcb12->control.exit_info_2 = 0;
goto out;
@@ -1018,7 +1018,7 @@ out_exit_err:
svm->soft_int_injected = false;
svm->vmcb->control.exit_code = SVM_EXIT_ERR;
- svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_code_hi = -1u;
svm->vmcb->control.exit_info_1 = 0;
svm->vmcb->control.exit_info_2 = 0;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index f56c2d895011..24d59ccfa40d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2443,6 +2443,7 @@ static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu,
if (cr0 ^ val) {
svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+ svm->vmcb->control.exit_code_hi = 0;
ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
}
@@ -4617,6 +4618,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
if (static_cpu_has(X86_FEATURE_NRIPS))
vmcb->control.next_rip = info->next_rip;
vmcb->control.exit_code = icpt_info.exit_code;
+ vmcb->control.exit_code_hi = 0;
vmexit = nested_svm_exit_handled(svm);
ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 9e151dbdef25..01be93a53d07 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -761,9 +761,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm);
static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code)
{
- svm->vmcb->control.exit_code = exit_code;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
+ svm->vmcb->control.exit_code = exit_code;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
return nested_svm_vmexit(svm);
}
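SVM's EXITCODE is a 64-bit field and SVM_EXIT_ERR is -1, so reflecting an invalid VMRUN must set both 32-bit halves to all-ones, while ordinary synthetic exits must zero exit_code_hi; the hunks above now do both consistently. A hypothetical helper for the invalid case (not a function that exists in svm.h):

	static inline void set_vmexit_invalid(struct vmcb_control_area *control)
	{
		control->exit_code    = SVM_EXIT_ERR;	/* low 32 bits: all-ones */
		control->exit_code_hi = -1u;		/* high 32 bits: all-ones */
		control->exit_info_1  = 0;
		control->exit_info_2  = 0;
	}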
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 40777278eabb..6137e5307d0f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -19,6 +19,7 @@
#include "trace.h"
#include "vmx.h"
#include "smm.h"
+#include "x86_ops.h"
static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
@@ -5165,7 +5166,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
if (vmx->nested.update_vmcs01_apicv_status) {
vmx->nested.update_vmcs01_apicv_status = false;
- kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ vmx_refresh_apicv_exec_ctrl(vcpu);
}
if (vmx->nested.update_vmcs01_hwapic_isr) {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4cbe8c84b636..6b96f7aea20b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6937,15 +6937,6 @@ void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
* VM-Exit, otherwise L1 will run with a stale SVI.
*/
if (is_guest_mode(vcpu)) {
- /*
- * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
- * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
- * Note, userspace can stuff state while L2 is active; assert
- * that VID is disabled if and only if the vCPU is in KVM_RUN
- * to avoid false positives if userspace is setting APIC state.
- */
- WARN_ON_ONCE(vcpu->wants_to_run &&
- nested_cpu_has_vid(get_vmcs12(vcpu)));
to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
return;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c6d899d53dd..ff8812f3a129 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10886,9 +10886,16 @@ void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
* pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
* still active when the interrupt got accepted. Make sure
* kvm_check_and_inject_events() is called to check for that.
+ *
+ * Update SVI when APICv gets enabled, otherwise SVI won't reflect the
+ * highest bit in vISR and the next accelerated EOI in the guest won't
+ * be virtualized correctly (the CPU uses SVI to determine which vISR
+ * vector to clear).
*/
if (!apic->apicv_active)
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ else
+ kvm_apic_update_hwapic_isr(vcpu);
out:
preempt_enable();
diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c
index b5a6d83106bc..512a2538596f 100644
--- a/arch/x86/lib/error-inject.c
+++ b/arch/x86/lib/error-inject.c
@@ -13,7 +13,7 @@ asm(
".globl just_return_func\n"
ASM_FUNC_ALIGN
"just_return_func:\n"
- ANNOTATE_NOENDBR
+ ANNOTATE_NOENDBR "\n"
ASM_RET
".size just_return_func, .-just_return_func\n"
);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b69dc7194e2c..b0bac2a66eff 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1678,6 +1678,9 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
emit_prologue(&prog, image, stack_depth,
bpf_prog_was_classic(bpf_prog), tail_call_reachable,
bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb);
+
+ bpf_prog->aux->ksym.fp_start = prog - temp;
+
/* Exception callback will clobber callee regs for its own use, and
* restore the original callee regs from main prog's stack frame.
*/
@@ -2736,6 +2739,8 @@ emit_jmp:
pop_r12(&prog);
}
EMIT1(0xC9); /* leave */
+ bpf_prog->aux->ksym.fp_end = prog - temp;
+
emit_return(&prog, image + addrs[i - 1] + (prog - temp));
break;
@@ -3325,6 +3330,9 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
}
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+ if (im)
+ im->ksym.fp_start = prog - (u8 *)rw_image;
+
if (!is_imm8(stack_size)) {
/* sub rsp, stack_size */
EMIT3_off32(0x48, 0x81, 0xEC, stack_size);
@@ -3462,7 +3470,11 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, -rbx_off);
+
EMIT1(0xC9); /* leave */
+ if (im)
+ im->ksym.fp_end = prog - (u8 *)rw_image;
+
if (flags & BPF_TRAMP_F_SKIP_FRAME) {
/* skip our return address and return to parent */
EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
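fp_start and fp_end record, as byte offsets into the JITed image, where the RBP frame becomes live and where "leave" tears it down, giving the ORC fallback in orc_bpf_find() a window in which the frame-pointer guess is sound. A sketch of the shape bpf_has_frame_pointer() is expected to take (an assumption; the real helper lives in the BPF core and may differ):

	/* assumption: fp_start/fp_end are offsets from ksym->start */
	static bool example_has_frame_pointer(struct bpf_ksym *ksym,
					      unsigned long ip)
	{
		unsigned long off = ip - ksym->start;

		return off >= ksym->fp_start && off < ksym->fp_end;
	}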
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 4806cc28d7ca..b74ff8bc7f2a 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -108,7 +108,7 @@ static int xen_cpu_dead_pv(unsigned int cpu);
* calls.
*/
DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
-EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+EXPORT_PER_CPU_SYMBOL_GPL(xen_in_preemptible_hcall);
/*
* In case of scheduling the flag must be cleared and restored after