10 files changed, 1071 insertions, 21 deletions
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index d55f494f471d..56292102af62 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,8 +1,22 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y			:= hv_init.o mmu.o nested.o irqdomain.o ivm.o
 obj-$(CONFIG_X86_64)	+= hv_apic.o
-obj-$(CONFIG_HYPERV_VTL_MODE)	+= hv_vtl.o
+obj-$(CONFIG_HYPERV_VTL_MODE)	+= hv_vtl.o mshv_vtl_asm.o
+
+$(obj)/mshv_vtl_asm.o: $(obj)/mshv-asm-offsets.h
+
+$(obj)/mshv-asm-offsets.h: $(obj)/mshv-asm-offsets.s FORCE
+	$(call filechk,offsets,__MSHV_ASM_OFFSETS_H__)
 
 ifdef CONFIG_X86_64
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)	+= hv_spinlock.o
+
+ ifdef CONFIG_MSHV_ROOT
+  CFLAGS_REMOVE_hv_trampoline.o += -pg
+  CFLAGS_hv_trampoline.o        += -fno-stack-protector
+  obj-$(CONFIG_CRASH_DUMP)      += hv_crash.o hv_trampoline.o
+ endif
 endif
+
+targets += mshv-asm-offsets.s
+clean-files += mshv-asm-offsets.h
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index bfde0a3498b9..a8de503def37 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -53,6 +53,11 @@ static void hv_apic_icr_write(u32 low, u32 id)
 	wrmsrq(HV_X64_MSR_ICR, reg_val);
 }
 
+void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set)
+{
+	apic_update_vector(cpu, vector, set);
+}
+
 static u32 hv_apic_read(u32 reg)
 {
 	u32 reg_val, hi;
@@ -293,6 +298,9 @@ static void hv_send_ipi_self(int vector)
 
 void __init hv_apic_init(void)
 {
+	if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+		return;
+
 	if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) {
 		pr_info("Hyper-V: Using IPI hypercalls\n");
 		/*
diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c
new file mode 100644
index 000000000000..c0e22921ace1
--- /dev/null
+++ b/arch/x86/hyperv/hv_crash.c
@@ -0,0 +1,642 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * X86 specific Hyper-V root partition kdump/crash support module
+ *
+ * Copyright (C) 2025, Microsoft, Inc.
+ *
+ * This module implements hypervisor RAM collection into vmcore for both
+ * cases of the hypervisor crash and Linux root crash. Hyper-V implements
+ * a disable hypercall with a 32bit protected mode ABI callback. This
+ * mechanism must be used to unlock hypervisor RAM. Since the hypervisor RAM
+ * is already mapped in Linux, it is automatically collected into Linux vmcore,
+ * and can be examined by the crash command (raw RAM dump) or windbg.
+ *
+ * At a high level:
+ *
+ *  Hypervisor Crash:
+ *    Upon crash, hypervisor goes into an emergency minimal dispatch loop, a
+ *    restrictive mode with very limited hypercall and MSR support. Each cpu
+ *    then injects NMIs into root vcpus. Linux checks a shared page in its
+ *    NMI handler to determine if the hypervisor has crashed. This shared
+ *    page is set up in hv_root_crash_init during boot.
+ *
+ *  Linux Crash:
+ *    In case of Linux crash, the callback hv_crash_stop_other_cpus will send
+ *    NMIs to all cpus, then proceed to the crash_nmi_callback where it waits
+ *    for all cpus to be in NMI.
+ *
+ *  NMI Handler (upon quorum):
+ *    Eventually, in both cases, all cpus will end up in the NMI handler.
+ *    Hyper-V requires that the hypervisor be disabled from the BSP. So
+ *    the BSP NMI handler saves current context, does some fixups and makes
+ *    the hypercall to disable the hypervisor, ie, devirtualize. Hypervisor
+ *    at that point will suspend all vcpus (except the BSP), unlock all its
+ *    RAM, and return to Linux at the 32bit mode entry RIP.
+ *
+ *  Linux 32bit entry trampoline will then restore long mode and call C
+ *  function here to restore context and continue execution to crash kexec.
+ */
+
+#include <linux/delay.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <linux/panic.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/mshyperv.h>
+#include <asm/nmi.h>
+#include <asm/idtentry.h>
+#include <asm/reboot.h>
+#include <asm/intel_pt.h>
+
+bool hv_crash_enabled;
+EXPORT_SYMBOL_GPL(hv_crash_enabled);
+
+struct hv_crash_ctxt {
+	ulong rsp;
+	ulong cr0;
+	ulong cr2;
+	ulong cr4;
+	ulong cr8;
+
+	u16 cs;
+	u16 ss;
+	u16 ds;
+	u16 es;
+	u16 fs;
+	u16 gs;
+
+	u16 gdt_fill;
+	struct desc_ptr gdtr;
+	char idt_fill[6];
+	struct desc_ptr idtr;
+
+	u64 gsbase;
+	u64 efer;
+	u64 pat;
+};
+static struct hv_crash_ctxt hv_crash_ctxt;
+
+/* Shared hypervisor page that contains crash dump area we peek into.
+ * NB: windbg looks for "hv_cda" symbol so don't change it.
+ */
+static struct hv_crashdump_area *hv_cda;
+
+static u32 trampoline_pa, devirt_arg;
+static atomic_t crash_cpus_wait;
+static void *hv_crash_ptpgs[4];
+static bool hv_has_crashed, lx_has_crashed;
+
+static void __noreturn hv_panic_timeout_reboot(void)
+{
+	#define PANIC_TIMER_STEP 100
+	int waited_ms = 0;
+
+	/* Positive panic_timeout: delay that many seconds, one step at a time */
+	while (panic_timeout > 0 && waited_ms < panic_timeout * 1000) {
+		mdelay(PANIC_TIMER_STEP);
+		waited_ms += PANIC_TIMER_STEP;
+	}
+
+	if (panic_timeout != 0)
+		native_wrmsrq(HV_X64_MSR_RESET, 1);    /* get hyp to reboot */
+
+	for (;;)
+		cpu_relax();
+}
+
+/* Reload the task register. Not inlined: the caller has no stack frame */
+static noinline __noclone void hv_crash_restore_tss(void)
+{
+	load_TR_desc();
+}
+
+/* Undo hv_crash_fixup_kernpt(). Not inlined: the caller has no stack frame */
+static noinline void hv_crash_clear_kernpt(void)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+
+	/* Clear entry so it's not confusing to someone looking at the core */
+	pgd = pgd_offset_k(trampoline_pa);
+	p4d = p4d_offset(pgd, trampoline_pa);
+	native_p4d_clear(p4d);
+}
+
+/*
+ * This is the C entry point from the asm glue code after the disable hypercall.
+ * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
+ * page tables with our below 4G page identity mapped, but using a temporary
+ * GDT. ds/fs/gs/es are null. ss is not usable. bp is null. stack is not
+ * available. We restore kernel GDT, and rest of the context, and continue
+ * to kexec.
+ */
+static asmlinkage void __noreturn hv_crash_c_entry(void)
+{
+	struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
+
+	/* first thing, restore the kernel gdt saved in hv_crash_ctxt */
+	native_load_gdt(&ctxt->gdtr);
+
+	asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
+	asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
+
+	asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
+	asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
+	asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
+	asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
+
+	native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
+	asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
+
+	asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
+	asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
+	asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr2));
+
+	native_load_idt(&ctxt->idtr);
+	native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
+	native_wrmsrq(MSR_EFER, ctxt->efer);
+
+	/* restore the original kernel CS now via far return */
+	asm volatile("movzwq %0, %%rax\n\t"
+		     "pushq %%rax\n\t"
+		     "pushq $1f\n\t"
+		     "lretq\n\t"
+		     "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
+
+	/* We are in asmlinkage without stack frame, hence make C function
+	 * calls which will buy stack frames.
+	 */
+	hv_crash_restore_tss();
+	hv_crash_clear_kernpt();
+
+	/* we are now fully in devirtualized normal kernel mode */
+	__crash_kexec(NULL);
+
+	hv_panic_timeout_reboot();
+}
+/* Tell gcc we are using lretq long jump in the above function intentionally */
+STACK_FRAME_NON_STANDARD(hv_crash_c_entry);
+
+static void hv_mark_tss_not_busy(void)
+{
+	struct desc_struct *gdt = get_current_gdt_rw();
+	tss_desc entry;
+
+	memcpy(&entry, &gdt[GDT_ENTRY_TSS], sizeof(entry));
+	entry.type = 0x9;      /* 0x9: available 64-bit TSS, 0xB: busy TSS */
+	write_gdt_entry(gdt, GDT_ENTRY_TSS, &entry, DESC_TSS);
+}
+
+/* Save the cpu context that hv_crash_c_entry() restores after devirtualize */
+static void hv_hvcrash_ctxt_save(void)
+{
+	struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
+
+	asm volatile("movq %%rsp,%0" : "=m"(ctxt->rsp));
+
+	ctxt->cr0 = native_read_cr0();
+	ctxt->cr4 = native_read_cr4();
+
+	asm volatile("movq %%cr2, %0" : "=a"(ctxt->cr2));
+	asm volatile("movq %%cr8, %0" : "=a"(ctxt->cr8));
+
+	asm volatile("movl %%cs, %%eax" : "=a"(ctxt->cs));
+	asm volatile("movl %%ss, %%eax" : "=a"(ctxt->ss));
+	asm volatile("movl %%ds, %%eax" : "=a"(ctxt->ds));
+	asm volatile("movl %%es, %%eax" : "=a"(ctxt->es));
+	asm volatile("movl %%fs, %%eax" : "=a"(ctxt->fs));
+	asm volatile("movl %%gs, %%eax" : "=a"(ctxt->gs));
+
+	native_store_gdt(&ctxt->gdtr);
+	store_idt(&ctxt->idtr);
+
+	ctxt->gsbase = __rdmsr(MSR_GS_BASE);
+	ctxt->efer = __rdmsr(MSR_EFER);
+	ctxt->pat = __rdmsr(MSR_IA32_CR_PAT);
+}
+
+/* Add trampoline page to the kernel pagetable for transition to kernel PT */
+static void hv_crash_fixup_kernpt(void)
+{
+	pgd_t *pgd = pgd_offset_k(trampoline_pa);
+	p4d_t *p4d = p4d_offset(pgd, trampoline_pa);
+	pud_t *tramp_pud = (pud_t *)hv_crash_ptpgs[1];
+
+	/* trampoline_pa is below 4G, so no pre-existing entry to clobber */
+	p4d_populate(&init_mm, p4d, tramp_pud);
+
+	/* enable execute */
+	p4d->p4d &= ~_PAGE_NX;
+}
+
+/*
+ * Notify the hyp that Linux has crashed. This will cause the hyp to quiesce
+ * and suspend all guest VPs.
+ */
+static void hv_notify_prepare_hyp(void)
+{
+	struct hv_input_notify_partition_event *evt;
+	struct hv_partition_event_root_crashdump_input *req;
+	u64 rc;
+
+	evt = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(evt, 0, sizeof(*evt));
+	evt->event = HV_PARTITION_EVENT_ROOT_CRASHDUMP;
+	req = &evt->input.crashdump_input;
+
+	req->crashdump_action = HV_CRASHDUMP_ENTRY;
+	rc = hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, evt, NULL);
+	/* suspend the VPs only if the hyp accepted the crashdump entry */
+	if (hv_result_success(rc)) {
+		req->crashdump_action = HV_CRASHDUMP_SUSPEND_ALL_VPS;
+		hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, evt, NULL);
+	}
+}
+
+/*
+ * Common function for all cpus before devirtualization.
+ *
+ * Hypervisor crash: all cpus get here in NMI context.
+ * Linux crash: the panicking cpu gets here at base level, all others in NMI
+ *		context. Note, panicking cpu may not be the BSP.
+ *
+ * The function is not inlined so it will show on the stack. It is named so
+ * because the crash cmd looks for certain well known function names on the
+ * stack before looking into the cpu saved note in the elf section, and
+ * that work is currently incomplete.
+ *
+ * Notes:
+ *  Hypervisor crash:
+ *    - the hypervisor is in a very restrictive mode at this point and any
+ *	vmexit it cannot handle would result in reboot. So, no mumbo jumbo,
+ *	just get to kexec as quickly as possible.
+ *
+ *  Devirtualization is supported from the BSP only at present.
+ */
+static noinline __noclone void crash_nmi_callback(struct pt_regs *regs)
+{
+	struct hv_input_disable_hyp_ex *input;
+	int msecs = 1000, ccpu = smp_processor_id();
+
+	if (ccpu == 0) {
+		/* crash_save_cpu() will be done in the kexec path */
+		cpu_emergency_stop_pt();	/* disable performance trace */
+		atomic_inc(&crash_cpus_wait);
+	} else {
+		crash_save_cpu(regs, ccpu);
+		cpu_emergency_stop_pt();	/* disable performance trace */
+		atomic_inc(&crash_cpus_wait);
+		for (;;)
+			cpu_relax();
+	}
+
+	/* BSP only from here: give the other cpus up to ~1s to check in */
+	while (atomic_read(&crash_cpus_wait) < num_online_cpus() && msecs--)
+		mdelay(1);
+
+	stop_nmi();
+	if (!hv_has_crashed)
+		hv_notify_prepare_hyp();
+
+	if (crashing_cpu == -1)
+		crashing_cpu = ccpu;		/* crash cmd uses this */
+
+	hv_hvcrash_ctxt_save();
+	hv_mark_tss_not_busy();
+	hv_crash_fixup_kernpt();
+
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+	input->rip = trampoline_pa;
+	input->arg = devirt_arg;
+
+	/* On success, execution resumes at trampoline_pa and never returns */
+	hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL);
+	hv_panic_timeout_reboot();	/* devirtualization failed */
+}
+
+
+static DEFINE_SPINLOCK(hv_crash_reboot_lk);
+
+/*
+ * NMI_LOCAL handler; it also runs for NMIs unrelated to any crash.
+ *   hv crash: hypervisor injects NMIs into all cpus
+ *   lx crash: panicking cpu NMIs all but self via crash_stop_other_cpus
+ */
+static int hv_crash_nmi_local(unsigned int cmd, struct pt_regs *regs)
+{
+	/* latch a hypervisor crash reported through the shared page */
+	if (!hv_has_crashed && hv_cda && hv_cda->cda_valid)
+		hv_has_crashed = true;
+
+	if (!(hv_has_crashed || lx_has_crashed))
+		return NMI_DONE;	/* ignore the NMI */
+
+	/* hv crashed, no crash kernel loaded: one cpu reboots, the rest spin */
+	if (hv_has_crashed && !kexec_crash_loaded()) {
+		if (!spin_trylock(&hv_crash_reboot_lk))
+			for (;;)
+				cpu_relax();
+		hv_panic_timeout_reboot();
+	}
+
+	crash_nmi_callback(regs);
+	return NMI_DONE;
+}
+
+/*
+ * hv_crash_stop_other_cpus() == smp_ops.crash_stop_other_cpus
+ *
+ * On normal Linux panic, this is called twice: first from panic and then again
+ * from native_machine_crash_shutdown.
+ *
+ * In case of hyperv, 3 ways to get here:
+ *  1. hv crash (only BSP will get here):
+ *	BSP : NMI callback -> DisableHv -> hv_crash_asm32 -> hv_crash_c_entry
+ *		  -> __crash_kexec -> native_machine_crash_shutdown
+ *		  -> crash_smp_send_stop -> smp_ops.crash_stop_other_cpus
+ *  Linux panic:
+ *	2. panic cpu x: panic() -> crash_smp_send_stop
+ *				     -> smp_ops.crash_stop_other_cpus
+ *	3. BSP: native_machine_crash_shutdown -> crash_smp_send_stop
+ *
+ * NB: noclone and non standard stack because of call to crash_setup_regs().
+ */
+static void __noclone hv_crash_stop_other_cpus(void)
+{
+	static bool crash_stop_done;
+	struct pt_regs lregs;
+	int ccpu = smp_processor_id();
+
+	if (hv_has_crashed)
+		return;		/* all cpus already in NMI handler path */
+
+	if (!kexec_crash_loaded()) {
+		hv_notify_prepare_hyp();
+		hv_panic_timeout_reboot();	/* no return */
+	}
+
+	/* If the hv crashes also, we could come here again before cpus_stopped
+	 * is set in crash_smp_send_stop(). So use our own check.
+	 */
+	if (crash_stop_done)
+		return;
+	crash_stop_done = true;
+
+	/* Linux has crashed: hv is healthy, we can IPI safely */
+	lx_has_crashed = true;
+	wmb();			/* NMI handlers look at lx_has_crashed */
+
+	apic->send_IPI_allbutself(NMI_VECTOR);
+
+	if (crashing_cpu == -1)
+		crashing_cpu = ccpu;		/* crash cmd uses this */
+
+	/* crash_setup_regs() happens in kexec also, but for the kexec cpu which
+	 * is the BSP. On a non-BSP cpu collect regs here; on the BSP lregs stays
+	 * unset, which is fine as crash_nmi_callback() does not read it there.
+	 */
+	if (ccpu)
+		crash_setup_regs(&lregs, NULL);
+
+	crash_nmi_callback(&lregs);
+}
+STACK_FRAME_NON_STANDARD(hv_crash_stop_other_cpus);
+
+/* This GDT is accessed in IA32-e compat mode which uses 32bits addresses */
+struct hv_gdtreg_32 {
+	u16 fill;
+	u16 limit;
+	u32 address;
+} __packed;
+
+/* We need a CS with L bit to goto IA32-e long mode from 32bit compat mode */
+struct hv_crash_tramp_gdt {
+	u64 null;	/* index 0, selector 0, null selector */
+	u64 cs64;	/* index 1, selector 8, cs64 selector */
+} __packed;
+
+/* No stack, so jump via far ptr in memory to load the 64bit CS */
+struct hv_cs_jmptgt {
+	u32 address;
+	u16 csval;
+	u16 fill;
+} __packed;
+
+/* Linux use only, hypervisor doesn't look at this struct */
+struct hv_crash_tramp_data {
+	u64 tramp32_cr3;			/* offset  0 */
+	u64 kernel_cr3;				/* offset  8 */
+	struct hv_gdtreg_32 gdtr32;		/* offset 16, .limit at 18 */
+	struct hv_crash_tramp_gdt tramp_gdt;	/* offset 24 */
+	struct hv_cs_jmptgt cs_jmptgt;		/* offset 40 */
+	u64 c_entry_addr;			/* offset 48 */
+} __packed;
+
+/*
+ * Setup a temporary gdt to allow the asm code to switch to the long mode.
+ * Since the asm code is relocated/copied to a below 4G page, it cannot use rip
+ * relative addressing, hence we must use trampoline_pa here. Also, save other
+ * info like jmp and C entry targets for same reasons.
+ *
+ * Returns: 0 on success, -1 on error
+ */
+static int hv_crash_setup_trampdata(u64 trampoline_va)
+{
+	int size, offs;
+	void *dest;
+	struct hv_crash_tramp_data *tramp;
+
+	/* These must match exactly the ones in the corresponding asm file */
+	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, tramp32_cr3) != 0);
+	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, kernel_cr3) != 8);
+	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, gdtr32.limit) != 18);
+	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data,
+						     cs_jmptgt.address) != 40);
+	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, c_entry_addr) != 48);
+
+	/* hv_crash_asm_end is 1 past the code; tramp data follows 16-aligned */
+	size = (ulong)&hv_crash_asm_end - (ulong)&hv_crash_asm32;
+	if (round_up(size, 16) + sizeof(*tramp) > PAGE_SIZE) {
+		pr_err("%s: trampoline page overflow\n", __func__);
+		return -1;
+	}
+
+	dest = (void *)trampoline_va;
+	memcpy(dest, &hv_crash_asm32, size);
+
+	dest += size;
+	dest = (void *)round_up((ulong)dest, 16);
+	tramp = (struct hv_crash_tramp_data *)dest;
+
+	/* see MAX_ASID_AVAILABLE in tlb.c: "PCID 0 is reserved for use by
+	 * non-PCID-aware users". Build cr3 with pcid 0
+	 */
+	tramp->tramp32_cr3 = __sme_pa(hv_crash_ptpgs[0]);
+
+	/* Note, when restoring X86_CR4_PCIDE, cr3[11:0] must be zero */
+	tramp->kernel_cr3 = __sme_pa(init_mm.pgd);
+
+	tramp->gdtr32.limit = sizeof(struct hv_crash_tramp_gdt);
+	tramp->gdtr32.address = trampoline_pa +
+				   (ulong)&tramp->tramp_gdt - trampoline_va;
+
+	 /* base:0 limit:0xfffff type:b dpl:0 P:1 L:1 D:0 avl:0 G:1 */
+	tramp->tramp_gdt.cs64 = 0x00af9a000000ffff;
+
+	tramp->cs_jmptgt.csval = 0x8;
+	offs = (ulong)&hv_crash_asm64 - (ulong)&hv_crash_asm32;
+	tramp->cs_jmptgt.address = trampoline_pa + offs;
+
+	tramp->c_entry_addr = (u64)&hv_crash_c_entry;
+
+	devirt_arg = trampoline_pa + (ulong)dest - trampoline_va;
+
+	return 0;
+}
+
+/*
+ * Build 32bit trampoline page table for transition from protected mode
+ * non-paging to long-mode paging. This transition needs pagetables below 4G.
+ */
+static void hv_crash_build_tramp_pt(void)
+{
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	u64 pa, addr = trampoline_pa;
+
+	p4d = (p4d_t *)hv_crash_ptpgs[0] + pgd_index(addr);
+	pa = virt_to_phys(hv_crash_ptpgs[1]);
+	set_p4d(p4d, __p4d(_PAGE_TABLE | pa));
+	p4d->p4d &= ~(_PAGE_NX);	/* enable execute */
+
+	pud = (pud_t *)hv_crash_ptpgs[1] + pud_index(addr);
+	pa = virt_to_phys(hv_crash_ptpgs[2]);
+	set_pud(pud, __pud(_PAGE_TABLE | pa));
+
+	pmd = (pmd_t *)hv_crash_ptpgs[2] + pmd_index(addr);
+	pa = virt_to_phys(hv_crash_ptpgs[3]);
+	set_pmd(pmd, __pmd(_PAGE_TABLE | pa));
+
+	pte = (pte_t *)hv_crash_ptpgs[3] + pte_index(addr);
+	set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+}
+
+/*
+ * Setup trampoline for devirtualization:
+ *  - a page below 4G, ie 32bit addr containing asm glue code that hyp jmps to
+ *    in protected mode.
+ *  - 4 pages for a temporary page table that asm code uses to turn paging on
+ *  - a temporary gdt to use in the compat mode.
+ *
+ *  Returns: 0 on success
+ */
+static int hv_crash_trampoline_setup(void)
+{
+	const gfp_t gfp_low = GFP_KERNEL | GFP_DMA32 | __GFP_ZERO;
+	const int pt_order = 2;		/* 2^2 == 4 page table pages */
+	struct page *tramp_pg, *pt_pg;
+	u64 trampoline_va;
+	int idx, rc = -1;
+
+	/* page for 32bit trampoline assembly code + hv_crash_tramp_data */
+	tramp_pg = alloc_page(gfp_low);
+	if (!tramp_pg) {
+		pr_err("%s: failed to alloc asm stub page\n", __func__);
+		return rc;
+	}
+
+	trampoline_va = (u64)page_to_virt(tramp_pg);
+	trampoline_pa = (u32)page_to_phys(tramp_pg);
+
+	/* pages for the temporary page table the asm code turns paging on with */
+	pt_pg = alloc_pages(gfp_low, pt_order);
+	if (!pt_pg) {
+		pr_err("%s: failed to alloc pt pages\n", __func__);
+		goto free_tramp;
+	}
+
+	/* one table page per level, see hv_crash_build_tramp_pt() */
+	for (idx = 0; idx < 4; idx++, pt_pg++)
+		hv_crash_ptpgs[idx] = page_to_virt(pt_pg);
+
+	hv_crash_build_tramp_pt();
+
+	rc = hv_crash_setup_trampdata(trampoline_va);
+	if (!rc)
+		return 0;
+
+	/* trampoline data setup failed: release both allocations */
+	free_pages((ulong)hv_crash_ptpgs[0], pt_order);
+free_tramp:
+	free_page(trampoline_va);
+
+	return rc;
+}
+
+/* Setup for kdump kexec to collect hypervisor RAM when running as root */
+void hv_root_crash_init(void)
+{
+	int rc;
+	struct hv_input_get_system_property *input;
+	struct hv_output_get_system_property *output;
+	unsigned long flags;
+	u64 status;
+	union hv_pfn_range cda_info;
+
+	if (pgtable_l5_enabled()) {
+		pr_err("Hyper-V: crash dump not yet supported on 5level PTs\n");
+		return;
+	}
+
+	rc = register_nmi_handler(NMI_LOCAL, hv_crash_nmi_local, NMI_FLAG_FIRST,
+				  "hv_crash_nmi");
+	if (rc) {
+		pr_err("Hyper-V: failed to register crash nmi handler\n");
+		return;
+	}
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	output = *this_cpu_ptr(hyperv_pcpu_output_arg);
+
+	memset(input, 0, sizeof(*input));
+	input->property_id = HV_SYSTEM_PROPERTY_CRASHDUMPAREA;
+
+	status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output);
+	/* copy out now: the per-cpu arg pages may be reused once irqs are on */
+	cda_info.as_uint64 = output->hv_cda_info.as_uint64;
+	local_irq_restore(flags);
+
+	if (!hv_result_success(status)) {
+		pr_err("Hyper-V: %s: property:%d %s\n", __func__,
+		       HV_SYSTEM_PROPERTY_CRASHDUMPAREA,
+		       hv_result_to_string(status));
+		goto err_out;
+	}
+
+	if (cda_info.base_pfn == 0) {
+		pr_err("Hyper-V: hypervisor crash dump area pfn is 0\n");
+		goto err_out;
+	}
+
+	hv_cda = phys_to_virt(cda_info.base_pfn << HV_HYP_PAGE_SHIFT);
+
+	rc = hv_crash_trampoline_setup();
+	if (rc)
+		goto err_out;
+
+	smp_ops.crash_stop_other_cpus = hv_crash_stop_other_cpus;
+	crash_kexec_post_notifiers = true;
+	hv_crash_enabled = true;
+	pr_info("Hyper-V: both linux and hypervisor kdump support enabled\n");
+	return;
+
+err_out:
+	unregister_nmi_handler(NMI_LOCAL, "hv_crash_nmi");
+	pr_err("Hyper-V: only linux root kdump support enabled\n");
+}
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 085ef4f2e73a..14de43f4bc6c 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -170,6 +170,10 @@ static int hv_cpu_init(unsigned int cpu)
 		wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
 	}
 
+	/* Allow Hyper-V stimer vector to be injected from Hypervisor. */
+	if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+		apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, true);
+
 	return hyperv_init_ghcb();
 }
 
@@ -277,6 +281,9 @@ static int hv_cpu_die(unsigned int cpu)
 		*ghcb_va = NULL;
 	}
 
+	if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
+		apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, false);
+
 	hv_common_cpu_die(cpu);
 
 	if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
@@ -551,6 +558,8 @@ void __init hyperv_init(void)
 		memunmap(src);
 
 		hv_remap_tsc_clocksource();
+		hv_root_crash_init();
+		hv_sleep_notifiers_register();
 	} else {
 		hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg);
 		wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
diff --git a/arch/x86/hyperv/hv_trampoline.S b/arch/x86/hyperv/hv_trampoline.S
new file mode 100644
index 000000000000..25f02ff12286
--- /dev/null
+++ b/arch/x86/hyperv/hv_trampoline.S
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * X86 specific Hyper-V kdump/crash related code.
+ *
+ * Copyright (C) 2025, Microsoft, Inc.
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/alternative.h>
+#include <asm/msr.h>
+#include <asm/processor-flags.h>
+#include <asm/nospec-branch.h>
+
+/*
+ * void noreturn hv_crash_asm32(arg1)
+ *    arg1 == edi == 32bit PA of struct hv_crash_tramp_data
+ *
+ * The hypervisor jumps here upon devirtualization in protected mode. This
+ * code gets copied to a page in the low 4G ie, 32bit space so it can run
+ * in the protected mode. Hence we cannot use any compile/link time offsets or
+ * addresses. It restores long mode via temporary gdt and page tables and
+ * eventually jumps to kernel code entry at HV_CRASHDATA_OFFS_C_entry.
+ *
+ * PreCondition (ie, Hypervisor call back ABI):
+ *  o CR0 is set to 0x0021: PE(prot mode) and NE are set, paging is disabled
+ *  o CR4 is set to 0x0
+ *  o IA32_EFER is set to 0x901 (SCE and NXE are set)
+ *  o EDI is set to the Arg passed to HVCALL_DISABLE_HYP_EX.
+ *  o CS, DS, ES, FS, GS are all initialized with a base of 0 and limit 0xFFFF
+ *  o IDTR, TR and GDTR are initialized with a base of 0 and limit of 0xFFFF
+ *  o LDTR is initialized as invalid (limit of 0)
+ *  o MSR PAT is power on default.
+ *  o Other state/registers are cleared. All TLBs flushed.
+ */
+
+#define HV_CRASHDATA_OFFS_TRAMPCR3    0x0    /*  0 */
+#define HV_CRASHDATA_OFFS_KERNCR3     0x8    /*  8 */
+#define HV_CRASHDATA_OFFS_GDTRLIMIT  0x12    /* 18 */
+#define HV_CRASHDATA_OFFS_CS_JMPTGT  0x28    /* 40 */
+#define HV_CRASHDATA_OFFS_C_entry    0x30    /* 48 */
+
+	.text
+	.code32
+
+SYM_CODE_START(hv_crash_asm32)
+	UNWIND_HINT_UNDEFINED
+	ENDBR
+	movl	$X86_CR4_PAE, %ecx
+	movl	%ecx, %cr4
+
+	movl %edi, %ebx				/* edi: PA of hv_crash_tramp_data */
+	add $HV_CRASHDATA_OFFS_TRAMPCR3, %ebx
+	movl %cs:(%ebx), %eax			/* low 32 bits of tramp32_cr3 */
+	movl %eax, %cr3
+
+	/* Setup EFER for long mode now */
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	btsl	$_EFER_LME, %eax
+	wrmsr
+
+	/* Turn paging on using the temp 32bit trampoline page table */
+	movl %cr0, %eax
+	orl $(X86_CR0_PG), %eax
+	movl %eax, %cr0
+
+	/* since kernel cr3 could be above 4G, we need to be in the long mode
+	 * before we can load 64bits of the kernel cr3. We use a temp gdt for
+	 * that with CS.L=1 and CS.D=0 */
+	mov %edi, %eax
+	add $HV_CRASHDATA_OFFS_GDTRLIMIT, %eax
+	lgdtl %cs:(%eax)			/* gdtr32.limit + gdtr32.address */
+
+	/* not done yet, restore CS now to switch to CS.L=1 */
+	mov %edi, %eax
+	add $HV_CRASHDATA_OFFS_CS_JMPTGT, %eax
+	ljmp %cs:*(%eax)			/* cs_jmptgt: hv_crash_asm64 copy */
+SYM_CODE_END(hv_crash_asm32)
+
+	/* we now run in full 64bit IA32-e long mode, CS.L=1 and CS.D=0 */
+	.code64
+	.balign 8
+SYM_CODE_START(hv_crash_asm64)
+	UNWIND_HINT_UNDEFINED
+	ENDBR
+	/* restore kernel page tables so we can jump to kernel code */
+	mov %edi, %eax
+	add $HV_CRASHDATA_OFFS_KERNCR3, %eax
+	movq %cs:(%eax), %rbx			/* kernel_cr3, full 64 bits */
+	movq %rbx, %cr3
+
+	mov %edi, %eax
+	add $HV_CRASHDATA_OFFS_C_entry, %eax
+	movq %cs:(%eax), %rbx			/* c_entry_addr */
+	ANNOTATE_RETPOLINE_SAFE
+	jmp *%rbx
+
+	int $3					/* not reached: jmp is unconditional */
+
+SYM_INNER_LABEL(hv_crash_asm_end, SYM_L_GLOBAL)
+SYM_CODE_END(hv_crash_asm64)
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
index 042e8712d8de..c0edaed0efb3 100644
--- a/arch/x86/hyperv/hv_vtl.c
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -9,12 +9,17 @@
 #include <asm/apic.h>
 #include <asm/boot.h>
 #include <asm/desc.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/types.h>
 #include <asm/i8259.h>
 #include <asm/mshyperv.h>
 #include <asm/msr.h>
 #include <asm/realmode.h>
 #include <asm/reboot.h>
+#include <asm/smap.h>
+#include <linux/export.h>
 #include <../kernel/smpboot.h>
+#include "../../kernel/fpu/legacy.h"
 
 extern struct boot_params boot_params;
 static struct real_mode_header hv_vtl_real_mode_header;
@@ -249,3 +254,28 @@ int __init hv_vtl_early_init(void)
 
 	return 0;
 }
+
+DEFINE_STATIC_CALL_NULL(__mshv_vtl_return_hypercall, void (*)(void));
+
+void mshv_vtl_return_call_init(u64 vtl_return_offset)
+{
+	static_call_update(__mshv_vtl_return_hypercall,
+			   (void *)((u8 *)hv_hypercall_pg + vtl_return_offset));
+}
+EXPORT_SYMBOL(mshv_vtl_return_call_init);
+
+void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0)
+{
+	struct hv_vp_assist_page *hvp;
+
+	hvp = hv_vp_assist_page[smp_processor_id()];
+	hvp->vtl_ret_x64rax = vtl0->rax;
+	hvp->vtl_ret_x64rcx = vtl0->rcx;
+
+	kernel_fpu_begin_mask(0);
+	fxrstor(&vtl0->fx_state);
+	__mshv_vtl_return_call(vtl0);
+	fxsave(&vtl0->fx_state);
+	kernel_fpu_end();
+}
+EXPORT_SYMBOL(mshv_vtl_return_call);
diff --git a/arch/x86/hyperv/mshv-asm-offsets.c b/arch/x86/hyperv/mshv-asm-offsets.c
new file mode 100644
index 000000000000..882c1db6df16
--- /dev/null
+++ b/arch/x86/hyperv/mshv-asm-offsets.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ *
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Author:
+ *   Naman Jain <namjain@microsoft.com>
+ */
+#define COMPILE_OFFSETS
+
+#include <linux/kbuild.h>
+#include <asm/mshyperv.h>
+
+static void __used common(void)
+{
+	if (IS_ENABLED(CONFIG_HYPERV_VTL_MODE)) {
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rax, mshv_vtl_cpu_context, rax);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rcx, mshv_vtl_cpu_context, rcx);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rdx, mshv_vtl_cpu_context, rdx);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rbx, mshv_vtl_cpu_context, rbx);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rbp, mshv_vtl_cpu_context, rbp);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rsi, mshv_vtl_cpu_context, rsi);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_rdi, mshv_vtl_cpu_context, rdi);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r8,  mshv_vtl_cpu_context, r8);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r9,  mshv_vtl_cpu_context, r9);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r10, mshv_vtl_cpu_context, r10);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r11, mshv_vtl_cpu_context, r11);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r12, mshv_vtl_cpu_context, r12);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r13, mshv_vtl_cpu_context, r13);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r14, mshv_vtl_cpu_context, r14);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_r15, mshv_vtl_cpu_context, r15);
+		OFFSET(MSHV_VTL_CPU_CONTEXT_cr2, mshv_vtl_cpu_context, cr2);
+	}
+}
diff --git a/arch/x86/hyperv/mshv_vtl_asm.S b/arch/x86/hyperv/mshv_vtl_asm.S
new file mode 100644
index 000000000000..f595eefad9ab
--- /dev/null
+++ b/arch/x86/hyperv/mshv_vtl_asm.S
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Assembly level code for mshv_vtl VTL transition
+ *
+ * Copyright (c) 2025, Microsoft Corporation.
+ *
+ * Author:
+ *   Naman Jain <namjain@microsoft.com>
+ */
+
+#include <linux/linkage.h>
+#include <linux/static_call_types.h>
+#include <asm/asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/frame.h>
+#include "mshv-asm-offsets.h"
+
+	.text
+	.section .noinstr.text, "ax"
+/*
+ * void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0)
+ *
+ * This function is used to context switch between different Virtual Trust Levels.
+ * It is placed in .noinstr.text to keep instrumentation and debugging facilities out of it.
+ * NMIs aren't a problem because the NMI handler saves/restores CR2 specifically to guard
+ * against #PFs in NMI context clobbering the guest state.
+ */
+SYM_FUNC_START(__mshv_vtl_return_call)
+	/* Push callee save registers */
+	pushq %rbp
+	mov %rsp, %rbp
+	pushq %r12
+	pushq %r13
+	pushq %r14
+	pushq %r15
+	pushq %rbx
+
+	/* register switch to VTL0 clobbers all registers except rax/rcx */
+	mov %_ASM_ARG1, %rax
+
+	/* grab rbx/rbp/rsi/rdi/r8-r15 */
+	mov MSHV_VTL_CPU_CONTEXT_rbx(%rax), %rbx
+	mov MSHV_VTL_CPU_CONTEXT_rbp(%rax), %rbp
+	mov MSHV_VTL_CPU_CONTEXT_rsi(%rax), %rsi
+	mov MSHV_VTL_CPU_CONTEXT_rdi(%rax), %rdi
+	mov MSHV_VTL_CPU_CONTEXT_r8(%rax), %r8
+	mov MSHV_VTL_CPU_CONTEXT_r9(%rax), %r9
+	mov MSHV_VTL_CPU_CONTEXT_r10(%rax), %r10
+	mov MSHV_VTL_CPU_CONTEXT_r11(%rax), %r11
+	mov MSHV_VTL_CPU_CONTEXT_r12(%rax), %r12
+	mov MSHV_VTL_CPU_CONTEXT_r13(%rax), %r13
+	mov MSHV_VTL_CPU_CONTEXT_r14(%rax), %r14
+	mov MSHV_VTL_CPU_CONTEXT_r15(%rax), %r15
+
+	mov MSHV_VTL_CPU_CONTEXT_cr2(%rax), %rdx
+	mov %rdx, %cr2
+	mov MSHV_VTL_CPU_CONTEXT_rdx(%rax), %rdx
+
+	/* stash host registers on stack */
+	pushq %rax
+	pushq %rcx
+
+	xor %ecx, %ecx
+
+	/* make a hypercall to switch VTL */
+	call STATIC_CALL_TRAMP_STR(__mshv_vtl_return_hypercall)
+
+	/* stash guest registers on stack, restore saved host copies */
+	pushq %rax
+	pushq %rcx
+	mov 16(%rsp), %rcx
+	mov 24(%rsp), %rax
+
+	mov %rdx, MSHV_VTL_CPU_CONTEXT_rdx(%rax)
+	mov %cr2, %rdx
+	mov %rdx, MSHV_VTL_CPU_CONTEXT_cr2(%rax)
+	pop MSHV_VTL_CPU_CONTEXT_rcx(%rax)
+	pop MSHV_VTL_CPU_CONTEXT_rax(%rax)
+	add $16, %rsp
+
+	/* save rbx/rbp/rsi/rdi/r8-r15 */
+	mov %rbx, MSHV_VTL_CPU_CONTEXT_rbx(%rax)
+	mov %rbp, MSHV_VTL_CPU_CONTEXT_rbp(%rax)
+	mov %rsi, MSHV_VTL_CPU_CONTEXT_rsi(%rax)
+	mov %rdi, MSHV_VTL_CPU_CONTEXT_rdi(%rax)
+	mov %r8,  MSHV_VTL_CPU_CONTEXT_r8(%rax)
+	mov %r9,  MSHV_VTL_CPU_CONTEXT_r9(%rax)
+	mov %r10, MSHV_VTL_CPU_CONTEXT_r10(%rax)
+	mov %r11, MSHV_VTL_CPU_CONTEXT_r11(%rax)
+	mov %r12, MSHV_VTL_CPU_CONTEXT_r12(%rax)
+	mov %r13, MSHV_VTL_CPU_CONTEXT_r13(%rax)
+	mov %r14, MSHV_VTL_CPU_CONTEXT_r14(%rax)
+	mov %r15, MSHV_VTL_CPU_CONTEXT_r15(%rax)
+
+	/* pop callee-save registers r12-r15, rbx */
+	pop %rbx
+	pop %r15
+	pop %r14
+	pop %r13
+	pop %r12
+
+	pop %rbp
+	RET
+SYM_FUNC_END(__mshv_vtl_return_call)
+/*
+ * Make sure that static_call_key symbol: __SCK____mshv_vtl_return_hypercall is accessible here.
+ * Below code is inspired from __ADDRESSABLE(sym) macro. Symbol name is kept simple, to avoid
+ * naming it something like "__UNIQUE_ID_addressable___SCK____mshv_vtl_return_hypercall_662.0"
+ * which would otherwise have been generated by the macro.
+ */
+	.section	.discard.addressable,"aw"
+	.align 8
+	.type	mshv_vtl_return_sym, @object
+	.size	mshv_vtl_return_sym, 8
+mshv_vtl_return_sym:
+	.quad	__SCK____mshv_vtl_return_hypercall
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 605abd02158d..eef4c3a5ba28 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -11,6 +11,7 @@
 #include <asm/paravirt.h>
 #include <asm/msr.h>
 #include <hyperv/hvhdk.h>
+#include <asm/fpu/types.h>
 
 /*
  * Hyper-V always provides a single IO-APIC at this MMIO address.
@@ -176,6 +177,8 @@ int hyperv_flush_guest_mapping_range(u64 as,
 int hyperv_fill_flush_guest_mapping_list(
 		struct hv_guest_mapping_flush_list *flush,
 		u64 start_gfn, u64 end_gfn);
+void hv_sleep_notifiers_register(void);
+void hv_machine_power_off(void);
 
 #ifdef CONFIG_X86_64
 void hv_apic_init(void);
@@ -237,6 +240,15 @@ static __always_inline u64 hv_raw_get_msr(unsigned int reg)
 }
 int hv_apicid_to_vp_index(u32 apic_id);
 
+#if IS_ENABLED(CONFIG_MSHV_ROOT) && IS_ENABLED(CONFIG_CRASH_DUMP)
+void hv_root_crash_init(void);
+void hv_crash_asm32(void);
+void hv_crash_asm64(void);
+void hv_crash_asm_end(void);
+#else   /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */
+static inline void hv_root_crash_init(void) {}
+#endif  /* CONFIG_MSHV_ROOT && CONFIG_CRASH_DUMP */
+
 #else /* CONFIG_HYPERV */
 static inline void hyperv_init(void) {}
 static inline void hyperv_setup_mmu_ops(void) {}
@@ -260,13 +272,46 @@ static inline u64 hv_get_non_nested_msr(unsigned int reg) { return 0; }
 static inline int hv_apicid_to_vp_index(u32 apic_id) { return -EINVAL; }
 #endif /* CONFIG_HYPERV */
 
+struct mshv_vtl_cpu_context {
+	union {
+		struct {
+			u64 rax;
+			u64 rcx;
+			u64 rdx;
+			u64 rbx;
+			u64 cr2;
+			u64 rbp;
+			u64 rsi;
+			u64 rdi;
+			u64 r8;
+			u64 r9;
+			u64 r10;
+			u64 r11;
+			u64 r12;
+			u64 r13;
+			u64 r14;
+			u64 r15;
+		};
+		u64 gp_regs[16];
+	};
+
+	struct fxregs_state fx_state;
+};
 
 #ifdef CONFIG_HYPERV_VTL_MODE
 void __init hv_vtl_init_platform(void);
 int __init hv_vtl_early_init(void);
+void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0);
+void mshv_vtl_return_call_init(u64 vtl_return_offset);
+void mshv_vtl_return_hypercall(void);
+void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0);
 #else
 static inline void __init hv_vtl_init_platform(void) {}
 static inline int __init hv_vtl_early_init(void) { return 0; }
+static inline void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
+static inline void mshv_vtl_return_call_init(u64 vtl_return_offset) {}
+static inline void mshv_vtl_return_hypercall(void) {}
+static inline void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) {}
 #endif
 
 #include <asm-generic/mshyperv.h>
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index c4febdbcfe4d..579fb2c64cfd 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -28,9 +28,9 @@
 #include <asm/apic.h>
 #include <asm/timer.h>
 #include <asm/reboot.h>
+#include <asm/msr.h>
 #include <asm/nmi.h>
 #include <clocksource/hyperv_timer.h>
-#include <asm/msr.h>
 #include <asm/numa.h>
 #include <asm/svm.h>
 
@@ -39,6 +39,12 @@ bool hv_nested;
 struct ms_hyperv_info ms_hyperv;
 
 #if IS_ENABLED(CONFIG_HYPERV)
+/*
+ * When running with the paravisor, controls proxying the synthetic interrupts
+ * from the host
+ */
+static bool hv_para_sint_proxy;
+
 static inline unsigned int hv_get_nested_msr(unsigned int reg)
 {
 	if (hv_is_sint_msr(reg))
@@ -75,17 +81,51 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_msr);
 void hv_set_non_nested_msr(unsigned int reg, u64 value)
 {
 	if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) {
+		/* The hypervisor will get the intercept. */
 		hv_ivm_msr_write(reg, value);
 
-		/* Write proxy bit via wrmsl instruction */
-		if (hv_is_sint_msr(reg))
-			wrmsrq(reg, value | 1 << 20);
+		/* Use native_wrmsrq so the following goes to the paravisor. */
+		if (hv_is_sint_msr(reg)) {
+			union hv_synic_sint sint = { .as_uint64 = value };
+
+			sint.proxy = hv_para_sint_proxy;
+			native_wrmsrq(reg, sint.as_uint64);
+		}
 	} else {
-		wrmsrq(reg, value);
+		native_wrmsrq(reg, value);
 	}
 }
 EXPORT_SYMBOL_GPL(hv_set_non_nested_msr);
 
+/*
+ * Enable or disable proxying synthetic interrupts
+ * to the paravisor.
+ */
+void hv_para_set_sint_proxy(bool enable)
+{
+	hv_para_sint_proxy = enable;
+}
+
+/*
+ * Get the SynIC register value from the paravisor.
+ */
+u64 hv_para_get_synic_register(unsigned int reg)
+{
+	if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+		return ~0ULL;
+	return native_read_msr(reg);
+}
+
+/*
+ * Set the SynIC register value with the paravisor.
+ */
+void hv_para_set_synic_register(unsigned int reg, u64 val)
+{
+	if (WARN_ON(!ms_hyperv.paravisor_present || !hv_is_synic_msr(reg)))
+		return;
+	native_write_msr(reg, val);
+}
+
 u64 hv_get_msr(unsigned int reg)
 {
 	if (hv_nested)
@@ -215,7 +255,7 @@ static void hv_machine_shutdown(void)
 #endif /* CONFIG_KEXEC_CORE */
 
 #ifdef CONFIG_CRASH_DUMP
-static void hv_machine_crash_shutdown(struct pt_regs *regs)
+static void hv_guest_crash_shutdown(struct pt_regs *regs)
 {
 	if (hv_crash_handler)
 		hv_crash_handler(regs);
@@ -440,7 +480,7 @@ EXPORT_SYMBOL_GPL(hv_get_hypervisor_version);
 
 static void __init ms_hyperv_init_platform(void)
 {
-	int hv_max_functions_eax;
+	int hv_max_functions_eax, eax;
 
 #ifdef CONFIG_PARAVIRT
 	pv_info.name = "Hyper-V";
@@ -470,11 +510,27 @@ static void __init ms_hyperv_init_platform(void)
 
 	hv_identify_partition_type();
 
+	if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC))
+		ms_hyperv.hints |= HV_DEPRECATING_AEOI_RECOMMENDED;
+
 	if (ms_hyperv.hints & HV_X64_HYPERV_NESTED) {
 		hv_nested = true;
 		pr_info("Hyper-V: running on a nested hypervisor\n");
 	}
 
+	/*
+	 * There is no check against the max function for HYPERV_CPUID_VIRT_STACK_* CPUID
+	 * leaves as the hypervisor doesn't handle them. Even a nested root partition (L2
+	 * root) will not get them because the nested (L1) hypervisor filters them out.
+	 * These are handled through intercept processing by the Windows Hyper-V stack
+	 * or the paravisor.
+	 */
+	eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
+	ms_hyperv.confidential_vmbus_available =
+		eax & HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE;
+	ms_hyperv.msi_ext_dest_id =
+		eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+
 	if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
 	    ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
 		x86_platform.calibrate_tsc = hv_get_tsc_khz;
@@ -565,11 +621,14 @@ static void __init ms_hyperv_init_platform(void)
 #endif
 
 #if IS_ENABLED(CONFIG_HYPERV)
+	if (hv_root_partition())
+		machine_ops.power_off = hv_machine_power_off;
 #if defined(CONFIG_KEXEC_CORE)
 	machine_ops.shutdown = hv_machine_shutdown;
 #endif
 #if defined(CONFIG_CRASH_DUMP)
-	machine_ops.crash_shutdown = hv_machine_crash_shutdown;
+	if (!hv_root_partition())
+		machine_ops.crash_shutdown = hv_guest_crash_shutdown;
 #endif
 #endif
 	/*
@@ -675,21 +734,10 @@ static bool __init ms_hyperv_x2apic_available(void)
  * pci-hyperv host bridge.
  *
  * Note: for a Hyper-V root partition, this will always return false.
- * The hypervisor doesn't expose these HYPERV_CPUID_VIRT_STACK_* cpuids by
- * default, they are implemented as intercepts by the Windows Hyper-V stack.
- * Even a nested root partition (L2 root) will not get them because the
- * nested (L1) hypervisor filters them out.
  */
 static bool __init ms_hyperv_msi_ext_dest_id(void)
 {
-	u32 eax;
-
-	eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE);
-	if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE)
-		return false;
-
-	eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
-	return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+	return ms_hyperv.msi_ext_dest_id;
 }
 
 #ifdef CONFIG_AMD_MEM_ENCRYPT