Diffstat (limited to 'arch/x86/hyperv')
 -rw-r--r--  arch/x86/hyperv/Makefile           |  18
 -rw-r--r--  arch/x86/hyperv/hv_apic.c          |  58
 -rw-r--r--  arch/x86/hyperv/hv_crash.c         | 642
 -rw-r--r--  arch/x86/hyperv/hv_init.c          | 346
 -rw-r--r--  arch/x86/hyperv/hv_proc.c          | 213
 -rw-r--r--  arch/x86/hyperv/hv_spinlock.c      |  10
 -rw-r--r--  arch/x86/hyperv/hv_trampoline.S    | 101
 -rw-r--r--  arch/x86/hyperv/hv_vtl.c           | 144
 -rw-r--r--  arch/x86/hyperv/irqdomain.c        | 184
 -rw-r--r--  arch/x86/hyperv/ivm.c              | 587
 -rw-r--r--  arch/x86/hyperv/mmu.c              |   6
 -rw-r--r--  arch/x86/hyperv/mshv-asm-offsets.c |  37
 -rw-r--r--  arch/x86/hyperv/mshv_vtl_asm.S     | 116
 -rw-r--r--  arch/x86/hyperv/nested.c           |   3
14 files changed, 1968 insertions, 497 deletions
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 3a1548054b48..56292102af62 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,8 +1,22 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o -obj-$(CONFIG_X86_64) += hv_apic.o hv_proc.o -obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o +obj-$(CONFIG_X86_64) += hv_apic.o +obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o mshv_vtl_asm.o + +$(obj)/mshv_vtl_asm.o: $(obj)/mshv-asm-offsets.h + +$(obj)/mshv-asm-offsets.h: $(obj)/mshv-asm-offsets.s FORCE + $(call filechk,offsets,__MSHV_ASM_OFFSETS_H__) ifdef CONFIG_X86_64 obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o + + ifdef CONFIG_MSHV_ROOT + CFLAGS_REMOVE_hv_trampoline.o += -pg + CFLAGS_hv_trampoline.o += -fno-stack-protector + obj-$(CONFIG_CRASH_DUMP) += hv_crash.o hv_trampoline.o + endif endif + +targets += mshv-asm-offsets.s +clean-files += mshv-asm-offsets.h diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 187e13b15e9a..a8de503def37 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -23,12 +23,12 @@ #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/clockchips.h> -#include <linux/hyperv.h> #include <linux/slab.h> #include <linux/cpuhotplug.h> #include <asm/hypervisor.h> #include <asm/mshyperv.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/trace/hyperv.h> @@ -38,7 +38,7 @@ static u64 hv_apic_icr_read(void) { u64 reg_val; - rdmsrl(HV_X64_MSR_ICR, reg_val); + rdmsrq(HV_X64_MSR_ICR, reg_val); return reg_val; } @@ -50,7 +50,12 @@ static void hv_apic_icr_write(u32 low, u32 id) reg_val = reg_val << 32; reg_val |= low; - wrmsrl(HV_X64_MSR_ICR, reg_val); + wrmsrq(HV_X64_MSR_ICR, reg_val); +} + +void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set) +{ + apic_update_vector(cpu, vector, set); } static u32 hv_apic_read(u32 reg) @@ -76,10 +81,10 @@ static void hv_apic_write(u32 reg, u32 val) { switch (reg) { case APIC_EOI: - wrmsr(HV_X64_MSR_EOI, val, 0); + wrmsrq(HV_X64_MSR_EOI, val); break; case APIC_TASKPRI: - wrmsr(HV_X64_MSR_TPR, val, 0); + wrmsrq(HV_X64_MSR_TPR, val); break; default: native_apic_mem_write(reg, val); @@ -93,7 +98,7 @@ static void hv_apic_eoi_write(void) if (hvp && (xchg(&hvp->apic_assist, 0) & 0x1)) return; - wrmsr(HV_X64_MSR_EOI, APIC_EOI_ACK, 0); + wrmsrq(HV_X64_MSR_EOI, APIC_EOI_ACK); } static bool cpu_is_self(int cpu) @@ -105,7 +110,7 @@ static bool cpu_is_self(int cpu) * IPI implementation on Hyper-V. */ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, - bool exclude_self) + bool exclude_self) { struct hv_send_ipi_ex *ipi_arg; unsigned long flags; @@ -132,8 +137,8 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - nr_bank = cpumask_to_vpset_skip(&(ipi_arg->vp_set), mask, - exclude_self ? cpu_is_self : NULL); + nr_bank = cpumask_to_vpset_skip(&ipi_arg->vp_set, mask, + exclude_self ? cpu_is_self : NULL); /* * 'nr_bank <= 0' means some CPUs in cpumask can't be @@ -146,8 +151,13 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, ipi_arg->vp_set.format = HV_GENERIC_SET_ALL; } + /* + * For this hypercall, Hyper-V treats the valid_bank_mask field + * of ipi_arg->vp_set as part of the fixed size input header. + * So the variable input header size is equal to nr_bank. 
+ */ status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, - ipi_arg, NULL); + ipi_arg, NULL); ipi_mask_ex_done: local_irq_restore(flags); @@ -155,7 +165,7 @@ ipi_mask_ex_done: } static bool __send_ipi_mask(const struct cpumask *mask, int vector, - bool exclude_self) + bool exclude_self) { int cur_cpu, vcpu, this_cpu = smp_processor_id(); struct hv_send_ipi ipi_arg; @@ -175,10 +185,13 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask))) return true; - if (!hv_hypercall_pg) - return false; + /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */ + if (!hv_hypercall_pg) { + if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx()) + return false; + } - if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR) return false; /* @@ -206,7 +219,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, /* * This particular version of the IPI hypercall can - * only target upto 64 CPUs. + * only target up to 64 CPUs. */ if (vcpu >= 64) goto do_ex_hypercall; @@ -215,7 +228,7 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, } status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector, - ipi_arg.cpu_mask); + ipi_arg.cpu_mask); return hv_result_success(status); do_ex_hypercall: @@ -229,10 +242,16 @@ static bool __send_ipi_one(int cpu, int vector) trace_hyperv_send_ipi_one(cpu, vector); - if (!hv_hypercall_pg || (vp == VP_INVAL)) + if (vp == VP_INVAL) return false; - if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */ + if (!hv_hypercall_pg) { + if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx()) + return false; + } + + if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR) return false; if (vp >= 64) @@ -279,6 +298,9 @@ static void hv_send_ipi_self(int vector) void __init hv_apic_init(void) { + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return; + if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) { pr_info("Hyper-V: Using IPI hypercalls\n"); /* diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c new file mode 100644 index 000000000000..c0e22921ace1 --- /dev/null +++ b/arch/x86/hyperv/hv_crash.c @@ -0,0 +1,642 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * X86 specific Hyper-V root partition kdump/crash support module + * + * Copyright (C) 2025, Microsoft, Inc. + * + * This module implements hypervisor RAM collection into vmcore for both + * cases of the hypervisor crash and Linux root crash. Hyper-V implements + * a disable hypercall with a 32bit protected mode ABI callback. This + * mechanism must be used to unlock hypervisor RAM. Since the hypervisor RAM + * is already mapped in Linux, it is automatically collected into Linux vmcore, + * and can be examined by the crash command (raw RAM dump) or windbg. + * + * At a high level: + * + * Hypervisor Crash: + * Upon crash, hypervisor goes into an emergency minimal dispatch loop, a + * restrictive mode with very limited hypercall and MSR support. Each cpu + * then injects NMIs into root vcpus. A shared page is used to check + * by Linux in the NMI handler if the hypervisor has crashed. This shared + * page is setup in hv_root_crash_init during boot. 
+ * + * Linux Crash: + * In case of Linux crash, the callback hv_crash_stop_other_cpus will send + * NMIs to all cpus, then proceed to the crash_nmi_callback where it waits + * for all cpus to be in NMI. + * + * NMI Handler (upon quorum): + * Eventually, in both cases, all cpus will end up in the NMI handler. + * Hyper-V requires the disable hypervisor must be done from the BSP. So + * the BSP NMI handler saves current context, does some fixups and makes + * the hypercall to disable the hypervisor, ie, devirtualize. Hypervisor + * at that point will suspend all vcpus (except the BSP), unlock all its + * RAM, and return to Linux at the 32bit mode entry RIP. + * + * Linux 32bit entry trampoline will then restore long mode and call C + * function here to restore context and continue execution to crash kexec. + */ + +#include <linux/delay.h> +#include <linux/kexec.h> +#include <linux/crash_dump.h> +#include <linux/panic.h> +#include <asm/apic.h> +#include <asm/desc.h> +#include <asm/page.h> +#include <asm/pgalloc.h> +#include <asm/mshyperv.h> +#include <asm/nmi.h> +#include <asm/idtentry.h> +#include <asm/reboot.h> +#include <asm/intel_pt.h> + +bool hv_crash_enabled; +EXPORT_SYMBOL_GPL(hv_crash_enabled); + +struct hv_crash_ctxt { + ulong rsp; + ulong cr0; + ulong cr2; + ulong cr4; + ulong cr8; + + u16 cs; + u16 ss; + u16 ds; + u16 es; + u16 fs; + u16 gs; + + u16 gdt_fill; + struct desc_ptr gdtr; + char idt_fill[6]; + struct desc_ptr idtr; + + u64 gsbase; + u64 efer; + u64 pat; +}; +static struct hv_crash_ctxt hv_crash_ctxt; + +/* Shared hypervisor page that contains crash dump area we peek into. + * NB: windbg looks for "hv_cda" symbol so don't change it. + */ +static struct hv_crashdump_area *hv_cda; + +static u32 trampoline_pa, devirt_arg; +static atomic_t crash_cpus_wait; +static void *hv_crash_ptpgs[4]; +static bool hv_has_crashed, lx_has_crashed; + +static void __noreturn hv_panic_timeout_reboot(void) +{ + #define PANIC_TIMER_STEP 100 + + if (panic_timeout > 0) { + int i; + + for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) + mdelay(PANIC_TIMER_STEP); + } + + if (panic_timeout) + native_wrmsrq(HV_X64_MSR_RESET, 1); /* get hyp to reboot */ + + for (;;) + cpu_relax(); +} + +/* This cannot be inlined as it needs stack */ +static noinline __noclone void hv_crash_restore_tss(void) +{ + load_TR_desc(); +} + +/* This cannot be inlined as it needs stack */ +static noinline void hv_crash_clear_kernpt(void) +{ + pgd_t *pgd; + p4d_t *p4d; + + /* Clear entry so it's not confusing to someone looking at the core */ + pgd = pgd_offset_k(trampoline_pa); + p4d = p4d_offset(pgd, trampoline_pa); + native_p4d_clear(p4d); +} + +/* + * This is the C entry point from the asm glue code after the disable hypercall. + * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel + * page tables with our below 4G page identity mapped, but using a temporary + * GDT. ds/fs/gs/es are null. ss is not usable. bp is null. stack is not + * available. We restore kernel GDT, and rest of the context, and continue + * to kexec. 
+ */ +static asmlinkage void __noreturn hv_crash_c_entry(void) +{ + struct hv_crash_ctxt *ctxt = &hv_crash_ctxt; + + /* first thing, restore kernel gdt */ + native_load_gdt(&ctxt->gdtr); + + asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss)); + asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp)); + + asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds)); + asm volatile("movw %%ax, %%es" : : "a"(ctxt->es)); + asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs)); + asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs)); + + native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat); + asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0)); + + asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8)); + asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4)); + asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr4)); + + native_load_idt(&ctxt->idtr); + native_wrmsrq(MSR_GS_BASE, ctxt->gsbase); + native_wrmsrq(MSR_EFER, ctxt->efer); + + /* restore the original kernel CS now via far return */ + asm volatile("movzwq %0, %%rax\n\t" + "pushq %%rax\n\t" + "pushq $1f\n\t" + "lretq\n\t" + "1:nop\n\t" : : "m"(ctxt->cs) : "rax"); + + /* We are in asmlinkage without stack frame, hence make C function + * calls which will buy stack frames. + */ + hv_crash_restore_tss(); + hv_crash_clear_kernpt(); + + /* we are now fully in devirtualized normal kernel mode */ + __crash_kexec(NULL); + + hv_panic_timeout_reboot(); +} +/* Tell gcc we are using lretq long jump in the above function intentionally */ +STACK_FRAME_NON_STANDARD(hv_crash_c_entry); + +static void hv_mark_tss_not_busy(void) +{ + struct desc_struct *desc = get_current_gdt_rw(); + tss_desc tss; + + memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc)); + tss.type = 0x9; /* available 64-bit TSS. 0xB is busy TSS */ + write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS); +} + +/* Save essential context */ +static void hv_hvcrash_ctxt_save(void) +{ + struct hv_crash_ctxt *ctxt = &hv_crash_ctxt; + + asm volatile("movq %%rsp,%0" : "=m"(ctxt->rsp)); + + ctxt->cr0 = native_read_cr0(); + ctxt->cr4 = native_read_cr4(); + + asm volatile("movq %%cr2, %0" : "=a"(ctxt->cr2)); + asm volatile("movq %%cr8, %0" : "=a"(ctxt->cr8)); + + asm volatile("movl %%cs, %%eax" : "=a"(ctxt->cs)); + asm volatile("movl %%ss, %%eax" : "=a"(ctxt->ss)); + asm volatile("movl %%ds, %%eax" : "=a"(ctxt->ds)); + asm volatile("movl %%es, %%eax" : "=a"(ctxt->es)); + asm volatile("movl %%fs, %%eax" : "=a"(ctxt->fs)); + asm volatile("movl %%gs, %%eax" : "=a"(ctxt->gs)); + + native_store_gdt(&ctxt->gdtr); + store_idt(&ctxt->idtr); + + ctxt->gsbase = __rdmsr(MSR_GS_BASE); + ctxt->efer = __rdmsr(MSR_EFER); + ctxt->pat = __rdmsr(MSR_IA32_CR_PAT); +} + +/* Add trampoline page to the kernel pagetable for transition to kernel PT */ +static void hv_crash_fixup_kernpt(void) +{ + pgd_t *pgd; + p4d_t *p4d; + + pgd = pgd_offset_k(trampoline_pa); + p4d = p4d_offset(pgd, trampoline_pa); + + /* trampoline_pa is below 4G, so no pre-existing entry to clobber */ + p4d_populate(&init_mm, p4d, (pud_t *)hv_crash_ptpgs[1]); + p4d->p4d = p4d->p4d & ~(_PAGE_NX); /* enable execute */ +} + +/* + * Notify the hyp that Linux has crashed. This will cause the hyp to quiesce + * and suspend all guest VPs. 
+ */ +static void hv_notify_prepare_hyp(void) +{ + u64 status; + struct hv_input_notify_partition_event *input; + struct hv_partition_event_root_crashdump_input *cda; + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + cda = &input->input.crashdump_input; + memset(input, 0, sizeof(*input)); + input->event = HV_PARTITION_EVENT_ROOT_CRASHDUMP; + + cda->crashdump_action = HV_CRASHDUMP_ENTRY; + status = hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL); + if (!hv_result_success(status)) + return; + + cda->crashdump_action = HV_CRASHDUMP_SUSPEND_ALL_VPS; + hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL); +} + +/* + * Common function for all cpus before devirtualization. + * + * Hypervisor crash: all cpus get here in NMI context. + * Linux crash: the panicing cpu gets here at base level, all others in NMI + * context. Note, panicing cpu may not be the BSP. + * + * The function is not inlined so it will show on the stack. It is named so + * because the crash cmd looks for certain well known function names on the + * stack before looking into the cpu saved note in the elf section, and + * that work is currently incomplete. + * + * Notes: + * Hypervisor crash: + * - the hypervisor is in a very restrictive mode at this point and any + * vmexit it cannot handle would result in reboot. So, no mumbo jumbo, + * just get to kexec as quickly as possible. + * + * Devirtualization is supported from the BSP only at present. + */ +static noinline __noclone void crash_nmi_callback(struct pt_regs *regs) +{ + struct hv_input_disable_hyp_ex *input; + u64 status; + int msecs = 1000, ccpu = smp_processor_id(); + + if (ccpu == 0) { + /* crash_save_cpu() will be done in the kexec path */ + cpu_emergency_stop_pt(); /* disable performance trace */ + atomic_inc(&crash_cpus_wait); + } else { + crash_save_cpu(regs, ccpu); + cpu_emergency_stop_pt(); /* disable performance trace */ + atomic_inc(&crash_cpus_wait); + for (;;) + cpu_relax(); + } + + while (atomic_read(&crash_cpus_wait) < num_online_cpus() && msecs--) + mdelay(1); + + stop_nmi(); + if (!hv_has_crashed) + hv_notify_prepare_hyp(); + + if (crashing_cpu == -1) + crashing_cpu = ccpu; /* crash cmd uses this */ + + hv_hvcrash_ctxt_save(); + hv_mark_tss_not_busy(); + hv_crash_fixup_kernpt(); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->rip = trampoline_pa; + input->arg = devirt_arg; + + status = hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL); + + hv_panic_timeout_reboot(); +} + + +static DEFINE_SPINLOCK(hv_crash_reboot_lk); + +/* + * Generic NMI callback handler: could be called without any crash also. + * hv crash: hypervisor injects NMI's into all cpus + * lx crash: panicing cpu sends NMI to all but self via crash_stop_other_cpus + */ +static int hv_crash_nmi_local(unsigned int cmd, struct pt_regs *regs) +{ + if (!hv_has_crashed && hv_cda && hv_cda->cda_valid) + hv_has_crashed = true; + + if (!hv_has_crashed && !lx_has_crashed) + return NMI_DONE; /* ignore the NMI */ + + if (hv_has_crashed && !kexec_crash_loaded()) { + if (spin_trylock(&hv_crash_reboot_lk)) + hv_panic_timeout_reboot(); + else + for (;;) + cpu_relax(); + } + + crash_nmi_callback(regs); + + return NMI_DONE; +} + +/* + * hv_crash_stop_other_cpus() == smp_ops.crash_stop_other_cpus + * + * On normal Linux panic, this is called twice: first from panic and then again + * from native_machine_crash_shutdown. + * + * In case of hyperv, 3 ways to get here: + * 1. 
hv crash (only BSP will get here): + * BSP : NMI callback -> DisableHv -> hv_crash_asm32 -> hv_crash_c_entry + * -> __crash_kexec -> native_machine_crash_shutdown + * -> crash_smp_send_stop -> smp_ops.crash_stop_other_cpus + * Linux panic: + * 2. panic cpu x: panic() -> crash_smp_send_stop + * -> smp_ops.crash_stop_other_cpus + * 3. BSP: native_machine_crash_shutdown -> crash_smp_send_stop + * + * NB: noclone and non standard stack because of call to crash_setup_regs(). + */ +static void __noclone hv_crash_stop_other_cpus(void) +{ + static bool crash_stop_done; + struct pt_regs lregs; + int ccpu = smp_processor_id(); + + if (hv_has_crashed) + return; /* all cpus already in NMI handler path */ + + if (!kexec_crash_loaded()) { + hv_notify_prepare_hyp(); + hv_panic_timeout_reboot(); /* no return */ + } + + /* If the hv crashes also, we could come here again before cpus_stopped + * is set in crash_smp_send_stop(). So use our own check. + */ + if (crash_stop_done) + return; + crash_stop_done = true; + + /* Linux has crashed: hv is healthy, we can IPI safely */ + lx_has_crashed = true; + wmb(); /* NMI handlers look at lx_has_crashed */ + + apic->send_IPI_allbutself(NMI_VECTOR); + + if (crashing_cpu == -1) + crashing_cpu = ccpu; /* crash cmd uses this */ + + /* crash_setup_regs() happens in kexec also, but for the kexec cpu which + * is the BSP. We could be here on non-BSP cpu, collect regs if so. + */ + if (ccpu) + crash_setup_regs(&lregs, NULL); + + crash_nmi_callback(&lregs); +} +STACK_FRAME_NON_STANDARD(hv_crash_stop_other_cpus); + +/* This GDT is accessed in IA32-e compat mode which uses 32bits addresses */ +struct hv_gdtreg_32 { + u16 fill; + u16 limit; + u32 address; +} __packed; + +/* We need a CS with L bit to goto IA32-e long mode from 32bit compat mode */ +struct hv_crash_tramp_gdt { + u64 null; /* index 0, selector 0, null selector */ + u64 cs64; /* index 1, selector 8, cs64 selector */ +} __packed; + +/* No stack, so jump via far ptr in memory to load the 64bit CS */ +struct hv_cs_jmptgt { + u32 address; + u16 csval; + u16 fill; +} __packed; + +/* Linux use only, hypervisor doesn't look at this struct */ +struct hv_crash_tramp_data { + u64 tramp32_cr3; + u64 kernel_cr3; + struct hv_gdtreg_32 gdtr32; + struct hv_crash_tramp_gdt tramp_gdt; + struct hv_cs_jmptgt cs_jmptgt; + u64 c_entry_addr; +} __packed; + +/* + * Setup a temporary gdt to allow the asm code to switch to the long mode. + * Since the asm code is relocated/copied to a below 4G page, it cannot use rip + * relative addressing, hence we must use trampoline_pa here. Also, save other + * info like jmp and C entry targets for same reasons. 
+ * + * Returns: 0 on success, -1 on error + */ +static int hv_crash_setup_trampdata(u64 trampoline_va) +{ + int size, offs; + void *dest; + struct hv_crash_tramp_data *tramp; + + /* These must match exactly the ones in the corresponding asm file */ + BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, tramp32_cr3) != 0); + BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, kernel_cr3) != 8); + BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, gdtr32.limit) != 18); + BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, + cs_jmptgt.address) != 40); + BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, c_entry_addr) != 48); + + /* hv_crash_asm_end is beyond last byte by 1 */ + size = &hv_crash_asm_end - &hv_crash_asm32; + if (size + sizeof(struct hv_crash_tramp_data) > PAGE_SIZE) { + pr_err("%s: trampoline page overflow\n", __func__); + return -1; + } + + dest = (void *)trampoline_va; + memcpy(dest, &hv_crash_asm32, size); + + dest += size; + dest = (void *)round_up((ulong)dest, 16); + tramp = (struct hv_crash_tramp_data *)dest; + + /* see MAX_ASID_AVAILABLE in tlb.c: "PCID 0 is reserved for use by + * non-PCID-aware users". Build cr3 with pcid 0 + */ + tramp->tramp32_cr3 = __sme_pa(hv_crash_ptpgs[0]); + + /* Note, when restoring X86_CR4_PCIDE, cr3[11:0] must be zero */ + tramp->kernel_cr3 = __sme_pa(init_mm.pgd); + + tramp->gdtr32.limit = sizeof(struct hv_crash_tramp_gdt); + tramp->gdtr32.address = trampoline_pa + + (ulong)&tramp->tramp_gdt - trampoline_va; + + /* base:0 limit:0xfffff type:b dpl:0 P:1 L:1 D:0 avl:0 G:1 */ + tramp->tramp_gdt.cs64 = 0x00af9a000000ffff; + + tramp->cs_jmptgt.csval = 0x8; + offs = (ulong)&hv_crash_asm64 - (ulong)&hv_crash_asm32; + tramp->cs_jmptgt.address = trampoline_pa + offs; + + tramp->c_entry_addr = (u64)&hv_crash_c_entry; + + devirt_arg = trampoline_pa + (ulong)dest - trampoline_va; + + return 0; +} + +/* + * Build 32bit trampoline page table for transition from protected mode + * non-paging to long-mode paging. This transition needs pagetables below 4G. + */ +static void hv_crash_build_tramp_pt(void) +{ + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + u64 pa, addr = trampoline_pa; + + p4d = hv_crash_ptpgs[0] + pgd_index(addr) * sizeof(p4d); + pa = virt_to_phys(hv_crash_ptpgs[1]); + set_p4d(p4d, __p4d(_PAGE_TABLE | pa)); + p4d->p4d &= ~(_PAGE_NX); /* enable execute */ + + pud = hv_crash_ptpgs[1] + pud_index(addr) * sizeof(pud); + pa = virt_to_phys(hv_crash_ptpgs[2]); + set_pud(pud, __pud(_PAGE_TABLE | pa)); + + pmd = hv_crash_ptpgs[2] + pmd_index(addr) * sizeof(pmd); + pa = virt_to_phys(hv_crash_ptpgs[3]); + set_pmd(pmd, __pmd(_PAGE_TABLE | pa)); + + pte = hv_crash_ptpgs[3] + pte_index(addr) * sizeof(pte); + set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); +} + +/* + * Setup trampoline for devirtualization: + * - a page below 4G, ie 32bit addr containing asm glue code that hyp jmps to + * in protected mode. + * - 4 pages for a temporary page table that asm code uses to turn paging on + * - a temporary gdt to use in the compat mode. 
+ * + * Returns: 0 on success + */ +static int hv_crash_trampoline_setup(void) +{ + int i, rc, order; + struct page *page; + u64 trampoline_va; + gfp_t flags32 = GFP_KERNEL | GFP_DMA32 | __GFP_ZERO; + + /* page for 32bit trampoline assembly code + hv_crash_tramp_data */ + page = alloc_page(flags32); + if (page == NULL) { + pr_err("%s: failed to alloc asm stub page\n", __func__); + return -1; + } + + trampoline_va = (u64)page_to_virt(page); + trampoline_pa = (u32)page_to_phys(page); + + order = 2; /* alloc 2^2 pages */ + page = alloc_pages(flags32, order); + if (page == NULL) { + pr_err("%s: failed to alloc pt pages\n", __func__); + free_page(trampoline_va); + return -1; + } + + for (i = 0; i < 4; i++, page++) + hv_crash_ptpgs[i] = page_to_virt(page); + + hv_crash_build_tramp_pt(); + + rc = hv_crash_setup_trampdata(trampoline_va); + if (rc) + goto errout; + + return 0; + +errout: + free_page(trampoline_va); + free_pages((ulong)hv_crash_ptpgs[0], order); + + return rc; +} + +/* Setup for kdump kexec to collect hypervisor RAM when running as root */ +void hv_root_crash_init(void) +{ + int rc; + struct hv_input_get_system_property *input; + struct hv_output_get_system_property *output; + unsigned long flags; + u64 status; + union hv_pfn_range cda_info; + + if (pgtable_l5_enabled()) { + pr_err("Hyper-V: crash dump not yet supported on 5level PTs\n"); + return; + } + + rc = register_nmi_handler(NMI_LOCAL, hv_crash_nmi_local, NMI_FLAG_FIRST, + "hv_crash_nmi"); + if (rc) { + pr_err("Hyper-V: failed to register crash nmi handler\n"); + return; + } + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->property_id = HV_SYSTEM_PROPERTY_CRASHDUMPAREA; + + status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); + cda_info.as_uint64 = output->hv_cda_info.as_uint64; + local_irq_restore(flags); + + if (!hv_result_success(status)) { + pr_err("Hyper-V: %s: property:%d %s\n", __func__, + input->property_id, hv_result_to_string(status)); + goto err_out; + } + + if (cda_info.base_pfn == 0) { + pr_err("Hyper-V: hypervisor crash dump area pfn is 0\n"); + goto err_out; + } + + hv_cda = phys_to_virt(cda_info.base_pfn << HV_HYP_PAGE_SHIFT); + + rc = hv_crash_trampoline_setup(); + if (rc) + goto err_out; + + smp_ops.crash_stop_other_cpus = hv_crash_stop_other_cpus; + + crash_kexec_post_notifiers = true; + hv_crash_enabled = true; + pr_info("Hyper-V: both linux and hypervisor kdump support enabled\n"); + + return; + +err_out: + unregister_nmi_handler(NMI_LOCAL, "hv_crash_nmi"); + pr_err("Hyper-V: only linux root kdump support enabled\n"); +} diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 507d98331e7c..14de43f4bc6c 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -7,36 +7,74 @@ * Author : K. Y. 
Srinivasan <kys@microsoft.com> */ +#define pr_fmt(fmt) "Hyper-V: " fmt + #include <linux/efi.h> #include <linux/types.h> #include <linux/bitfield.h> #include <linux/io.h> #include <asm/apic.h> #include <asm/desc.h> +#include <asm/e820/api.h> #include <asm/sev.h> -#include <asm/ibt.h> #include <asm/hypervisor.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> +#include <asm/msr.h> #include <asm/idtentry.h> +#include <asm/set_memory.h> #include <linux/kexec.h> #include <linux/version.h> #include <linux/vmalloc.h> #include <linux/mm.h> -#include <linux/hyperv.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/cpuhotplug.h> #include <linux/syscore_ops.h> #include <clocksource/hyperv_timer.h> #include <linux/highmem.h> - -int hyperv_init_cpuhp; -u64 hv_current_partition_id = ~0ull; -EXPORT_SYMBOL_GPL(hv_current_partition_id); +#include <linux/export.h> void *hv_hypercall_pg; + +#ifdef CONFIG_X86_64 +static u64 __hv_hyperfail(u64 control, u64 param1, u64 param2) +{ + return U64_MAX; +} + +DEFINE_STATIC_CALL(__hv_hypercall, __hv_hyperfail); + +u64 hv_std_hypercall(u64 control, u64 param1, u64 param2) +{ + u64 hv_status; + + register u64 __r8 asm("r8") = param2; + asm volatile ("call " STATIC_CALL_TRAMP_STR(__hv_hypercall) + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (param1), "+r" (__r8) + : : "cc", "memory", "r9", "r10", "r11"); + + return hv_status; +} + +typedef u64 (*hv_hypercall_f)(u64 control, u64 param1, u64 param2); + +static inline void hv_set_hypercall_pg(void *ptr) +{ + hv_hypercall_pg = ptr; + + if (!ptr) + ptr = &__hv_hyperfail; + static_call_update(__hv_hypercall, (hv_hypercall_f)ptr); +} +#else +static inline void hv_set_hypercall_pg(void *ptr) +{ + hv_hypercall_pg = ptr; +} EXPORT_SYMBOL_GPL(hv_hypercall_pg); +#endif union hv_ghcb * __percpu *hv_ghcb_pg; @@ -52,7 +90,7 @@ static int hyperv_init_ghcb(void) void *ghcb_va; void **ghcb_base; - if (!hv_isolation_type_snp()) + if (!ms_hyperv.paravisor_present || !hv_isolation_type_snp()) return 0; if (!hv_ghcb_pg) @@ -63,7 +101,7 @@ static int hyperv_init_ghcb(void) * returned by MSR_AMD64_SEV_ES_GHCB is above shared * memory boundary and map it here. */ - rdmsrl(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa); + rdmsrq(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa); /* Mask out vTOM bit. ioremap_cache() maps decrypted */ ghcb_gpa &= ~ms_hyperv.shared_gpa_boundary; @@ -80,7 +118,7 @@ static int hyperv_init_ghcb(void) static int hv_cpu_init(unsigned int cpu) { union hv_vp_assist_msr_contents msr = { 0 }; - struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu]; + struct hv_vp_assist_page **hvp; int ret; ret = hv_common_cpu_init(cpu); @@ -90,12 +128,13 @@ static int hv_cpu_init(unsigned int cpu) if (!hv_vp_assist_page) return 0; - if (hv_root_partition) { + hvp = &hv_vp_assist_page[cpu]; + if (hv_root_partition()) { /* * For root partition we get the hypervisor provided VP assist * page, instead of allocating a new page. */ - rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + rdmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); *hvp = memremap(msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT, PAGE_SIZE, MEMREMAP_WB); } else { @@ -107,17 +146,34 @@ static int hv_cpu_init(unsigned int cpu) * in hv_cpu_die(), otherwise a CPU may not be stopped in the * case of CPU offlining and the VM will hang. */ - if (!*hvp) + if (!*hvp) { *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); + + /* + * Hyper-V should never specify a VM that is a Confidential + * VM and also running in the root partition. 
Root partition + * is blocked to run in Confidential VM. So only decrypt assist + * page in non-root partition here. + */ + if (*hvp && !ms_hyperv.paravisor_present && hv_isolation_type_snp()) { + WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1)); + memset(*hvp, 0, PAGE_SIZE); + } + } + if (*hvp) msr.pfn = vmalloc_to_pfn(*hvp); } if (!WARN_ON(!(*hvp))) { msr.enable = 1; - wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); } + /* Allow Hyper-V stimer vector to be injected from Hypervisor. */ + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) + apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, true); + return hyperv_init_ghcb(); } @@ -127,7 +183,7 @@ static void hv_reenlightenment_notify(struct work_struct *dummy) { struct hv_tsc_emulation_status emu_status; - rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + rdmsrq(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); /* Don't issue the callback if TSC accesses are not emulated */ if (hv_reenlightenment_cb && emu_status.inprogress) @@ -140,11 +196,11 @@ void hyperv_stop_tsc_emulation(void) u64 freq; struct hv_tsc_emulation_status emu_status; - rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + rdmsrq(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); emu_status.inprogress = 0; - wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + wrmsrq(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); - rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq); tsc_khz = div64_u64(freq, 1000); } EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); @@ -176,7 +232,7 @@ void set_hv_tscchange_cb(void (*cb)(void)) struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; if (!hv_reenlightenment_available()) { - pr_warn("Hyper-V: reenlightenment support is unavailable\n"); + pr_warn("reenlightenment support is unavailable\n"); return; } @@ -190,8 +246,8 @@ void set_hv_tscchange_cb(void (*cb)(void)) re_ctrl.target_vp = hv_vp_index[get_cpu()]; - wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); - wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); + wrmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + wrmsrq(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); put_cpu(); } @@ -204,9 +260,9 @@ void clear_hv_tscchange_cb(void) if (!hv_reenlightenment_available()) return; - rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + rdmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); re_ctrl.enabled = 0; - wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + wrmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); hv_reenlightenment_cb = NULL; } @@ -225,11 +281,14 @@ static int hv_cpu_die(unsigned int cpu) *ghcb_va = NULL; } + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) + apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, false); + hv_common_cpu_die(cpu); if (hv_vp_assist_page && hv_vp_assist_page[cpu]) { union hv_vp_assist_msr_contents msr = { 0 }; - if (hv_root_partition) { + if (hv_root_partition()) { /* * For root partition the VP assist page is mapped to * hypervisor provided page, and thus we unmap the @@ -238,16 +297,16 @@ static int hv_cpu_die(unsigned int cpu) */ memunmap(hv_vp_assist_page[cpu]); hv_vp_assist_page[cpu] = NULL; - rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + rdmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); msr.enable = 0; } - wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, 
msr.as_uint64); } if (hv_reenlightenment_cb == NULL) return 0; - rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + rdmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); if (re_ctrl.target_vp == hv_vp_index[cpu]) { /* * Reassign reenlightenment notifications to some other online @@ -261,7 +320,7 @@ static int hv_cpu_die(unsigned int cpu) else re_ctrl.enabled = 0; - wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + wrmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); } return 0; @@ -269,26 +328,42 @@ static int hv_cpu_die(unsigned int cpu) static int __init hv_pci_init(void) { - int gen2vm = efi_enabled(EFI_BOOT); + bool gen2vm = efi_enabled(EFI_BOOT); /* - * For Generation-2 VM, we exit from pci_arch_init() by returning 0. - * The purpose is to suppress the harmless warning: + * A Generation-2 VM doesn't support legacy PCI/PCIe, so both + * raw_pci_ops and raw_pci_ext_ops are NULL, and pci_subsys_init() -> + * pcibios_init() doesn't call pcibios_resource_survey() -> + * e820__reserve_resources_late(); as a result, any emulated persistent + * memory of E820_TYPE_PRAM (12) via the kernel parameter + * memmap=nn[KMG]!ss is not added into iomem_resource and hence can't be + * detected by register_e820_pmem(). Fix this by directly calling + * e820__reserve_resources_late() here: e820__reserve_resources_late() + * depends on e820__reserve_resources(), which has been called earlier + * from setup_arch(). Note: e820__reserve_resources_late() also adds + * any memory of E820_TYPE_PMEM (7) into iomem_resource, and + * acpi_nfit_register_region() -> acpi_nfit_insert_resource() -> + * region_intersects() returns REGION_INTERSECTS, so the memory of + * E820_TYPE_PMEM won't get added twice. + * + * We return 0 here so that pci_arch_init() won't print the warning: * "PCI: Fatal: No config space access function found" */ - if (gen2vm) + if (gen2vm) { + e820__reserve_resources_late(); return 0; + } /* For Generation-1 VM, we'll proceed in pci_arch_init(). */ return 1; } -static int hv_suspend(void) +static int hv_suspend(void *data) { union hv_x64_msr_hypercall_contents hypercall_msr; int ret; - if (hv_root_partition) + if (hv_root_partition()) return -EPERM; /* @@ -299,18 +374,18 @@ static int hv_suspend(void) * pointer is restored on resume. 
*/ hv_hypercall_pg_saved = hv_hypercall_pg; - hv_hypercall_pg = NULL; + hv_set_hypercall_pg(NULL); /* Disable the hypercall page in the hypervisor */ - rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 0; - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); ret = hv_cpu_die(0); return ret; } -static void hv_resume(void) +static void hv_resume(void *data) { union hv_x64_msr_hypercall_contents hypercall_msr; int ret; @@ -319,13 +394,13 @@ static void hv_resume(void) WARN_ON(ret); /* Re-enable the hypercall page */ - rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg_saved); - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); - hv_hypercall_pg = hv_hypercall_pg_saved; + hv_set_hypercall_pg(hv_hypercall_pg_saved); hv_hypercall_pg_saved = NULL; /* @@ -337,11 +412,15 @@ static void hv_resume(void) } /* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */ -static struct syscore_ops hv_syscore_ops = { +static const struct syscore_ops hv_syscore_ops = { .suspend = hv_suspend, .resume = hv_resume, }; +static struct syscore hv_syscore = { + .ops = &hv_syscore_ops, +}; + static void (* __initdata old_setup_percpu_clockev)(void); static void __init hv_stimer_setup_percpu_clockev(void) @@ -361,24 +440,6 @@ static void __init hv_stimer_setup_percpu_clockev(void) old_setup_percpu_clockev(); } -static void __init hv_get_partition_id(void) -{ - struct hv_get_partition_id *output_page; - u64 status; - unsigned long flags; - - local_irq_save(flags); - output_page = *this_cpu_ptr(hyperv_pcpu_output_arg); - status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page); - if (!hv_result_success(status)) { - /* No point in proceeding if this failed */ - pr_err("Failed to get partition ID: %lld\n", status); - BUG(); - } - hv_current_partition_id = output_page->partition_id; - local_irq_restore(flags); -} - /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -399,14 +460,24 @@ void __init hyperv_init(void) if (hv_common_init()) return; - hv_vp_assist_page = kcalloc(num_possible_cpus(), - sizeof(*hv_vp_assist_page), GFP_KERNEL); + /* + * The VP assist page is useless to a TDX guest: the only use we + * would have for it is lazy EOI, which can not be used with TDX. + */ + if (hv_isolation_type_tdx()) + hv_vp_assist_page = NULL; + else + hv_vp_assist_page = kcalloc(nr_cpu_ids, + sizeof(*hv_vp_assist_page), + GFP_KERNEL); if (!hv_vp_assist_page) { ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; - goto common_free; + + if (!hv_isolation_type_tdx()) + goto common_free; } - if (hv_isolation_type_snp()) { + if (ms_hyperv.paravisor_present && hv_isolation_type_snp()) { /* Negotiate GHCB Version. */ if (!hv_ghcb_negotiate_protocol()) hv_ghcb_terminate(SEV_TERM_SET_GEN, @@ -426,24 +497,44 @@ void __init hyperv_init(void) * Setup the hypercall page and enable hypercalls. * 1. Register the guest ID * 2. Enable the hypercall and register the hypercall page + * + * A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg: + * when the hypercall input is a page, such a VM must pass a decrypted + * page to Hyper-V, e.g. 
hv_post_message() uses the per-CPU page + * hyperv_pcpu_input_arg, which is decrypted if no paravisor is present. + * + * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls, + * which are handled by the paravisor and the VM must use an encrypted + * input page: in such a VM, the hyperv_pcpu_input_arg is encrypted and + * used in the hypercalls, e.g. see hv_mark_gpa_visibility() and + * hv_arch_irq_unmask(). Such a VM uses TDX GHCI for two hypercalls: + * 1. HVCALL_SIGNAL_EVENT: see vmbus_set_event() and _hv_do_fast_hypercall8(). + * 2. HVCALL_POST_MESSAGE: the input page must be a decrypted page, i.e. + * hv_post_message() in such a VM can't use the encrypted hyperv_pcpu_input_arg; + * instead, hv_post_message() uses the post_msg_page, which is decrypted + * in such a VM and is only used in such a VM. */ guest_id = hv_generate_guest_id(LINUX_VERSION_CODE); - wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + wrmsrq(HV_X64_MSR_GUEST_OS_ID, guest_id); - /* Hyper-V requires to write guest os id via ghcb in SNP IVM. */ - hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id); + /* With the paravisor, the VM must also write the ID via GHCB/GHCI */ + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id); - hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, - VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX, + /* A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg */ + if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) + goto skip_hypercall_pg_init; + + hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, MODULES_VADDR, + MODULES_END, GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, __builtin_return_address(0)); if (hv_hypercall_pg == NULL) goto clean_guest_os_id; - rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - if (hv_root_partition) { + if (hv_root_partition()) { struct page *pg; void *src; @@ -457,7 +548,7 @@ void __init hyperv_init(void) * so it is populated with code, then copy the code to an * executable page. */ - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); pg = vmalloc_to_page(hv_hypercall_pg); src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE, @@ -467,31 +558,16 @@ void __init hyperv_init(void) memunmap(src); hv_remap_tsc_clocksource(); + hv_root_crash_init(); + hv_sleep_notifiers_register(); } else { hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); } - /* - * Some versions of Hyper-V that provide IBT in guest VMs have a bug - * in that there's no ENDBR64 instruction at the entry to the - * hypercall page. Because hypercalls are invoked via an indirect call - * to the hypercall page, all hypercall attempts fail when IBT is - * enabled, and Linux panics. For such buggy versions, disable IBT. - * - * Fixed versions of Hyper-V always provide ENDBR64 on the hypercall - * page, so if future Linux kernel versions enable IBT for 32-bit - * builds, additional hypercall page hackery will be required here - * to provide an ENDBR32. 
- */ -#ifdef CONFIG_X86_KERNEL_IBT - if (cpu_feature_enabled(X86_FEATURE_IBT) && - *(u32 *)hv_hypercall_pg != gen_endbr()) { - setup_clear_cpu_cap(X86_FEATURE_IBT); - pr_warn("Hyper-V: Disabling IBT because of Hyper-V bug\n"); - } -#endif + hv_set_hypercall_pg(hv_hypercall_pg); +skip_hypercall_pg_init: /* * hyperv_init() is called before LAPIC is initialized: see * apic_intr_mode_init() -> x86_platform.apic_post_init() and @@ -506,33 +582,35 @@ void __init hyperv_init(void) x86_init.pci.arch_init = hv_pci_init; - register_syscore_ops(&hv_syscore_ops); + register_syscore(&hv_syscore); - hyperv_init_cpuhp = cpuhp; - - if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_ACCESS_PARTITION_ID) + if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID) hv_get_partition_id(); - BUG_ON(hv_root_partition && hv_current_partition_id == ~0ull); - #ifdef CONFIG_PCI_MSI /* * If we're running as root, we want to create our own PCI MSI domain. * We can't set this in hv_pci_init because that would be too late. */ - if (hv_root_partition) + if (hv_root_partition()) x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; #endif /* Query the VMs extended capability once, so that it can be cached. */ hv_query_ext_cap(0); + /* Find the VTL */ + ms_hyperv.vtl = get_vtl(); + + if (ms_hyperv.vtl > 0) /* non default VTL */ + hv_vtl_early_init(); + return; clean_guest_os_id: - wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); - cpuhp_remove_state(cpuhp); + wrmsrq(HV_X64_MSR_GUEST_OS_ID, 0); + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); + cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE); free_ghcb_page: free_percpu(hv_ghcb_pg); free_vp_assist_page: @@ -551,8 +629,8 @@ void hyperv_cleanup(void) union hv_reference_tsc_msr tsc_msr; /* Reset our OS id */ - wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); + wrmsrq(HV_X64_MSR_GUEST_OS_ID, 0); + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); /* * Reset hypercall page reference before reset the page, @@ -562,14 +640,14 @@ void hyperv_cleanup(void) hv_hypercall_pg = NULL; /* Reset the hypercall page */ - hypercall_msr.as_uint64 = hv_get_register(HV_X64_MSR_HYPERCALL); + hypercall_msr.as_uint64 = hv_get_msr(HV_X64_MSR_HYPERCALL); hypercall_msr.enable = 0; - hv_set_register(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hv_set_msr(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); /* Reset the TSC page */ - tsc_msr.as_uint64 = hv_get_register(HV_X64_MSR_REFERENCE_TSC); + tsc_msr.as_uint64 = hv_get_msr(HV_X64_MSR_REFERENCE_TSC); tsc_msr.enable = 0; - hv_set_register(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + hv_set_msr(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); } void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) @@ -589,18 +667,18 @@ void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) return; panic_reported = true; - rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + rdmsrq(HV_X64_MSR_GUEST_OS_ID, guest_id); - wrmsrl(HV_X64_MSR_CRASH_P0, err); - wrmsrl(HV_X64_MSR_CRASH_P1, guest_id); - wrmsrl(HV_X64_MSR_CRASH_P2, regs->ip); - wrmsrl(HV_X64_MSR_CRASH_P3, regs->ax); - wrmsrl(HV_X64_MSR_CRASH_P4, regs->sp); + wrmsrq(HV_X64_MSR_CRASH_P0, err); + wrmsrq(HV_X64_MSR_CRASH_P1, guest_id); + wrmsrq(HV_X64_MSR_CRASH_P2, regs->ip); + wrmsrq(HV_X64_MSR_CRASH_P3, regs->ax); + wrmsrq(HV_X64_MSR_CRASH_P4, regs->sp); /* * Let Hyper-V know there is crash data available */ - wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); + wrmsrq(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); } 
EXPORT_SYMBOL_GPL(hyperv_report_panic); @@ -615,13 +693,49 @@ bool hv_is_hyperv_initialized(void) if (x86_hyper_type != X86_HYPER_MS_HYPERV) return false; + /* A TDX VM with no paravisor uses TDX GHCI call rather than hv_hypercall_pg */ + if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) + return true; /* * Verify that earlier initialization succeeded by checking * that the hypercall page is setup */ hypercall_msr.as_uint64 = 0; - rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); return hypercall_msr.enable; } EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized); + +int hv_apicid_to_vp_index(u32 apic_id) +{ + u64 control; + u64 status; + unsigned long irq_flags; + struct hv_get_vp_from_apic_id_in *input; + u32 *output, ret; + + local_irq_save(irq_flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->partition_id = HV_PARTITION_ID_SELF; + input->apic_ids[0] = apic_id; + + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_INDEX_FROM_APIC_ID; + status = hv_do_hypercall(control, input, output); + ret = output[0]; + + local_irq_restore(irq_flags); + + if (!hv_result_success(status)) { + pr_err("failed to get vp index from apic id %d, status %#llx\n", + apic_id, status); + return -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(hv_apicid_to_vp_index); diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c deleted file mode 100644 index 68a0843d4750..000000000000 --- a/arch/x86/hyperv/hv_proc.c +++ /dev/null @@ -1,213 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/types.h> -#include <linux/vmalloc.h> -#include <linux/mm.h> -#include <linux/clockchips.h> -#include <linux/acpi.h> -#include <linux/hyperv.h> -#include <linux/slab.h> -#include <linux/cpuhotplug.h> -#include <linux/minmax.h> -#include <asm/hypervisor.h> -#include <asm/mshyperv.h> -#include <asm/apic.h> - -#include <asm/trace/hyperv.h> - -/* - * See struct hv_deposit_memory. The first u64 is partition ID, the rest - * are GPAs. - */ -#define HV_DEPOSIT_MAX (HV_HYP_PAGE_SIZE / sizeof(u64) - 1) - -/* Deposits exact number of pages. Must be called with interrupts enabled. 
*/ -int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages) -{ - struct page **pages, *page; - int *counts; - int num_allocations; - int i, j, page_count; - int order; - u64 status; - int ret; - u64 base_pfn; - struct hv_deposit_memory *input_page; - unsigned long flags; - - if (num_pages > HV_DEPOSIT_MAX) - return -E2BIG; - if (!num_pages) - return 0; - - /* One buffer for page pointers and counts */ - page = alloc_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - pages = page_address(page); - - counts = kcalloc(HV_DEPOSIT_MAX, sizeof(int), GFP_KERNEL); - if (!counts) { - free_page((unsigned long)pages); - return -ENOMEM; - } - - /* Allocate all the pages before disabling interrupts */ - i = 0; - - while (num_pages) { - /* Find highest order we can actually allocate */ - order = 31 - __builtin_clz(num_pages); - - while (1) { - pages[i] = alloc_pages_node(node, GFP_KERNEL, order); - if (pages[i]) - break; - if (!order) { - ret = -ENOMEM; - num_allocations = i; - goto err_free_allocations; - } - --order; - } - - split_page(pages[i], order); - counts[i] = 1 << order; - num_pages -= counts[i]; - i++; - } - num_allocations = i; - - local_irq_save(flags); - - input_page = *this_cpu_ptr(hyperv_pcpu_input_arg); - - input_page->partition_id = partition_id; - - /* Populate gpa_page_list - these will fit on the input page */ - for (i = 0, page_count = 0; i < num_allocations; ++i) { - base_pfn = page_to_pfn(pages[i]); - for (j = 0; j < counts[i]; ++j, ++page_count) - input_page->gpa_page_list[page_count] = base_pfn + j; - } - status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY, - page_count, 0, input_page, NULL); - local_irq_restore(flags); - if (!hv_result_success(status)) { - pr_err("Failed to deposit pages: %lld\n", status); - ret = hv_result(status); - goto err_free_allocations; - } - - ret = 0; - goto free_buf; - -err_free_allocations: - for (i = 0; i < num_allocations; ++i) { - base_pfn = page_to_pfn(pages[i]); - for (j = 0; j < counts[i]; ++j) - __free_page(pfn_to_page(base_pfn + j)); - } - -free_buf: - free_page((unsigned long)pages); - kfree(counts); - return ret; -} - -int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id) -{ - struct hv_add_logical_processor_in *input; - struct hv_add_logical_processor_out *output; - u64 status; - unsigned long flags; - int ret = HV_STATUS_SUCCESS; - int pxm = node_to_pxm(node); - - /* - * When adding a logical processor, the hypervisor may return - * HV_STATUS_INSUFFICIENT_MEMORY. When that happens, we deposit more - * pages and retry. 
- */ - do { - local_irq_save(flags); - - input = *this_cpu_ptr(hyperv_pcpu_input_arg); - /* We don't do anything with the output right now */ - output = *this_cpu_ptr(hyperv_pcpu_output_arg); - - input->lp_index = lp_index; - input->apic_id = apic_id; - input->flags = 0; - input->proximity_domain_info.domain_id = pxm; - input->proximity_domain_info.flags.reserved = 0; - input->proximity_domain_info.flags.proximity_info_valid = 1; - input->proximity_domain_info.flags.proximity_preferred = 1; - status = hv_do_hypercall(HVCALL_ADD_LOGICAL_PROCESSOR, - input, output); - local_irq_restore(flags); - - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { - if (!hv_result_success(status)) { - pr_err("%s: cpu %u apic ID %u, %lld\n", __func__, - lp_index, apic_id, status); - ret = hv_result(status); - } - break; - } - ret = hv_call_deposit_pages(node, hv_current_partition_id, 1); - } while (!ret); - - return ret; -} - -int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags) -{ - struct hv_create_vp *input; - u64 status; - unsigned long irq_flags; - int ret = HV_STATUS_SUCCESS; - int pxm = node_to_pxm(node); - - /* Root VPs don't seem to need pages deposited */ - if (partition_id != hv_current_partition_id) { - /* The value 90 is empirically determined. It may change. */ - ret = hv_call_deposit_pages(node, partition_id, 90); - if (ret) - return ret; - } - - do { - local_irq_save(irq_flags); - - input = *this_cpu_ptr(hyperv_pcpu_input_arg); - - input->partition_id = partition_id; - input->vp_index = vp_index; - input->flags = flags; - input->subnode_type = HvSubnodeAny; - if (node != NUMA_NO_NODE) { - input->proximity_domain_info.domain_id = pxm; - input->proximity_domain_info.flags.reserved = 0; - input->proximity_domain_info.flags.proximity_info_valid = 1; - input->proximity_domain_info.flags.proximity_preferred = 1; - } else { - input->proximity_domain_info.as_uint64 = 0; - } - status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL); - local_irq_restore(irq_flags); - - if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) { - if (!hv_result_success(status)) { - pr_err("%s: vcpu %u, lp %u, %lld\n", __func__, - vp_index, flags, status); - ret = hv_result(status); - } - break; - } - ret = hv_call_deposit_pages(node, partition_id, 1); - - } while (!ret); - - return ret; -} - diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c index 737d6f7a6155..81b006601370 100644 --- a/arch/x86/hyperv/hv_spinlock.c +++ b/arch/x86/hyperv/hv_spinlock.c @@ -15,8 +15,9 @@ #include <asm/mshyperv.h> #include <asm/paravirt.h> #include <asm/apic.h> +#include <asm/msr.h> -static bool __initdata hv_pvspin = true; +static bool hv_pvspin __initdata = true; static void hv_qlock_kick(int cpu) { @@ -39,18 +40,18 @@ static void hv_qlock_wait(u8 *byte, u8 val) * To prevent a race against the unlock path it is required to * disable interrupts before accessing the HV_X64_MSR_GUEST_IDLE * MSR. Otherwise, if the IPI from hv_qlock_kick() arrives between - * the lock value check and the rdmsrl() then the vCPU might be put + * the lock value check and the rdmsrq() then the vCPU might be put * into 'idle' state by the hypervisor and kept in that state for * an unspecified amount of time. */ local_irq_save(flags); /* - * Only issue the rdmsrl() when the lock state has not changed. + * Only issue the rdmsrq() when the lock state has not changed. 
*/ if (READ_ONCE(*byte) == val) { unsigned long msr_val; - rdmsrl(HV_X64_MSR_GUEST_IDLE, msr_val); + rdmsrq(HV_X64_MSR_GUEST_IDLE, msr_val); (void)msr_val; } @@ -64,6 +65,7 @@ __visible bool hv_vcpu_is_preempted(int vcpu) { return false; } + PV_CALLEE_SAVE_REGS_THUNK(hv_vcpu_is_preempted); void __init hv_init_spinlocks(void) diff --git a/arch/x86/hyperv/hv_trampoline.S b/arch/x86/hyperv/hv_trampoline.S new file mode 100644 index 000000000000..25f02ff12286 --- /dev/null +++ b/arch/x86/hyperv/hv_trampoline.S @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * X86 specific Hyper-V kdump/crash related code. + * + * Copyright (C) 2025, Microsoft, Inc. + * + */ +#include <linux/linkage.h> +#include <asm/alternative.h> +#include <asm/msr.h> +#include <asm/processor-flags.h> +#include <asm/nospec-branch.h> + +/* + * void noreturn hv_crash_asm32(arg1) + * arg1 == edi == 32bit PA of struct hv_crash_tramp_data + * + * The hypervisor jumps here upon devirtualization in protected mode. This + * code gets copied to a page in the low 4G ie, 32bit space so it can run + * in the protected mode. Hence we cannot use any compile/link time offsets or + * addresses. It restores long mode via temporary gdt and page tables and + * eventually jumps to kernel code entry at HV_CRASHDATA_OFFS_C_entry. + * + * PreCondition (ie, Hypervisor call back ABI): + * o CR0 is set to 0x0021: PE(prot mode) and NE are set, paging is disabled + * o CR4 is set to 0x0 + * o IA32_EFER is set to 0x901 (SCE and NXE are set) + * o EDI is set to the Arg passed to HVCALL_DISABLE_HYP_EX. + * o CS, DS, ES, FS, GS are all initialized with a base of 0 and limit 0xFFFF + * o IDTR, TR and GDTR are initialized with a base of 0 and limit of 0xFFFF + * o LDTR is initialized as invalid (limit of 0) + * o MSR PAT is power on default. + * o Other state/registers are cleared. All TLBs flushed. + */ + +#define HV_CRASHDATA_OFFS_TRAMPCR3 0x0 /* 0 */ +#define HV_CRASHDATA_OFFS_KERNCR3 0x8 /* 8 */ +#define HV_CRASHDATA_OFFS_GDTRLIMIT 0x12 /* 18 */ +#define HV_CRASHDATA_OFFS_CS_JMPTGT 0x28 /* 40 */ +#define HV_CRASHDATA_OFFS_C_entry 0x30 /* 48 */ + + .text + .code32 + +SYM_CODE_START(hv_crash_asm32) + UNWIND_HINT_UNDEFINED + ENDBR + movl $X86_CR4_PAE, %ecx + movl %ecx, %cr4 + + movl %edi, %ebx + add $HV_CRASHDATA_OFFS_TRAMPCR3, %ebx + movl %cs:(%ebx), %eax + movl %eax, %cr3 + + /* Setup EFER for long mode now */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Turn paging on using the temp 32bit trampoline page table */ + movl %cr0, %eax + orl $(X86_CR0_PG), %eax + movl %eax, %cr0 + + /* since kernel cr3 could be above 4G, we need to be in the long mode + * before we can load 64bits of the kernel cr3. 
We use a temp gdt for + * that with CS.L=1 and CS.D=0 */ + mov %edi, %eax + add $HV_CRASHDATA_OFFS_GDTRLIMIT, %eax + lgdtl %cs:(%eax) + + /* not done yet, restore CS now to switch to CS.L=1 */ + mov %edi, %eax + add $HV_CRASHDATA_OFFS_CS_JMPTGT, %eax + ljmp %cs:*(%eax) +SYM_CODE_END(hv_crash_asm32) + + /* we now run in full 64bit IA32-e long mode, CS.L=1 and CS.D=0 */ + .code64 + .balign 8 +SYM_CODE_START(hv_crash_asm64) + UNWIND_HINT_UNDEFINED + ENDBR + /* restore kernel page tables so we can jump to kernel code */ + mov %edi, %eax + add $HV_CRASHDATA_OFFS_KERNCR3, %eax + movq %cs:(%eax), %rbx + movq %rbx, %cr3 + + mov %edi, %eax + add $HV_CRASHDATA_OFFS_C_entry, %eax + movq %cs:(%eax), %rbx + ANNOTATE_RETPOLINE_SAFE + jmp *%rbx + + int $3 + +SYM_INNER_LABEL(hv_crash_asm_end, SYM_L_GLOBAL) +SYM_CODE_END(hv_crash_asm64) diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c index 36a562218010..c0edaed0efb3 100644 --- a/arch/x86/hyperv/hv_vtl.c +++ b/arch/x86/hyperv/hv_vtl.c @@ -9,25 +9,74 @@ #include <asm/apic.h> #include <asm/boot.h> #include <asm/desc.h> +#include <asm/fpu/api.h> +#include <asm/fpu/types.h> #include <asm/i8259.h> #include <asm/mshyperv.h> +#include <asm/msr.h> #include <asm/realmode.h> +#include <asm/reboot.h> +#include <asm/smap.h> +#include <linux/export.h> +#include <../kernel/smpboot.h> +#include "../../kernel/fpu/legacy.h" extern struct boot_params boot_params; static struct real_mode_header hv_vtl_real_mode_header; +static bool __init hv_vtl_msi_ext_dest_id(void) +{ + return true; +} + +/* + * The `native_machine_emergency_restart` function from `reboot.c` writes + * to the physical address 0x472 to indicate the type of reboot for the + * firmware. We cannot have that in VSM as the memory composition might + * be more generic, and such write effectively corrupts the memory thus + * making diagnostics harder at the very least. + */ +static void __noreturn hv_vtl_emergency_restart(void) +{ + /* + * Cause a triple fault and the immediate reset. Here the code does not run + * on the top of any firmware, whereby cannot reach out to its services. + * The inifinite loop is for the improbable case that the triple fault does + * not work and have to preserve the state intact for debugging. + */ + for (;;) { + idt_invalidate(); + __asm__ __volatile__("int3"); + } +} + +/* + * The only way to restart in the VTL mode is to triple fault as the kernel runs + * as firmware. + */ +static void __noreturn hv_vtl_restart(char __maybe_unused *cmd) +{ + hv_vtl_emergency_restart(); +} + void __init hv_vtl_init_platform(void) { - pr_info("Linux runs in Hyper-V Virtual Trust Level\n"); + /* + * This function is a no-op if the VTL mode is not enabled. + * If it is, this function runs if and only the kernel boots in + * VTL2 which the x86 hv initialization path makes sure of. 
+ */ + pr_info("Linux runs in Hyper-V Virtual Trust Level %d\n", ms_hyperv.vtl); x86_platform.realmode_reserve = x86_init_noop; x86_platform.realmode_init = x86_init_noop; x86_init.irqs.pre_vector_init = x86_init_noop; x86_init.timers.timer_init = x86_init_noop; + x86_init.resources.probe_roms = x86_init_noop; /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; x86_platform.get_wallclock = get_rtc_noop; x86_platform.set_wallclock = set_rtc_noop; @@ -38,6 +87,8 @@ void __init hv_vtl_init_platform(void) x86_platform.legacy.warm_reset = 0; x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 0; + + x86_init.hyper.msi_ext_dest_id = hv_vtl_msi_ext_dest_id; } static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc) @@ -57,7 +108,7 @@ static void hv_vtl_ap_entry(void) ((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params); } -static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored) +static int hv_vtl_bringup_vcpu(u32 target_vp_index, int cpu, u64 eip_ignored) { u64 status; int ret = 0; @@ -71,7 +122,9 @@ static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored) struct ldttss_desc *ldt; struct desc_struct *gdt; - u64 rsp = current->thread.sp; + struct task_struct *idle = idle_thread_get(cpu); + u64 rsp = (unsigned long)idle->thread.sp; + u64 rip = (u64)&hv_vtl_ap_entry; native_store_gdt(&gdt_ptr); @@ -107,11 +160,11 @@ static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored) input->vp_context.rip = rip; input->vp_context.rsp = rsp; input->vp_context.rflags = 0x0000000000000002; - input->vp_context.efer = __rdmsr(MSR_EFER); + input->vp_context.efer = native_rdmsrq(MSR_EFER); input->vp_context.cr0 = native_read_cr0(); input->vp_context.cr3 = __native_read_cr3(); input->vp_context.cr4 = native_read_cr4(); - input->vp_context.msr_cr_pat = __rdmsr(MSR_IA32_CR_PAT); + input->vp_context.msr_cr_pat = native_rdmsrq(MSR_IA32_CR_PAT); input->vp_context.idtr.limit = idt_ptr.size; input->vp_context.idtr.base = idt_ptr.address; input->vp_context.gdtr.limit = gdt_ptr.size; @@ -164,59 +217,30 @@ free_lock: return ret; } -static int hv_vtl_apicid_to_vp_id(u32 apic_id) +static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip, unsigned int cpu) { - u64 control; - u64 status; - unsigned long irq_flags; - struct hv_get_vp_from_apic_id_in *input; - u32 *output, ret; - - local_irq_save(irq_flags); - - input = *this_cpu_ptr(hyperv_pcpu_input_arg); - memset(input, 0, sizeof(*input)); - input->partition_id = HV_PARTITION_ID_SELF; - input->apic_ids[0] = apic_id; - - output = (u32 *)input; - - control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID; - status = hv_do_hypercall(control, input, output); - ret = output[0]; - - local_irq_restore(irq_flags); - - if (!hv_result_success(status)) { - pr_err("failed to get vp id from apic id %d, status %#llx\n", - apic_id, status); - return -EINVAL; - } - - return ret; -} - -static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip) -{ - int vp_id; + int vp_index; pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid); - vp_id = hv_vtl_apicid_to_vp_id(apicid); + vp_index = hv_apicid_to_vp_index(apicid); - if (vp_id < 0) { + if (vp_index < 0) { pr_err("Couldn't find CPU with APIC ID %d\n", apicid); return -EINVAL; } - if (vp_id > 
ms_hyperv.max_vp_index) { - pr_err("Invalid CPU id %d for APIC ID %d\n", vp_id, apicid); + if (vp_index > ms_hyperv.max_vp_index) { + pr_err("Invalid CPU id %d for APIC ID %d\n", vp_index, apicid); return -EINVAL; } - return hv_vtl_bringup_vcpu(vp_id, start_eip); + return hv_vtl_bringup_vcpu(vp_index, cpu, start_eip); } -static int __init hv_vtl_early_init(void) +int __init hv_vtl_early_init(void) { + machine_ops.emergency_restart = hv_vtl_emergency_restart; + machine_ops.restart = hv_vtl_restart; + /* * `boot_cpu_has` returns the runtime feature support, * and here is the earliest it can be used. @@ -230,4 +254,28 @@ static int __init hv_vtl_early_init(void) return 0; } -early_initcall(hv_vtl_early_init); + +DEFINE_STATIC_CALL_NULL(__mshv_vtl_return_hypercall, void (*)(void)); + +void mshv_vtl_return_call_init(u64 vtl_return_offset) +{ + static_call_update(__mshv_vtl_return_hypercall, + (void *)((u8 *)hv_hypercall_pg + vtl_return_offset)); +} +EXPORT_SYMBOL(mshv_vtl_return_call_init); + +void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) +{ + struct hv_vp_assist_page *hvp; + + hvp = hv_vp_assist_page[smp_processor_id()]; + hvp->vtl_ret_x64rax = vtl0->rax; + hvp->vtl_ret_x64rcx = vtl0->rcx; + + kernel_fpu_begin_mask(0); + fxrstor(&vtl0->fx_state); + __mshv_vtl_return_call(vtl0); + fxsave(&vtl0->fx_state); + kernel_fpu_end(); +} +EXPORT_SYMBOL(mshv_vtl_return_call); diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c index 42c70d28ef27..c3ba12b1bc07 100644 --- a/arch/x86/hyperv/irqdomain.c +++ b/arch/x86/hyperv/irqdomain.c @@ -10,6 +10,8 @@ #include <linux/pci.h> #include <linux/irq.h> +#include <linux/export.h> +#include <linux/irqchip/irq-msi-lib.h> #include <asm/mshyperv.h> static int hv_map_interrupt(union hv_device_id device_id, bool level, @@ -46,7 +48,7 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level, if (nr_bank < 0) { local_irq_restore(flags); pr_err("%s: unable to generate VP set\n", __func__); - return EINVAL; + return -EINVAL; } intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; @@ -64,9 +66,9 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level, local_irq_restore(flags); if (!hv_result_success(status)) - pr_err("%s: hypercall failed, status %lld\n", __func__, status); + hv_status_err(status, "\n"); - return hv_result(status); + return hv_result_to_errno(status); } static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry) @@ -88,7 +90,10 @@ static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry) status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL); local_irq_restore(flags); - return hv_result(status); + if (!hv_result_success(status)) + hv_status_err(status, "\n"); + + return hv_result_to_errno(status); } #ifdef CONFIG_PCI_MSI @@ -169,13 +174,34 @@ static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev) return dev_id; } -static int hv_map_msi_interrupt(struct pci_dev *dev, int cpu, int vector, - struct hv_interrupt_entry *entry) +/** + * hv_map_msi_interrupt() - "Map" the MSI IRQ in the hypervisor. + * @data: Describes the IRQ + * @out_entry: Hypervisor (MSI) interrupt entry (can be NULL) + * + * Map the IRQ in the hypervisor by issuing a MAP_DEVICE_INTERRUPT hypercall. 
+ * + * Return: 0 on success, -errno on failure + */ +int hv_map_msi_interrupt(struct irq_data *data, + struct hv_interrupt_entry *out_entry) { - union hv_device_id device_id = hv_build_pci_dev_id(dev); + struct irq_cfg *cfg = irqd_cfg(data); + struct hv_interrupt_entry dummy; + union hv_device_id device_id; + struct msi_desc *msidesc; + struct pci_dev *dev; + int cpu; - return hv_map_interrupt(device_id, false, cpu, vector, entry); + msidesc = irq_data_get_msi_desc(data); + dev = msi_desc_to_pci_dev(msidesc); + device_id = hv_build_pci_dev_id(dev); + cpu = cpumask_first(irq_data_get_effective_affinity_mask(data)); + + return hv_map_interrupt(device_id, false, cpu, cfg->vector, + out_entry ? out_entry : &dummy); } +EXPORT_SYMBOL_GPL(hv_map_msi_interrupt); static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg) { @@ -188,13 +214,11 @@ static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry); static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { + struct hv_interrupt_entry *stored_entry; + struct irq_cfg *cfg = irqd_cfg(data); struct msi_desc *msidesc; struct pci_dev *dev; - struct hv_interrupt_entry out_entry, *stored_entry; - struct irq_cfg *cfg = irqd_cfg(data); - const cpumask_t *affinity; - int cpu; - u64 status; + int ret; msidesc = irq_data_get_msi_desc(data); dev = msi_desc_to_pci_dev(msidesc); @@ -204,29 +228,24 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return; } - affinity = irq_data_get_effective_affinity_mask(data); - cpu = cpumask_first_and(affinity, cpu_online_mask); - if (data->chip_data) { /* * This interrupt is already mapped. Let's unmap first. * * We don't use retarget interrupt hypercalls here because - * Microsoft Hypervisor doens't allow root to change the vector + * Microsoft Hypervisor doesn't allow root to change the vector * or specify VPs outside of the set that is initially used * during mapping. 
*/ stored_entry = data->chip_data; data->chip_data = NULL; - status = hv_unmap_msi_interrupt(dev, stored_entry); + ret = hv_unmap_msi_interrupt(dev, stored_entry); kfree(stored_entry); - if (status != HV_STATUS_SUCCESS) { - pr_debug("%s: failed to unmap, status %lld", __func__, status); + if (ret) return; - } } stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC); @@ -235,15 +254,14 @@ static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return; } - status = hv_map_msi_interrupt(dev, cpu, cfg->vector, &out_entry); - if (status != HV_STATUS_SUCCESS) { + ret = hv_map_msi_interrupt(data, stored_entry); + if (ret) { kfree(stored_entry); return; } - *stored_entry = out_entry; data->chip_data = stored_entry; - entry_to_msi_msg(&out_entry, msg); + entry_to_msi_msg(data->chip_data, msg); return; } @@ -257,7 +275,6 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd) { struct hv_interrupt_entry old_entry; struct msi_msg msg; - u64 status; if (!irqd->chip_data) { pr_debug("%s: no chip data\n!", __func__); @@ -270,26 +287,7 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd) kfree(irqd->chip_data); irqd->chip_data = NULL; - status = hv_unmap_msi_interrupt(dev, &old_entry); - - if (status != HV_STATUS_SUCCESS) - pr_err("%s: hypercall failed, status %lld\n", __func__, status); -} - -static void hv_msi_free_irq(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq) -{ - struct irq_data *irqd = irq_get_irq_data(virq); - struct msi_desc *desc; - - if (!irqd) - return; - - desc = irq_data_get_msi_desc(irqd); - if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev))) - return; - - hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd); + (void)hv_unmap_msi_interrupt(dev, &old_entry); } /* @@ -298,37 +296,93 @@ static void hv_msi_free_irq(struct irq_domain *domain, */ static struct irq_chip hv_pci_msi_controller = { .name = "HV-PCI-MSI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, - .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = hv_irq_compose_msi_msg, - .irq_set_affinity = msi_domain_set_affinity, - .flags = IRQCHIP_SKIP_SET_WAKE, + .irq_set_affinity = irq_chip_set_affinity_parent, }; -static struct msi_domain_ops pci_msi_domain_ops = { - .msi_free = hv_msi_free_irq, - .msi_prepare = pci_msi_prepare, +static bool hv_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) +{ + struct irq_chip *chip = info->chip; + + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) + return false; + + chip->flags |= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED; + + info->ops->msi_prepare = pci_msi_prepare; + + return true; +} + +#define HV_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX) +#define HV_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS) + +static struct msi_parent_ops hv_msi_parent_ops = { + .supported_flags = HV_MSI_FLAGS_SUPPORTED, + .required_flags = HV_MSI_FLAGS_REQUIRED, + .bus_select_token = DOMAIN_BUS_NEXUS, + .bus_select_mask = MATCH_PCI_MSI, + .chip_flags = MSI_CHIP_FLAG_SET_ACK, + .prefix = "HV-", + .init_dev_msi_info = hv_init_dev_msi_info, }; -static struct msi_domain_info hv_pci_msi_domain_info = { - .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_PCI_MSIX, - .ops = &pci_msi_domain_ops, - .chip = &hv_pci_msi_controller, - .handler = handle_edge_irq, - 
.handler_name = "edge", +static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs, + void *arg) +{ + /* + * TODO: The allocation bits of hv_irq_compose_msi_msg(), i.e. everything except + * entry_to_msi_msg() should be in here. + */ + + int ret; + + ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, arg); + if (ret) + return ret; + + for (int i = 0; i < nr_irqs; ++i) { + irq_domain_set_info(d, virq + i, 0, &hv_pci_msi_controller, NULL, + handle_edge_irq, NULL, "edge"); + } + return 0; +} + +static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs) +{ + for (int i = 0; i < nr_irqs; ++i) { + struct irq_data *irqd = irq_domain_get_irq_data(d, virq); + struct msi_desc *desc; + + desc = irq_data_get_msi_desc(irqd); + if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev))) + continue; + + hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd); + } + irq_domain_free_irqs_top(d, virq, nr_irqs); +} + +static const struct irq_domain_ops hv_msi_domain_ops = { + .select = msi_lib_irq_domain_select, + .alloc = hv_msi_domain_alloc, + .free = hv_msi_domain_free, }; struct irq_domain * __init hv_create_pci_msi_domain(void) { struct irq_domain *d = NULL; - struct fwnode_handle *fn; - fn = irq_domain_alloc_named_fwnode("HV-PCI-MSI"); - if (fn) - d = pci_msi_create_irq_domain(fn, &hv_pci_msi_domain_info, x86_vector_domain); + struct irq_domain_info info = { + .fwnode = irq_domain_alloc_named_fwnode("HV-PCI-MSI"), + .ops = &hv_msi_domain_ops, + .parent = x86_vector_domain, + }; + + if (info.fwnode) + d = msi_create_parent_irq_domain(&info, &hv_msi_parent_ops); /* No point in going further if we can't get an irq domain */ BUG_ON(!d); diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 28be6df88063..651771534cae 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -7,17 +7,25 @@ */ #include <linux/bitfield.h> -#include <linux/hyperv.h> #include <linux/types.h> #include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/export.h> #include <asm/svm.h> #include <asm/sev.h> #include <asm/io.h> #include <asm/coco.h> #include <asm/mem_encrypt.h> +#include <asm/set_memory.h> #include <asm/mshyperv.h> #include <asm/hypervisor.h> #include <asm/mtrr.h> +#include <asm/io_apic.h> +#include <asm/realmode.h> +#include <asm/e820/api.h> +#include <asm/desc.h> +#include <asm/msr.h> +#include <uapi/asm/vmx.h> #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -56,8 +64,10 @@ union hv_ghcb { } hypercall; } __packed __aligned(HV_HYP_PAGE_SIZE); +/* Only used in an SNP VM with the paravisor */ static u16 hv_ghcb_version __ro_after_init; +/* Functions only used in an SNP VM with the paravisor go here. */ u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size) { union hv_ghcb *hv_ghcb; @@ -103,12 +113,12 @@ u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size) static inline u64 rd_ghcb_msr(void) { - return __rdmsr(MSR_AMD64_SEV_ES_GHCB); + return native_rdmsrq(MSR_AMD64_SEV_ES_GHCB); } static inline void wr_ghcb_msr(u64 val) { - native_wrmsrl(MSR_AMD64_SEV_ES_GHCB, val); + native_wrmsrq(MSR_AMD64_SEV_ES_GHCB, val); } static enum es_result hv_ghcb_hv_call(struct ghcb *ghcb, u64 exit_code, @@ -137,7 +147,7 @@ void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason) /* Tell the hypervisor what went wrong. 
*/ val |= GHCB_SEV_TERM_REASON(set, reason); - /* Request Guest Termination from Hypvervisor */ + /* Request Guest Termination from Hypervisor */ wr_ghcb_msr(val); VMGEXIT(); @@ -175,7 +185,7 @@ bool hv_ghcb_negotiate_protocol(void) return true; } -void hv_ghcb_msr_write(u64 msr, u64 value) +static void hv_ghcb_msr_write(u64 msr, u64 value) { union hv_ghcb *hv_ghcb; void **ghcb_base; @@ -203,9 +213,8 @@ void hv_ghcb_msr_write(u64 msr, u64 value) local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(hv_ghcb_msr_write); -void hv_ghcb_msr_read(u64 msr, u64 *value) +static void hv_ghcb_msr_read(u64 msr, u64 *value) { union hv_ghcb *hv_ghcb; void **ghcb_base; @@ -235,7 +244,427 @@ void hv_ghcb_msr_read(u64 msr, u64 *value) | ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(hv_ghcb_msr_read); + +/* Only used in a fully enlightened SNP VM, i.e. without the paravisor */ +static u8 ap_start_input_arg[PAGE_SIZE] __bss_decrypted __aligned(PAGE_SIZE); +static u8 ap_start_stack[PAGE_SIZE] __aligned(PAGE_SIZE); +static DEFINE_PER_CPU(struct sev_es_save_area *, hv_sev_vmsa); + +/* Functions only used in an SNP VM without the paravisor go here. */ + +#define hv_populate_vmcb_seg(seg, gdtr_base) \ +do { \ + if (seg.selector) { \ + seg.base = 0; \ + seg.limit = HV_AP_SEGMENT_LIMIT; \ + seg.attrib = *(u16 *)(gdtr_base + seg.selector + 5); \ + seg.attrib = (seg.attrib & 0xFF) | ((seg.attrib >> 4) & 0xF00); \ + } \ +} while (0) \ + +static int snp_set_vmsa(void *va, bool vmsa) +{ + u64 attrs; + + /* + * Running at VMPL0 allows the kernel to change the VMSA bit for a page + * using the RMPADJUST instruction. However, for the instruction to + * succeed it must target the permissions of a lesser privileged + * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST + * instruction in the AMD64 APM Volume 3). 
+ */ + attrs = 1; + if (vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) +{ + int err; + + err = snp_set_vmsa(vmsa, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + +int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu) +{ + struct sev_es_save_area *vmsa = (struct sev_es_save_area *) + __get_free_page(GFP_KERNEL | __GFP_ZERO); + struct sev_es_save_area *cur_vmsa; + struct desc_ptr gdtr; + u64 ret, retry = 5; + struct hv_enable_vp_vtl *start_vp_input; + unsigned long flags; + int vp_index; + + if (!vmsa) + return -ENOMEM; + + /* Find the Hyper-V VP index which might be not the same as APIC ID */ + vp_index = hv_apicid_to_vp_index(apic_id); + if (vp_index < 0 || vp_index > ms_hyperv.max_vp_index) + return -EINVAL; + + native_store_gdt(&gdtr); + + vmsa->gdtr.base = gdtr.address; + vmsa->gdtr.limit = gdtr.size; + + asm volatile("movl %%es, %%eax;" : "=a" (vmsa->es.selector)); + hv_populate_vmcb_seg(vmsa->es, vmsa->gdtr.base); + + asm volatile("movl %%cs, %%eax;" : "=a" (vmsa->cs.selector)); + hv_populate_vmcb_seg(vmsa->cs, vmsa->gdtr.base); + + asm volatile("movl %%ss, %%eax;" : "=a" (vmsa->ss.selector)); + hv_populate_vmcb_seg(vmsa->ss, vmsa->gdtr.base); + + asm volatile("movl %%ds, %%eax;" : "=a" (vmsa->ds.selector)); + hv_populate_vmcb_seg(vmsa->ds, vmsa->gdtr.base); + + vmsa->efer = native_read_msr(MSR_EFER); + + vmsa->cr4 = native_read_cr4(); + vmsa->cr3 = __native_read_cr3(); + vmsa->cr0 = native_read_cr0(); + + vmsa->xcr0 = 1; + vmsa->g_pat = HV_AP_INIT_GPAT_DEFAULT; + vmsa->rip = (u64)secondary_startup_64_no_verify; + vmsa->rsp = (u64)&ap_start_stack[PAGE_SIZE]; + + /* + * Set the SNP-specific fields for this VMSA: + * VMPL level + * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) + */ + vmsa->vmpl = 0; + vmsa->sev_features = sev_status >> 2; + + ret = snp_set_vmsa(vmsa, true); + if (ret) { + pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret); + free_page((u64)vmsa); + return ret; + } + + local_irq_save(flags); + start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg; + memset(start_vp_input, 0, sizeof(*start_vp_input)); + start_vp_input->partition_id = -1; + start_vp_input->vp_index = vp_index; + start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl; + *(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1; + + do { + ret = hv_do_hypercall(HVCALL_START_VP, + start_vp_input, NULL); + } while (hv_result(ret) == HV_STATUS_TIME_OUT && retry--); + + local_irq_restore(flags); + + if (!hv_result_success(ret)) { + pr_err("HvCallStartVirtualProcessor failed: %llx\n", ret); + snp_cleanup_vmsa(vmsa); + vmsa = NULL; + } + + cur_vmsa = per_cpu(hv_sev_vmsa, cpu); + /* Free up any previous VMSA page */ + if (cur_vmsa) + snp_cleanup_vmsa(cur_vmsa); + + /* Record the current VMSA page */ + per_cpu(hv_sev_vmsa, cpu) = vmsa; + + return ret; +} + +u64 hv_snp_hypercall(u64 control, u64 param1, u64 param2) +{ + u64 hv_status; + + register u64 __r8 asm("r8") = param2; + asm volatile("vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (param1), "+r" (__r8) + : : "cc", "memory", "r9", "r10", "r11"); + + return hv_status; +} + +#else +static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} +static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} +u64 hv_snp_hypercall(u64 control, u64 param1, u64 param2) { return U64_MAX; } +#endif /* 
CONFIG_AMD_MEM_ENCRYPT */ + +#ifdef CONFIG_INTEL_TDX_GUEST +static void hv_tdx_msr_write(u64 msr, u64 val) +{ + struct tdx_module_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_MSR_WRITE, + .r12 = msr, + .r13 = val, + }; + + u64 ret = __tdx_hypercall(&args); + + WARN_ONCE(ret, "Failed to emulate MSR write: %lld\n", ret); +} + +static void hv_tdx_msr_read(u64 msr, u64 *val) +{ + struct tdx_module_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_MSR_READ, + .r12 = msr, + }; + + u64 ret = __tdx_hypercall(&args); + + if (WARN_ONCE(ret, "Failed to emulate MSR read: %lld\n", ret)) + *val = 0; + else + *val = args.r11; +} + +u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2) +{ + struct tdx_module_args args = { }; + + args.r10 = control; + args.rdx = param1; + args.r8 = param2; + + (void)__tdx_hypercall(&args); + + return args.r11; +} + +#else +static inline void hv_tdx_msr_write(u64 msr, u64 value) {} +static inline void hv_tdx_msr_read(u64 msr, u64 *value) {} +u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2) { return U64_MAX; } +#endif /* CONFIG_INTEL_TDX_GUEST */ + +#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) +void hv_ivm_msr_write(u64 msr, u64 value) +{ + if (!ms_hyperv.paravisor_present) + return; + + if (hv_isolation_type_tdx()) + hv_tdx_msr_write(msr, value); + else if (hv_isolation_type_snp()) + hv_ghcb_msr_write(msr, value); +} + +void hv_ivm_msr_read(u64 msr, u64 *value) +{ + if (!ms_hyperv.paravisor_present) + return; + + if (hv_isolation_type_tdx()) + hv_tdx_msr_read(msr, value); + else if (hv_isolation_type_snp()) + hv_ghcb_msr_read(msr, value); +} + +/* + * Keep track of the PFN regions which were shared with the host. The access + * must be revoked upon kexec/kdump (see hv_ivm_clear_host_access()). + */ +struct hv_enc_pfn_region { + struct list_head list; + u64 pfn; + int count; +}; + +static LIST_HEAD(hv_list_enc); +static DEFINE_RAW_SPINLOCK(hv_list_enc_lock); + +static int hv_list_enc_add(const u64 *pfn_list, int count) +{ + struct hv_enc_pfn_region *ent; + unsigned long flags; + u64 pfn; + int i; + + for (i = 0; i < count; i++) { + pfn = pfn_list[i]; + + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); + /* Check if the PFN already exists in some region first */ + list_for_each_entry(ent, &hv_list_enc, list) { + if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 >= pfn)) + /* Nothing to do - pfn is already in the list */ + goto unlock_done; + } + + /* + * Check if the PFN is adjacent to an existing region. Growing + * a region can make it adjacent to another one but merging is + * not (yet) implemented for simplicity. A PFN cannot be added + * to two regions to keep the logic in hv_list_enc_remove() + * correct. 
+ */ + list_for_each_entry(ent, &hv_list_enc, list) { + if (ent->pfn + ent->count == pfn) { + /* Grow existing region up */ + ent->count++; + goto unlock_done; + } else if (pfn + 1 == ent->pfn) { + /* Grow existing region down */ + ent->pfn--; + ent->count++; + goto unlock_done; + } + } + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + + /* No adjacent region found -- create a new one */ + ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL); + if (!ent) + return -ENOMEM; + + ent->pfn = pfn; + ent->count = 1; + + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); + list_add(&ent->list, &hv_list_enc); + +unlock_done: + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + } + + return 0; +} + +static int hv_list_enc_remove(const u64 *pfn_list, int count) +{ + struct hv_enc_pfn_region *ent, *t; + struct hv_enc_pfn_region new_region; + unsigned long flags; + u64 pfn; + int i; + + for (i = 0; i < count; i++) { + pfn = pfn_list[i]; + + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); + list_for_each_entry_safe(ent, t, &hv_list_enc, list) { + if (pfn == ent->pfn + ent->count - 1) { + /* Removing tail pfn */ + ent->count--; + if (!ent->count) { + list_del(&ent->list); + kfree(ent); + } + goto unlock_done; + } else if (pfn == ent->pfn) { + /* Removing head pfn */ + ent->count--; + ent->pfn++; + if (!ent->count) { + list_del(&ent->list); + kfree(ent); + } + goto unlock_done; + } else if (pfn > ent->pfn && pfn < ent->pfn + ent->count - 1) { + /* + * Removing a pfn in the middle. Cut off the tail + * of the existing region and create a template for + * the new one. + */ + new_region.pfn = pfn + 1; + new_region.count = ent->count - (pfn - ent->pfn + 1); + ent->count = pfn - ent->pfn; + goto unlock_split; + } + + } +unlock_done: + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + continue; + +unlock_split: + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + + ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL); + if (!ent) + return -ENOMEM; + + ent->pfn = new_region.pfn; + ent->count = new_region.count; + + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); + list_add(&ent->list, &hv_list_enc); + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + } + + return 0; +} + +/* Stop new private<->shared conversions */ +static void hv_vtom_kexec_begin(void) +{ + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + + /* + * Crash kernel reaches here with interrupts disabled: can't wait for + * conversions to finish. + * + * If race happened, just report and proceed. 
+ */ + if (!set_memory_enc_stop_conversion()) + pr_warn("Failed to stop shared<->private conversions\n"); +} + +static void hv_vtom_kexec_finish(void) +{ + struct hv_gpa_range_for_visibility *input; + struct hv_enc_pfn_region *ent; + unsigned long flags; + u64 hv_status; + int cur, i; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + if (unlikely(!input)) + goto out; + + list_for_each_entry(ent, &hv_list_enc, list) { + for (i = 0, cur = 0; i < ent->count; i++) { + input->gpa_page_list[cur] = ent->pfn + i; + cur++; + + if (cur == HV_MAX_MODIFY_GPA_REP_COUNT || i == ent->count - 1) { + input->partition_id = HV_PARTITION_ID_SELF; + input->host_visibility = VMBUS_PAGE_NOT_VISIBLE; + input->reserved0 = 0; + input->reserved1 = 0; + hv_status = hv_do_rep_hypercall( + HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, + cur, 0, input, NULL); + WARN_ON_ONCE(!hv_result_success(hv_status)); + cur = 0; + } + } + + } + +out: + local_irq_restore(flags); +} /* * hv_mark_gpa_visibility - Set pages visible to host via hvcall. @@ -248,9 +677,9 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], enum hv_mem_host_visibility visibility) { struct hv_gpa_range_for_visibility *input; - u16 pages_processed; u64 hv_status; unsigned long flags; + int ret; /* no-op if partition isolation is not enabled */ if (!hv_is_isolation_supported()) @@ -262,6 +691,13 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], return -EINVAL; } + if (visibility == VMBUS_PAGE_NOT_VISIBLE) + ret = hv_list_enc_remove(pfn, count); + else + ret = hv_list_enc_add(pfn, count); + if (ret) + return ret; + local_irq_save(flags); input = *this_cpu_ptr(hyperv_pcpu_input_arg); @@ -277,13 +713,48 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], memcpy((void *)input->gpa_page_list, pfn, count * sizeof(*pfn)); hv_status = hv_do_rep_hypercall( HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, count, - 0, input, &pages_processed); + 0, input, NULL); local_irq_restore(flags); if (hv_result_success(hv_status)) return 0; + + if (visibility == VMBUS_PAGE_NOT_VISIBLE) + ret = hv_list_enc_add(pfn, count); else - return -EFAULT; + ret = hv_list_enc_remove(pfn, count); + /* + * There's no good way to recover from -ENOMEM here, the accounting is + * wrong either way. + */ + WARN_ON_ONCE(ret); + + return -EFAULT; +} + +/* + * When transitioning memory between encrypted and decrypted, the caller + * of set_memory_encrypted() or set_memory_decrypted() is responsible for + * ensuring that the memory isn't in use and isn't referenced while the + * transition is in progress. The transition has multiple steps, and the + * memory is in an inconsistent state until all steps are complete. A + * reference while the state is inconsistent could result in an exception + * that can't be cleanly fixed up. + * + * But the Linux kernel load_unaligned_zeropad() mechanism could cause a + * stray reference that can't be prevented by the caller, so Linux has + * specific code to handle this case. But when the #VC and #VE exceptions + * routed to a paravisor, the specific code doesn't work. To avoid this + * problem, mark the pages as "not present" while the transition is in + * progress. If load_unaligned_zeropad() causes a stray reference, a normal + * page fault is generated instead of #VC or #VE, and the page-fault-based + * handlers for load_unaligned_zeropad() resolve the reference. When the + * transition is complete, hv_vtom_set_host_visibility() marks the pages + * as "present" again. 
+ */ +static int hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc) +{ + return set_memory_np(kbuffer, pagecount); } /* @@ -294,42 +765,68 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], * with host. This function works as wrap of hv_mark_gpa_visibility() * with memory base and size. */ -static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc) +static int hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc) { enum hv_mem_host_visibility visibility = enc ? VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE; u64 *pfn_array; + phys_addr_t paddr; + int i, pfn, err; + void *vaddr; int ret = 0; - bool result = true; - int i, pfn; pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); - if (!pfn_array) - return false; + if (!pfn_array) { + ret = -ENOMEM; + goto err_set_memory_p; + } for (i = 0, pfn = 0; i < pagecount; i++) { - pfn_array[pfn] = virt_to_hvpfn((void *)kbuffer + i * HV_HYP_PAGE_SIZE); + /* + * Use slow_virt_to_phys() because the PRESENT bit has been + * temporarily cleared in the PTEs. slow_virt_to_phys() works + * without the PRESENT bit while virt_to_hvpfn() or similar + * does not. + */ + vaddr = (void *)kbuffer + (i * HV_HYP_PAGE_SIZE); + paddr = slow_virt_to_phys(vaddr); + pfn_array[pfn] = paddr >> HV_HYP_PAGE_SHIFT; pfn++; if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) { ret = hv_mark_gpa_visibility(pfn, pfn_array, visibility); - if (ret) { - result = false; + if (ret) goto err_free_pfn_array; - } pfn = 0; } } - err_free_pfn_array: +err_free_pfn_array: kfree(pfn_array); - return result; + +err_set_memory_p: + /* + * Set the PTE PRESENT bits again to revert what hv_vtom_clear_present() + * did. Do this even if there is an error earlier in this function in + * order to avoid leaving the memory range in a "broken" state. Setting + * the PRESENT bits shouldn't fail, but return an error if it does. + */ + err = set_memory_p(kbuffer, pagecount); + if (err && !ret) + ret = err; + + return ret; } static bool hv_vtom_tlb_flush_required(bool private) { - return true; + /* + * Since hv_vtom_clear_present() marks the PTEs as "not present" + * and flushes the TLB, they can't be in the TLB. That makes the + * flush controlled by this function redundant, so return "false". + */ + return false; } static bool hv_vtom_cache_flush_required(void) @@ -358,26 +855,50 @@ static bool hv_is_private_mmio(u64 addr) void __init hv_vtom_init(void) { + enum hv_isolation_type type = hv_get_isolation_type(); + + switch (type) { + case HV_ISOLATION_TYPE_VBS: + fallthrough; /* * By design, a VM using vTOM doesn't see the SEV setting, * so SEV initialization is bypassed and sev_status isn't set. * Set it here to indicate a vTOM VM. + * + * Note: if CONFIG_AMD_MEM_ENCRYPT is not set, sev_status is + * defined as 0ULL, to which we can't assigned a value. 
*/ - sev_status = MSR_AMD64_SNP_VTOM; - cc_vendor = CC_VENDOR_AMD; +#ifdef CONFIG_AMD_MEM_ENCRYPT + case HV_ISOLATION_TYPE_SNP: + sev_status = MSR_AMD64_SNP_VTOM; + cc_vendor = CC_VENDOR_AMD; + break; +#endif + + case HV_ISOLATION_TYPE_TDX: + cc_vendor = CC_VENDOR_INTEL; + break; + + default: + panic("hv_vtom_init: unsupported isolation type %d\n", type); + } + cc_set_mask(ms_hyperv.shared_gpa_boundary); physical_mask &= ms_hyperv.shared_gpa_boundary - 1; x86_platform.hyper.is_private_mmio = hv_is_private_mmio; x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required; x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required; + x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present; x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility; + x86_platform.guest.enc_kexec_begin = hv_vtom_kexec_begin; + x86_platform.guest.enc_kexec_finish = hv_vtom_kexec_finish; /* Set WB as the default cache mode. */ - mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); } -#endif /* CONFIG_AMD_MEM_ENCRYPT */ +#endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */ enum hv_isolation_type hv_get_isolation_type(void) { @@ -405,10 +926,20 @@ bool hv_is_isolation_supported(void) DEFINE_STATIC_KEY_FALSE(isolation_type_snp); /* - * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based + * hv_isolation_type_snp - Check if the system runs in an AMD SEV-SNP based * isolation VM. */ bool hv_isolation_type_snp(void) { return static_branch_unlikely(&isolation_type_snp); } + +DEFINE_STATIC_KEY_FALSE(isolation_type_tdx); +/* + * hv_isolation_type_tdx - Check if the system runs in an Intel TDX based + * isolated VM. + */ +bool hv_isolation_type_tdx(void) +{ + return static_branch_unlikely(&isolation_type_tdx); +} diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index 1cc113200ff5..cfcb60468b01 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -1,6 +1,5 @@ #define pr_fmt(fmt) "Hyper-V: " fmt -#include <linux/hyperv.h> #include <linux/log2.h> #include <linux/slab.h> #include <linux/types.h> @@ -206,6 +205,10 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, /* * We can flush not more than max_gvas with one hypercall. Flush the * whole address space if we were asked to do more. + * + * For these hypercalls, Hyper-V treats the valid_bank_mask field + * of flush->hv_vp_set as part of the fixed size input header. + * So the variable input header size is equal to nr_bank. */ max_gvas = (PAGE_SIZE - sizeof(*flush) - nr_bank * @@ -240,5 +243,4 @@ void hyperv_setup_mmu_ops(void) pr_info("Using hypercall for remote TLB flush\n"); pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; } diff --git a/arch/x86/hyperv/mshv-asm-offsets.c b/arch/x86/hyperv/mshv-asm-offsets.c new file mode 100644 index 000000000000..882c1db6df16 --- /dev/null +++ b/arch/x86/hyperv/mshv-asm-offsets.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + * + * Copyright (c) 2025, Microsoft Corporation. 
+ * + * Author: + * Naman Jain <namjain@microsoft.com> + */ +#define COMPILE_OFFSETS + +#include <linux/kbuild.h> +#include <asm/mshyperv.h> + +static void __used common(void) +{ + if (IS_ENABLED(CONFIG_HYPERV_VTL_MODE)) { + OFFSET(MSHV_VTL_CPU_CONTEXT_rax, mshv_vtl_cpu_context, rax); + OFFSET(MSHV_VTL_CPU_CONTEXT_rcx, mshv_vtl_cpu_context, rcx); + OFFSET(MSHV_VTL_CPU_CONTEXT_rdx, mshv_vtl_cpu_context, rdx); + OFFSET(MSHV_VTL_CPU_CONTEXT_rbx, mshv_vtl_cpu_context, rbx); + OFFSET(MSHV_VTL_CPU_CONTEXT_rbp, mshv_vtl_cpu_context, rbp); + OFFSET(MSHV_VTL_CPU_CONTEXT_rsi, mshv_vtl_cpu_context, rsi); + OFFSET(MSHV_VTL_CPU_CONTEXT_rdi, mshv_vtl_cpu_context, rdi); + OFFSET(MSHV_VTL_CPU_CONTEXT_r8, mshv_vtl_cpu_context, r8); + OFFSET(MSHV_VTL_CPU_CONTEXT_r9, mshv_vtl_cpu_context, r9); + OFFSET(MSHV_VTL_CPU_CONTEXT_r10, mshv_vtl_cpu_context, r10); + OFFSET(MSHV_VTL_CPU_CONTEXT_r11, mshv_vtl_cpu_context, r11); + OFFSET(MSHV_VTL_CPU_CONTEXT_r12, mshv_vtl_cpu_context, r12); + OFFSET(MSHV_VTL_CPU_CONTEXT_r13, mshv_vtl_cpu_context, r13); + OFFSET(MSHV_VTL_CPU_CONTEXT_r14, mshv_vtl_cpu_context, r14); + OFFSET(MSHV_VTL_CPU_CONTEXT_r15, mshv_vtl_cpu_context, r15); + OFFSET(MSHV_VTL_CPU_CONTEXT_cr2, mshv_vtl_cpu_context, cr2); + } +} diff --git a/arch/x86/hyperv/mshv_vtl_asm.S b/arch/x86/hyperv/mshv_vtl_asm.S new file mode 100644 index 000000000000..f595eefad9ab --- /dev/null +++ b/arch/x86/hyperv/mshv_vtl_asm.S @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Assembly-level code for mshv_vtl VTL transition + * + * Copyright (c) 2025, Microsoft Corporation. + * + * Author: + * Naman Jain <namjain@microsoft.com> + */ + +#include <linux/linkage.h> +#include <linux/static_call_types.h> +#include <asm/asm.h> +#include <asm/asm-offsets.h> +#include <asm/frame.h> +#include "mshv-asm-offsets.h" + + .text + .section .noinstr.text, "ax" +/* + * void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) + * + * This function is used to context-switch between different Virtual Trust Levels. + * It is marked as 'noinstr' to protect it against instrumentation and debugging facilities. + * NMIs aren't a problem because the NMI handler saves/restores CR2 specifically to guard + * against #PFs in NMI context clobbering the guest state. 
+ */ +SYM_FUNC_START(__mshv_vtl_return_call) + /* Push callee save registers */ + pushq %rbp + mov %rsp, %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushq %rbx + + /* register switch to VTL0 clobbers all registers except rax/rcx */ + mov %_ASM_ARG1, %rax + + /* grab rbx/rbp/rsi/rdi/r8-r15 */ + mov MSHV_VTL_CPU_CONTEXT_rbx(%rax), %rbx + mov MSHV_VTL_CPU_CONTEXT_rbp(%rax), %rbp + mov MSHV_VTL_CPU_CONTEXT_rsi(%rax), %rsi + mov MSHV_VTL_CPU_CONTEXT_rdi(%rax), %rdi + mov MSHV_VTL_CPU_CONTEXT_r8(%rax), %r8 + mov MSHV_VTL_CPU_CONTEXT_r9(%rax), %r9 + mov MSHV_VTL_CPU_CONTEXT_r10(%rax), %r10 + mov MSHV_VTL_CPU_CONTEXT_r11(%rax), %r11 + mov MSHV_VTL_CPU_CONTEXT_r12(%rax), %r12 + mov MSHV_VTL_CPU_CONTEXT_r13(%rax), %r13 + mov MSHV_VTL_CPU_CONTEXT_r14(%rax), %r14 + mov MSHV_VTL_CPU_CONTEXT_r15(%rax), %r15 + + mov MSHV_VTL_CPU_CONTEXT_cr2(%rax), %rdx + mov %rdx, %cr2 + mov MSHV_VTL_CPU_CONTEXT_rdx(%rax), %rdx + + /* stash host registers on stack */ + pushq %rax + pushq %rcx + + xor %ecx, %ecx + + /* make a hypercall to switch VTL */ + call STATIC_CALL_TRAMP_STR(__mshv_vtl_return_hypercall) + + /* stash guest registers on stack, restore saved host copies */ + pushq %rax + pushq %rcx + mov 16(%rsp), %rcx + mov 24(%rsp), %rax + + mov %rdx, MSHV_VTL_CPU_CONTEXT_rdx(%rax) + mov %cr2, %rdx + mov %rdx, MSHV_VTL_CPU_CONTEXT_cr2(%rax) + pop MSHV_VTL_CPU_CONTEXT_rcx(%rax) + pop MSHV_VTL_CPU_CONTEXT_rax(%rax) + add $16, %rsp + + /* save rbx/rbp/rsi/rdi/r8-r15 */ + mov %rbx, MSHV_VTL_CPU_CONTEXT_rbx(%rax) + mov %rbp, MSHV_VTL_CPU_CONTEXT_rbp(%rax) + mov %rsi, MSHV_VTL_CPU_CONTEXT_rsi(%rax) + mov %rdi, MSHV_VTL_CPU_CONTEXT_rdi(%rax) + mov %r8, MSHV_VTL_CPU_CONTEXT_r8(%rax) + mov %r9, MSHV_VTL_CPU_CONTEXT_r9(%rax) + mov %r10, MSHV_VTL_CPU_CONTEXT_r10(%rax) + mov %r11, MSHV_VTL_CPU_CONTEXT_r11(%rax) + mov %r12, MSHV_VTL_CPU_CONTEXT_r12(%rax) + mov %r13, MSHV_VTL_CPU_CONTEXT_r13(%rax) + mov %r14, MSHV_VTL_CPU_CONTEXT_r14(%rax) + mov %r15, MSHV_VTL_CPU_CONTEXT_r15(%rax) + + /* pop callee-save registers r12-r15, rbx */ + pop %rbx + pop %r15 + pop %r14 + pop %r13 + pop %r12 + + pop %rbp + RET +SYM_FUNC_END(__mshv_vtl_return_call) +/* + * Make sure that static_call_key symbol: __SCK____mshv_vtl_return_hypercall is accessible here. + * Below code is inspired from __ADDRESSABLE(sym) macro. Symbol name is kept simple, to avoid + * naming it something like "__UNIQUE_ID_addressable___SCK____mshv_vtl_return_hypercall_662.0" + * which would otherwise have been generated by the macro. + */ + .section .discard.addressable,"aw" + .align 8 + .type mshv_vtl_return_sym, @object + .size mshv_vtl_return_sym, 8 +mshv_vtl_return_sym: + .quad __SCK____mshv_vtl_return_hypercall diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c index 9dc259fa322e..8ccbb7c4fc27 100644 --- a/arch/x86/hyperv/nested.c +++ b/arch/x86/hyperv/nested.c @@ -11,7 +11,8 @@ #include <linux/types.h> -#include <asm/hyperv-tlfs.h> +#include <linux/export.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> #include <asm/tlbflush.h> |
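The PFN-region bookkeeping added to ivm.c above (hv_list_enc_add()) either grows an adjacent {pfn, count} region or opens a new one, and deliberately never merges two regions that become adjacent. The following is a minimal user-space sketch of that bookkeeping only; the names (struct region, region_add()) are made up for the illustration, there is no locking, and it is not the kernel code.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct region {
	struct region *next;
	uint64_t pfn;
	int count;
};

static struct region *head;

static int region_add(uint64_t pfn)
{
	struct region *r;

	/* First pass: the pfn may already be covered by some region. */
	for (r = head; r; r = r->next)
		if (pfn >= r->pfn && pfn <= r->pfn + r->count - 1)
			return 0;

	/* Second pass: grow a region that ends or starts right next to it. */
	for (r = head; r; r = r->next) {
		if (r->pfn + r->count == pfn) {	/* grow existing region up */
			r->count++;
			return 0;
		}
		if (pfn + 1 == r->pfn) {	/* grow existing region down */
			r->pfn--;
			r->count++;
			return 0;
		}
	}

	/* No adjacent region found: open a new one. */
	r = calloc(1, sizeof(*r));
	if (!r)
		return -1;
	r->pfn = pfn;
	r->count = 1;
	r->next = head;
	head = r;
	return 0;
}

int main(void)
{
	uint64_t pfns[] = { 100, 101, 99, 200 };

	for (size_t i = 0; i < sizeof(pfns) / sizeof(pfns[0]); i++)
		region_add(pfns[i]);

	/* Expect two regions: {99, 3} and {200, 1}. */
	for (struct region *r = head; r; r = r->next)
		printf("region: pfn %llu count %d\n",
		       (unsigned long long)r->pfn, r->count);
	return 0;
}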

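The revocation paths above (hv_vtom_kexec_finish() and hv_mark_gpa_visibility()) hand PFNs to the hypervisor in chunks of at most HV_MAX_MODIFY_GPA_REP_COUNT entries per rep hypercall, flushing a partial chunk when the region ends. Below is a small stand-alone sketch of just that chunking loop; BATCH_MAX and submit_batch() are stand-ins invented for the illustration, not kernel or hypervisor API.

#include <stdint.h>
#include <stdio.h>

#define BATCH_MAX 4	/* stand-in for HV_MAX_MODIFY_GPA_REP_COUNT */

static void submit_batch(const uint64_t *pfns, int n)
{
	/* Stand-in for the rep hypercall that takes 'n' PFNs at once. */
	printf("batch of %d: %llu..%llu\n", n,
	       (unsigned long long)pfns[0], (unsigned long long)pfns[n - 1]);
}

static void revoke_region(uint64_t start_pfn, int count)
{
	uint64_t batch[BATCH_MAX];
	int cur = 0;

	for (int i = 0; i < count; i++) {
		batch[cur++] = start_pfn + i;
		/* Flush on a full batch or at the end of the region. */
		if (cur == BATCH_MAX || i == count - 1) {
			submit_batch(batch, cur);
			cur = 0;
		}
	}
}

int main(void)
{
	revoke_region(0x1000, 10);	/* expect batches of 4, 4 and 2 */
	return 0;
}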