Diffstat (limited to 'arch/x86/hyperv')
 arch/x86/hyperv/Makefile           |  18
 arch/x86/hyperv/hv_apic.c          | 194
 arch/x86/hyperv/hv_crash.c         | 642
 arch/x86/hyperv/hv_init.c          | 708
 arch/x86/hyperv/hv_spinlock.c      |  24
 arch/x86/hyperv/hv_trampoline.S    | 101
 arch/x86/hyperv/hv_vtl.c           | 281
 arch/x86/hyperv/irqdomain.c        | 418
 arch/x86/hyperv/ivm.c              | 945
 arch/x86/hyperv/mmu.c              |  68
 arch/x86/hyperv/mshv-asm-offsets.c |  37
 arch/x86/hyperv/mshv_vtl_asm.S     | 116
 arch/x86/hyperv/nested.c           |  22
13 files changed, 3231 insertions(+), 343 deletions(-)
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index 1c11f9420a82..56292102af62 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,6 +1,22 @@ -obj-y := hv_init.o mmu.o nested.o +# SPDX-License-Identifier: GPL-2.0-only +obj-y := hv_init.o mmu.o nested.o irqdomain.o ivm.o obj-$(CONFIG_X86_64) += hv_apic.o +obj-$(CONFIG_HYPERV_VTL_MODE) += hv_vtl.o mshv_vtl_asm.o + +$(obj)/mshv_vtl_asm.o: $(obj)/mshv-asm-offsets.h + +$(obj)/mshv-asm-offsets.h: $(obj)/mshv-asm-offsets.s FORCE + $(call filechk,offsets,__MSHV_ASM_OFFSETS_H__) ifdef CONFIG_X86_64 obj-$(CONFIG_PARAVIRT_SPINLOCKS) += hv_spinlock.o + + ifdef CONFIG_MSHV_ROOT + CFLAGS_REMOVE_hv_trampoline.o += -pg + CFLAGS_hv_trampoline.o += -fno-stack-protector + obj-$(CONFIG_CRASH_DUMP) += hv_crash.o hv_trampoline.o + endif endif + +targets += mshv-asm-offsets.s +clean-files += mshv-asm-offsets.h diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 8eb6fbee8e13..a8de503def37 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -23,12 +23,12 @@ #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/clockchips.h> -#include <linux/hyperv.h> #include <linux/slab.h> #include <linux/cpuhotplug.h> #include <asm/hypervisor.h> #include <asm/mshyperv.h> #include <asm/apic.h> +#include <asm/msr.h> #include <asm/trace/hyperv.h> @@ -38,7 +38,7 @@ static u64 hv_apic_icr_read(void) { u64 reg_val; - rdmsrl(HV_X64_MSR_ICR, reg_val); + rdmsrq(HV_X64_MSR_ICR, reg_val); return reg_val; } @@ -46,11 +46,16 @@ static void hv_apic_icr_write(u32 low, u32 id) { u64 reg_val; - reg_val = SET_APIC_DEST_FIELD(id); + reg_val = SET_XAPIC_DEST_FIELD(id); reg_val = reg_val << 32; reg_val |= low; - wrmsrl(HV_X64_MSR_ICR, reg_val); + wrmsrq(HV_X64_MSR_ICR, reg_val); +} + +void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set) +{ + apic_update_vector(cpu, vector, set); } static u32 hv_apic_read(u32 reg) @@ -60,9 +65,11 @@ static u32 hv_apic_read(u32 reg) switch (reg) { case APIC_EOI: rdmsr(HV_X64_MSR_EOI, reg_val, hi); + (void)hi; return reg_val; case APIC_TASKPRI: rdmsr(HV_X64_MSR_TPR, reg_val, hi); + (void)hi; return reg_val; default: @@ -74,39 +81,48 @@ static void hv_apic_write(u32 reg, u32 val) { switch (reg) { case APIC_EOI: - wrmsr(HV_X64_MSR_EOI, val, 0); + wrmsrq(HV_X64_MSR_EOI, val); break; case APIC_TASKPRI: - wrmsr(HV_X64_MSR_TPR, val, 0); + wrmsrq(HV_X64_MSR_TPR, val); break; default: native_apic_mem_write(reg, val); } } -static void hv_apic_eoi_write(u32 reg, u32 val) +static void hv_apic_eoi_write(void) +{ + struct hv_vp_assist_page *hvp = hv_vp_assist_page[smp_processor_id()]; + + if (hvp && (xchg(&hvp->apic_assist, 0) & 0x1)) + return; + + wrmsrq(HV_X64_MSR_EOI, APIC_EOI_ACK); +} + +static bool cpu_is_self(int cpu) { - wrmsr(HV_X64_MSR_EOI, val, 0); + return cpu == smp_processor_id(); } /* * IPI implementation on Hyper-V. 
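In hv_apic_icr_write() above, the 64-bit synthetic ICR value is packed per the xAPIC layout: the destination APIC ID occupies bits 24-31 of the high dword (that is what SET_XAPIC_DEST_FIELD supplies), and the low dword carries the vector and delivery bits. A standalone sketch of that packing (plain userspace C; the helper name is this sketch's invention):

#include <stdint.h>
#include <stdio.h>

/* xAPIC: destination APIC ID is bits 24-31 of the high 32 bits (ICR2) */
static uint64_t hv_icr_value(uint32_t low, uint32_t dest_apic_id)
{
	uint64_t reg_val = (uint64_t)dest_apic_id << 24; /* SET_XAPIC_DEST_FIELD */

	return (reg_val << 32) | low;
}

int main(void)
{
	/* vector 0xf3, fixed delivery mode, destination APIC ID 3 */
	printf("HV_X64_MSR_ICR value = %#llx\n",
	       (unsigned long long)hv_icr_value(0xf3, 3));
	return 0;
}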
*/ -static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) +static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector, + bool exclude_self) { - struct hv_send_ipi_ex **arg; struct hv_send_ipi_ex *ipi_arg; unsigned long flags; int nr_bank = 0; - int ret = 1; + u64 status = HV_STATUS_INVALID_PARAMETER; if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) return false; local_irq_save(flags); - arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); + ipi_arg = *this_cpu_ptr(hyperv_pcpu_input_arg); - ipi_arg = *arg; if (unlikely(!ipi_arg)) goto ipi_mask_ex_done; @@ -114,38 +130,68 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) ipi_arg->reserved = 0; ipi_arg->vp_set.valid_bank_mask = 0; - if (!cpumask_equal(mask, cpu_present_mask)) { + /* + * Use HV_GENERIC_SET_ALL and avoid converting cpumask to VP_SET + * when the IPI is sent to all currently present CPUs. + */ + if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) { ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K; - nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask); - } - if (nr_bank < 0) - goto ipi_mask_ex_done; - if (!nr_bank) + + nr_bank = cpumask_to_vpset_skip(&ipi_arg->vp_set, mask, + exclude_self ? cpu_is_self : NULL); + + /* + * 'nr_bank <= 0' means some CPUs in cpumask can't be + * represented in VP_SET. Return an error and fall back to + * native (architectural) method of sending IPIs. + */ + if (nr_bank <= 0) + goto ipi_mask_ex_done; + } else { ipi_arg->vp_set.format = HV_GENERIC_SET_ALL; + } - ret = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, - ipi_arg, NULL); + /* + * For this hypercall, Hyper-V treats the valid_bank_mask field + * of ipi_arg->vp_set as part of the fixed size input header. + * So the variable input header size is equal to nr_bank. + */ + status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank, + ipi_arg, NULL); ipi_mask_ex_done: local_irq_restore(flags); - return ((ret == 0) ? true : false); + return hv_result_success(status); } -static bool __send_ipi_mask(const struct cpumask *mask, int vector) +static bool __send_ipi_mask(const struct cpumask *mask, int vector, + bool exclude_self) { - int cur_cpu, vcpu; + int cur_cpu, vcpu, this_cpu = smp_processor_id(); struct hv_send_ipi ipi_arg; - int ret = 1; + u64 status; + unsigned int weight; trace_hyperv_send_ipi_mask(mask, vector); - if (cpumask_empty(mask)) + weight = cpumask_weight(mask); + + /* + * Do nothing if + * 1. the mask is empty + * 2. the mask only contains self when exclude_self is true + */ + if (weight == 0 || + (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask))) return true; - if (!hv_hypercall_pg) - return false; + /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */ + if (!hv_hypercall_pg) { + if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx()) + return false; + } - if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR) return false; /* @@ -165,13 +211,15 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector) ipi_arg.cpu_mask = 0; for_each_cpu(cur_cpu, mask) { + if (exclude_self && cur_cpu == this_cpu) + continue; vcpu = hv_cpu_number_to_vp_number(cur_cpu); if (vcpu == VP_INVAL) return false; /* * This particular version of the IPI hypercall can - * only target upto 64 CPUs. + * only target up to 64 CPUs. 
*/ if (vcpu >= 64) goto do_ex_hypercall; @@ -179,20 +227,38 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector) __set_bit(vcpu, (unsigned long *)&ipi_arg.cpu_mask); } - ret = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector, - ipi_arg.cpu_mask); - return ((ret == 0) ? true : false); + status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector, + ipi_arg.cpu_mask); + return hv_result_success(status); do_ex_hypercall: - return __send_ipi_mask_ex(mask, vector); + return __send_ipi_mask_ex(mask, vector, exclude_self); } static bool __send_ipi_one(int cpu, int vector) { - struct cpumask mask = CPU_MASK_NONE; + int vp = hv_cpu_number_to_vp_number(cpu); + u64 status; + + trace_hyperv_send_ipi_one(cpu, vector); + + if (vp == VP_INVAL) + return false; - cpumask_set_cpu(cpu, &mask); - return __send_ipi_mask(&mask, vector); + /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */ + if (!hv_hypercall_pg) { + if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx()) + return false; + } + + if (vector < HV_IPI_LOW_VECTOR || vector > HV_IPI_HIGH_VECTOR) + return false; + + if (vp >= 64) + return __send_ipi_mask_ex(cpumask_of(cpu), vector, false); + + status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp)); + return hv_result_success(status); } static void hv_send_ipi(int cpu, int vector) @@ -203,20 +269,13 @@ static void hv_send_ipi(int cpu, int vector) static void hv_send_ipi_mask(const struct cpumask *mask, int vector) { - if (!__send_ipi_mask(mask, vector)) + if (!__send_ipi_mask(mask, vector, false)) orig_apic.send_IPI_mask(mask, vector); } static void hv_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned int this_cpu = smp_processor_id(); - struct cpumask new_mask; - const struct cpumask *local_mask; - - cpumask_copy(&new_mask, mask); - cpumask_clear_cpu(this_cpu, &new_mask); - local_mask = &new_mask; - if (!__send_ipi_mask(local_mask, vector)) + if (!__send_ipi_mask(mask, vector, true)) orig_apic.send_IPI_mask_allbutself(mask, vector); } @@ -227,7 +286,7 @@ static void hv_send_ipi_allbutself(int vector) static void hv_send_ipi_all(int vector) { - if (!__send_ipi_mask(cpu_online_mask, vector)) + if (!__send_ipi_mask(cpu_online_mask, vector, false)) orig_apic.send_IPI_all(vector); } @@ -239,6 +298,9 @@ static void hv_send_ipi_self(int vector) void __init hv_apic_init(void) { + if (cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return; + if (ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) { pr_info("Hyper-V: Using IPI hypercalls\n"); /* @@ -246,20 +308,34 @@ void __init hv_apic_init(void) */ orig_apic = *apic; - apic->send_IPI = hv_send_ipi; - apic->send_IPI_mask = hv_send_ipi_mask; - apic->send_IPI_mask_allbutself = hv_send_ipi_mask_allbutself; - apic->send_IPI_allbutself = hv_send_ipi_allbutself; - apic->send_IPI_all = hv_send_ipi_all; - apic->send_IPI_self = hv_send_ipi_self; + apic_update_callback(send_IPI, hv_send_ipi); + apic_update_callback(send_IPI_mask, hv_send_ipi_mask); + apic_update_callback(send_IPI_mask_allbutself, hv_send_ipi_mask_allbutself); + apic_update_callback(send_IPI_allbutself, hv_send_ipi_allbutself); + apic_update_callback(send_IPI_all, hv_send_ipi_all); + apic_update_callback(send_IPI_self, hv_send_ipi_self); } if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) { - pr_info("Hyper-V: Using MSR based APIC access\n"); - apic_set_eoi_write(hv_apic_eoi_write); - apic->read = hv_apic_read; - apic->write = hv_apic_write; - apic->icr_write = hv_apic_icr_write; - apic->icr_read = 
hv_apic_icr_read;
+		pr_info("Hyper-V: Using enlightened APIC (%s mode)",
+			x2apic_enabled() ? "x2apic" : "xapic");
+		/*
+		 * When in x2apic mode, don't use the Hyper-V specific APIC
+		 * accessors since the field layout in the ICR register is
+		 * different in x2apic mode. Furthermore, the architectural
+		 * x2apic MSRs function just as well as the Hyper-V
+		 * synthetic APIC MSRs, so there's no benefit in having
+		 * separate Hyper-V accessors for x2apic mode. The only
+		 * exception is hv_apic_eoi_write, because it benefits from
+		 * lazy EOI when available, but the same accessor works for
+		 * both xapic and x2apic because the field layout is the same.
+		 */
+		apic_update_callback(eoi, hv_apic_eoi_write);
+		if (!x2apic_enabled()) {
+			apic_update_callback(read, hv_apic_read);
+			apic_update_callback(write, hv_apic_write);
+			apic_update_callback(icr_write, hv_apic_icr_write);
+			apic_update_callback(icr_read, hv_apic_icr_read);
+		}
 	}
 }
diff --git a/arch/x86/hyperv/hv_crash.c b/arch/x86/hyperv/hv_crash.c
new file mode 100644
index 000000000000..c0e22921ace1
--- /dev/null
+++ b/arch/x86/hyperv/hv_crash.c
@@ -0,0 +1,642 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * X86 specific Hyper-V root partition kdump/crash support module
+ *
+ * Copyright (C) 2025, Microsoft, Inc.
+ *
+ * This module implements hypervisor RAM collection into vmcore for both
+ * the hypervisor crash and the Linux root crash cases. Hyper-V implements
+ * a disable hypercall with a 32bit protected mode ABI callback. This
+ * mechanism must be used to unlock hypervisor RAM. Since the hypervisor RAM
+ * is already mapped in Linux, it is automatically collected into the Linux
+ * vmcore, and can be examined by the crash command (raw RAM dump) or windbg.
+ *
+ * At a high level:
+ *
+ * Hypervisor Crash:
+ *    Upon crash, the hypervisor goes into an emergency minimal dispatch loop,
+ *    a restrictive mode with very limited hypercall and MSR support. It then
+ *    injects NMIs into the root vcpus. Linux checks a shared page in its NMI
+ *    handler to determine whether the hypervisor has crashed. This shared
+ *    page is set up in hv_root_crash_init during boot.
+ *
+ * Linux Crash:
+ *    In case of a Linux crash, the callback hv_crash_stop_other_cpus will
+ *    send NMIs to all cpus, then proceed to crash_nmi_callback where it
+ *    waits for all cpus to arrive in NMI context.
+ *
+ * NMI Handler (upon quorum):
+ *    Eventually, in both cases, all cpus end up in the NMI handler.
+ *    Hyper-V requires that the disable hypercall be issued from the BSP. So
+ *    the BSP NMI handler saves the current context, does some fixups and
+ *    makes the hypercall to disable the hypervisor, ie, devirtualize. The
+ *    hypervisor at that point will suspend all vcpus (except the BSP),
+ *    unlock all its RAM, and return to Linux at the 32bit mode entry RIP.
+ *
+ *    The Linux 32bit entry trampoline will then restore long mode and call
+ *    the C function here to restore context and continue execution to
+ *    crash kexec.
+ */
+
+#include <linux/delay.h>
+#include <linux/kexec.h>
+#include <linux/crash_dump.h>
+#include <linux/panic.h>
+#include <asm/apic.h>
+#include <asm/desc.h>
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+#include <asm/mshyperv.h>
+#include <asm/nmi.h>
+#include <asm/idtentry.h>
+#include <asm/reboot.h>
+#include <asm/intel_pt.h>
+
+bool hv_crash_enabled;
+EXPORT_SYMBOL_GPL(hv_crash_enabled);
+
+struct hv_crash_ctxt {
+	ulong rsp;
+	ulong cr0;
+	ulong cr2;
+	ulong cr4;
+	ulong cr8;
+
+	u16 cs;
+	u16 ss;
+	u16 ds;
+	u16 es;
+	u16 fs;
+	u16 gs;
+
+	u16 gdt_fill;
+	struct desc_ptr gdtr;
+	char idt_fill[6];
+	struct desc_ptr idtr;
+
+	u64 gsbase;
+	u64 efer;
+	u64 pat;
+};
+static struct hv_crash_ctxt hv_crash_ctxt;
+
+/* Shared hypervisor page that contains crash dump area we peek into.
+ * NB: windbg looks for "hv_cda" symbol so don't change it.
+ */
+static struct hv_crashdump_area *hv_cda;
+
+static u32 trampoline_pa, devirt_arg;
+static atomic_t crash_cpus_wait;
+static void *hv_crash_ptpgs[4];
+static bool hv_has_crashed, lx_has_crashed;
+
+static void __noreturn hv_panic_timeout_reboot(void)
+{
+	#define PANIC_TIMER_STEP 100
+
+	if (panic_timeout > 0) {
+		int i;
+
+		for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP)
+			mdelay(PANIC_TIMER_STEP);
+	}
+
+	if (panic_timeout)
+		native_wrmsrq(HV_X64_MSR_RESET, 1);	/* get hyp to reboot */
+
+	for (;;)
+		cpu_relax();
+}
+
+/* This cannot be inlined as it needs stack */
+static noinline __noclone void hv_crash_restore_tss(void)
+{
+	load_TR_desc();
+}
+
+/* This cannot be inlined as it needs stack */
+static noinline void hv_crash_clear_kernpt(void)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+
+	/* Clear entry so it's not confusing to someone looking at the core */
+	pgd = pgd_offset_k(trampoline_pa);
+	p4d = p4d_offset(pgd, trampoline_pa);
+	native_p4d_clear(p4d);
+}
+
+/*
+ * This is the C entry point from the asm glue code after the disable hypercall.
+ * We enter here in IA32-e long mode, ie, full 64bit mode running on kernel
+ * page tables with our below 4G page identity mapped, but using a temporary
+ * GDT. ds/fs/gs/es are null. ss is not usable. bp is null. stack is not
+ * available. We restore kernel GDT, and rest of the context, and continue
+ * to kexec.
+ */
+static asmlinkage void __noreturn hv_crash_c_entry(void)
+{
+	struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
+
+	/* first thing, restore kernel gdt */
+	native_load_gdt(&ctxt->gdtr);
+
+	asm volatile("movw %%ax, %%ss" : : "a"(ctxt->ss));
+	asm volatile("movq %0, %%rsp" : : "m"(ctxt->rsp));
+
+	asm volatile("movw %%ax, %%ds" : : "a"(ctxt->ds));
+	asm volatile("movw %%ax, %%es" : : "a"(ctxt->es));
+	asm volatile("movw %%ax, %%fs" : : "a"(ctxt->fs));
+	asm volatile("movw %%ax, %%gs" : : "a"(ctxt->gs));
+
+	native_wrmsrq(MSR_IA32_CR_PAT, ctxt->pat);
+	asm volatile("movq %0, %%cr0" : : "r"(ctxt->cr0));
+
+	asm volatile("movq %0, %%cr8" : : "r"(ctxt->cr8));
+	asm volatile("movq %0, %%cr4" : : "r"(ctxt->cr4));
+	asm volatile("movq %0, %%cr2" : : "r"(ctxt->cr2));
+
+	native_load_idt(&ctxt->idtr);
+	native_wrmsrq(MSR_GS_BASE, ctxt->gsbase);
+	native_wrmsrq(MSR_EFER, ctxt->efer);
+
+	/* restore the original kernel CS now via far return */
+	asm volatile("movzwq %0, %%rax\n\t"
+		     "pushq %%rax\n\t"
+		     "pushq $1f\n\t"
+		     "lretq\n\t"
+		     "1:nop\n\t" : : "m"(ctxt->cs) : "rax");
+
+	/* We are in asmlinkage without stack frame, hence make C function
+	 * calls which will buy stack frames.
+	 */
+	hv_crash_restore_tss();
+	hv_crash_clear_kernpt();
+
+	/* we are now fully in devirtualized normal kernel mode */
+	__crash_kexec(NULL);
+
+	hv_panic_timeout_reboot();
+}
+/* Tell gcc we are using lretq long jump in the above function intentionally */
+STACK_FRAME_NON_STANDARD(hv_crash_c_entry);
+
+static void hv_mark_tss_not_busy(void)
+{
+	struct desc_struct *desc = get_current_gdt_rw();
+	tss_desc tss;
+
+	memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
+	tss.type = 0x9;	/* available 64-bit TSS. 0xB is busy TSS */
+	write_gdt_entry(desc, GDT_ENTRY_TSS, &tss, DESC_TSS);
+}
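hv_mark_tss_not_busy() above exists because the LTR executed later by hv_crash_restore_tss() faults on a TSS descriptor whose type is still 0xB (busy); the type nibble sits in bits 40-43 of the descriptor's low quadword. A minimal model of that bit surgery (plain C, standing in for the kernel's tss_desc handling):

#include <stdint.h>
#include <stdio.h>

#define DESC_TYPE_SHIFT	40
#define DESC_TYPE_MASK	(0xfULL << DESC_TYPE_SHIFT)
#define TSS_AVAIL	0x9ULL	/* available 64-bit TSS */
#define TSS_BUSY	0xbULL	/* busy 64-bit TSS; LTR would #GP on it */

static uint64_t tss_mark_avail(uint64_t desc_lo)
{
	return (desc_lo & ~DESC_TYPE_MASK) | (TSS_AVAIL << DESC_TYPE_SHIFT);
}

int main(void)
{
	uint64_t busy = (TSS_BUSY << DESC_TYPE_SHIFT) | 0x67; /* limit only */

	printf("%#llx -> %#llx\n", (unsigned long long)busy,
	       (unsigned long long)tss_mark_avail(busy));
	return 0;
}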
+
+/* Save essential context */
+static void hv_hvcrash_ctxt_save(void)
+{
+	struct hv_crash_ctxt *ctxt = &hv_crash_ctxt;
+
+	asm volatile("movq %%rsp,%0" : "=m"(ctxt->rsp));
+
+	ctxt->cr0 = native_read_cr0();
+	ctxt->cr4 = native_read_cr4();
+
+	asm volatile("movq %%cr2, %0" : "=a"(ctxt->cr2));
+	asm volatile("movq %%cr8, %0" : "=a"(ctxt->cr8));
+
+	asm volatile("movl %%cs, %%eax" : "=a"(ctxt->cs));
+	asm volatile("movl %%ss, %%eax" : "=a"(ctxt->ss));
+	asm volatile("movl %%ds, %%eax" : "=a"(ctxt->ds));
+	asm volatile("movl %%es, %%eax" : "=a"(ctxt->es));
+	asm volatile("movl %%fs, %%eax" : "=a"(ctxt->fs));
+	asm volatile("movl %%gs, %%eax" : "=a"(ctxt->gs));
+
+	native_store_gdt(&ctxt->gdtr);
+	store_idt(&ctxt->idtr);
+
+	ctxt->gsbase = __rdmsr(MSR_GS_BASE);
+	ctxt->efer = __rdmsr(MSR_EFER);
+	ctxt->pat = __rdmsr(MSR_IA32_CR_PAT);
+}
+
+/* Add trampoline page to the kernel pagetable for transition to kernel PT */
+static void hv_crash_fixup_kernpt(void)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+
+	pgd = pgd_offset_k(trampoline_pa);
+	p4d = p4d_offset(pgd, trampoline_pa);
+
+	/* trampoline_pa is below 4G, so no pre-existing entry to clobber */
+	p4d_populate(&init_mm, p4d, (pud_t *)hv_crash_ptpgs[1]);
+	p4d->p4d = p4d->p4d & ~(_PAGE_NX);	/* enable execute */
+}
+
+/*
+ * Notify the hyp that Linux has crashed. This will cause the hyp to quiesce
+ * and suspend all guest VPs.
+ */
+static void hv_notify_prepare_hyp(void)
+{
+	u64 status;
+	struct hv_input_notify_partition_event *input;
+	struct hv_partition_event_root_crashdump_input *cda;
+
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	cda = &input->input.crashdump_input;
+	memset(input, 0, sizeof(*input));
+	input->event = HV_PARTITION_EVENT_ROOT_CRASHDUMP;
+
+	cda->crashdump_action = HV_CRASHDUMP_ENTRY;
+	status = hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
+	if (!hv_result_success(status))
+		return;
+
+	cda->crashdump_action = HV_CRASHDUMP_SUSPEND_ALL_VPS;
+	hv_do_hypercall(HVCALL_NOTIFY_PARTITION_EVENT, input, NULL);
+}
+
+/*
+ * Common function for all cpus before devirtualization.
+ *
+ * Hypervisor crash: all cpus get here in NMI context.
+ * Linux crash: the panicking cpu gets here at base level, all others in NMI
+ *		context. Note, the panicking cpu may not be the BSP.
+ *
+ * The function is not inlined so it will show on the stack. It is named so
+ * because the crash cmd looks for certain well known function names on the
+ * stack before looking into the cpu saved note in the elf section, and
+ * that work is currently incomplete.
+ *
+ * Notes:
+ *  Hypervisor crash:
+ *   - the hypervisor is in a very restrictive mode at this point and any
+ *     vmexit it cannot handle would result in reboot. So, no mumbo jumbo,
+ *     just get to kexec as quickly as possible.
+ *
+ *   Devirtualization is supported from the BSP only at present.
+ */
+static noinline __noclone void crash_nmi_callback(struct pt_regs *regs)
+{
+	struct hv_input_disable_hyp_ex *input;
+	u64 status;
+	int msecs = 1000, ccpu = smp_processor_id();
+
+	if (ccpu == 0) {
+		/* crash_save_cpu() will be done in the kexec path */
+		cpu_emergency_stop_pt();	/* disable performance trace */
+		atomic_inc(&crash_cpus_wait);
+	} else {
+		crash_save_cpu(regs, ccpu);
+		cpu_emergency_stop_pt();	/* disable performance trace */
+		atomic_inc(&crash_cpus_wait);
+		for (;;)
+			cpu_relax();
+	}
+
+	while (atomic_read(&crash_cpus_wait) < num_online_cpus() && msecs--)
+		mdelay(1);
+
+	stop_nmi();
+	if (!hv_has_crashed)
+		hv_notify_prepare_hyp();
+
+	if (crashing_cpu == -1)
+		crashing_cpu = ccpu;	/* crash cmd uses this */
+
+	hv_hvcrash_ctxt_save();
+	hv_mark_tss_not_busy();
+	hv_crash_fixup_kernpt();
+
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+	input->rip = trampoline_pa;
+	input->arg = devirt_arg;
+
+	status = hv_do_hypercall(HVCALL_DISABLE_HYP_EX, input, NULL);
+
+	hv_panic_timeout_reboot();
+}
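crash_nmi_callback() above is a rendezvous: each non-BSP cpu saves its registers, increments crash_cpus_wait and parks itself, while the BSP polls the counter with a roughly one second bound before proceeding to devirtualize. The same rendezvous shape as a pthreads model (userspace sketch only; the kernel uses atomic_inc(), mdelay() and cpu_relax()):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NCPUS 4

static atomic_int crash_cpus_wait;

/* models a non-BSP cpu: check in, then park forever */
static void *ap_entry(void *arg)
{
	(void)arg;
	atomic_fetch_add(&crash_cpus_wait, 1);
	for (;;)
		sleep(1);
	return NULL;
}

int main(void)
{
	pthread_t t[NCPUS - 1];
	int i, msecs = 1000;

	for (i = 0; i < NCPUS - 1; i++)
		pthread_create(&t[i], NULL, ap_entry, NULL);

	atomic_fetch_add(&crash_cpus_wait, 1);	/* the BSP checks in too */

	/* bounded wait: give up after ~1s even if some cpu never arrives */
	while (atomic_load(&crash_cpus_wait) < NCPUS && msecs--)
		usleep(1000);

	printf("quorum: %d of %d cpus\n", atomic_load(&crash_cpus_wait), NCPUS);
	return 0;
}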
+
+
+static DEFINE_SPINLOCK(hv_crash_reboot_lk);
+
+/*
+ * Generic NMI callback handler: could be called without any crash also.
+ *  hv crash: hypervisor injects NMIs into all cpus
+ *  lx crash: panicking cpu sends NMI to all but self via crash_stop_other_cpus
+ */
+static int hv_crash_nmi_local(unsigned int cmd, struct pt_regs *regs)
+{
+	if (!hv_has_crashed && hv_cda && hv_cda->cda_valid)
+		hv_has_crashed = true;
+
+	if (!hv_has_crashed && !lx_has_crashed)
+		return NMI_DONE;	/* ignore the NMI */
+
+	if (hv_has_crashed && !kexec_crash_loaded()) {
+		if (spin_trylock(&hv_crash_reboot_lk))
+			hv_panic_timeout_reboot();
+		else
+			for (;;)
+				cpu_relax();
+	}
+
+	crash_nmi_callback(regs);
+
+	return NMI_DONE;
+}
+
+/*
+ * hv_crash_stop_other_cpus() == smp_ops.crash_stop_other_cpus
+ *
+ * On normal Linux panic, this is called twice: first from panic and then again
+ * from native_machine_crash_shutdown.
+ *
+ * In case of hyperv, 3 ways to get here:
+ *  1. hv crash (only BSP will get here):
+ *	BSP : NMI callback -> DisableHv -> hv_crash_asm32 -> hv_crash_c_entry
+ *	      -> __crash_kexec -> native_machine_crash_shutdown
+ *	      -> crash_smp_send_stop -> smp_ops.crash_stop_other_cpus
+ *  Linux panic:
+ *   2. panic cpu x: panic() -> crash_smp_send_stop
+ *				-> smp_ops.crash_stop_other_cpus
+ *   3. BSP: native_machine_crash_shutdown -> crash_smp_send_stop
+ *
+ * NB: noclone and non standard stack because of call to crash_setup_regs().
+ */
+static void __noclone hv_crash_stop_other_cpus(void)
+{
+	static bool crash_stop_done;
+	struct pt_regs lregs;
+	int ccpu = smp_processor_id();
+
+	if (hv_has_crashed)
+		return;		/* all cpus already in NMI handler path */
+
+	if (!kexec_crash_loaded()) {
+		hv_notify_prepare_hyp();
+		hv_panic_timeout_reboot();	/* no return */
+	}
+
+	/* If the hv crashes also, we could come here again before cpus_stopped
+	 * is set in crash_smp_send_stop(). So use our own check.
+	 */
+	if (crash_stop_done)
+		return;
+	crash_stop_done = true;
+
+	/* Linux has crashed: hv is healthy, we can IPI safely */
+	lx_has_crashed = true;
+	wmb();		/* NMI handlers look at lx_has_crashed */
+
+	apic->send_IPI_allbutself(NMI_VECTOR);
+
+	if (crashing_cpu == -1)
+		crashing_cpu = ccpu;	/* crash cmd uses this */
+
+	/* crash_setup_regs() happens in kexec also, but for the kexec cpu which
+	 * is the BSP. We could be here on a non-BSP cpu, collect regs if so.
+	 */
+	if (ccpu)
+		crash_setup_regs(&lregs, NULL);
+
+	crash_nmi_callback(&lregs);
+}
+STACK_FRAME_NON_STANDARD(hv_crash_stop_other_cpus);
+
+/* This GDT is accessed in IA32-e compat mode which uses 32bit addresses */
+struct hv_gdtreg_32 {
+	u16 fill;
+	u16 limit;
+	u32 address;
+} __packed;
+
+/* We need a CS with L bit to go to IA32-e long mode from 32bit compat mode */
+struct hv_crash_tramp_gdt {
+	u64 null;	/* index 0, selector 0, null selector */
+	u64 cs64;	/* index 1, selector 8, cs64 selector */
+} __packed;
+
+/* No stack, so jump via far ptr in memory to load the 64bit CS */
+struct hv_cs_jmptgt {
+	u32 address;
+	u16 csval;
+	u16 fill;
+} __packed;
+
+/* Linux use only, hypervisor doesn't look at this struct */
+struct hv_crash_tramp_data {
+	u64 tramp32_cr3;
+	u64 kernel_cr3;
+	struct hv_gdtreg_32 gdtr32;
+	struct hv_crash_tramp_gdt tramp_gdt;
+	struct hv_cs_jmptgt cs_jmptgt;
+	u64 c_entry_addr;
+} __packed;
+
+/*
+ * Set up a temporary gdt to allow the asm code to switch to long mode.
+ * Since the asm code is relocated/copied to a below 4G page, it cannot use rip
+ * relative addressing, hence we must use trampoline_pa here. Also, save other
+ * info like jmp and C entry targets for same reasons.
+ *
+ * Returns: 0 on success, -1 on error
+ */
+static int hv_crash_setup_trampdata(u64 trampoline_va)
+{
+	int size, offs;
+	void *dest;
+	struct hv_crash_tramp_data *tramp;
+
+	/* These must match exactly the ones in the corresponding asm file */
+	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, tramp32_cr3) != 0);
+	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, kernel_cr3) != 8);
+	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, gdtr32.limit) != 18);
+	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data,
+			      cs_jmptgt.address) != 40);
+	BUILD_BUG_ON(offsetof(struct hv_crash_tramp_data, c_entry_addr) != 48);
+
+	/* hv_crash_asm_end is beyond last byte by 1 */
+	size = &hv_crash_asm_end - &hv_crash_asm32;
+	if (size + sizeof(struct hv_crash_tramp_data) > PAGE_SIZE) {
+		pr_err("%s: trampoline page overflow\n", __func__);
+		return -1;
+	}
+
+	dest = (void *)trampoline_va;
+	memcpy(dest, &hv_crash_asm32, size);
+
+	dest += size;
+	dest = (void *)round_up((ulong)dest, 16);
+	tramp = (struct hv_crash_tramp_data *)dest;
+
+	/* see MAX_ASID_AVAILABLE in tlb.c: "PCID 0 is reserved for use by
+	 * non-PCID-aware users". Build cr3 with pcid 0
+	 */
+	tramp->tramp32_cr3 = __sme_pa(hv_crash_ptpgs[0]);
+
+	/* Note, when restoring X86_CR4_PCIDE, cr3[11:0] must be zero */
+	tramp->kernel_cr3 = __sme_pa(init_mm.pgd);
+
+	tramp->gdtr32.limit = sizeof(struct hv_crash_tramp_gdt);
+	tramp->gdtr32.address = trampoline_pa +
+				(ulong)&tramp->tramp_gdt - trampoline_va;
+
+	/* base:0 limit:0xfffff type:b dpl:0 P:1 L:1 D:0 avl:0 G:1 */
+	tramp->tramp_gdt.cs64 = 0x00af9b000000ffff;
+
+	tramp->cs_jmptgt.csval = 0x8;
+	offs = (ulong)&hv_crash_asm64 - (ulong)&hv_crash_asm32;
+	tramp->cs_jmptgt.address = trampoline_pa + offs;
+
+	tramp->c_entry_addr = (u64)&hv_crash_c_entry;
+
+	devirt_arg = trampoline_pa + (ulong)dest - trampoline_va;
+
+	return 0;
+}
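The cs64 entry written by hv_crash_setup_trampdata() above is the usual long mode code descriptor; decoding 0x00af9b000000ffff recovers exactly the fields named in the comment. A decode sketch (plain C, bit positions per the x86 segment descriptor format):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t d = 0x00af9b000000ffffULL;	/* trampoline cs64 entry */

	uint32_t limit = (uint32_t)(d & 0xffff) | (uint32_t)((d >> 32) & 0xf0000);
	uint32_t base = (uint32_t)((d >> 16) & 0xffffff) |
			(uint32_t)(((d >> 56) & 0xff) << 24);

	printf("base=%#x limit=%#x type=%#llx dpl=%llu P=%llu AVL=%llu "
	       "L=%llu D=%llu G=%llu\n", base, limit,
	       (unsigned long long)((d >> 40) & 0xf), /* 0xb: code, read/exec */
	       (unsigned long long)((d >> 45) & 0x3),
	       (unsigned long long)((d >> 47) & 1),
	       (unsigned long long)((d >> 52) & 1),
	       (unsigned long long)((d >> 53) & 1),
	       (unsigned long long)((d >> 54) & 1),
	       (unsigned long long)((d >> 55) & 1));
	return 0;
}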
+
+/*
+ * Build 32bit trampoline page table for transition from protected mode
+ * non-paging to long-mode paging. This transition needs pagetables below 4G.
+ */ +static void hv_crash_build_tramp_pt(void) +{ + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + u64 pa, addr = trampoline_pa; + + p4d = hv_crash_ptpgs[0] + pgd_index(addr) * sizeof(p4d); + pa = virt_to_phys(hv_crash_ptpgs[1]); + set_p4d(p4d, __p4d(_PAGE_TABLE | pa)); + p4d->p4d &= ~(_PAGE_NX); /* enable execute */ + + pud = hv_crash_ptpgs[1] + pud_index(addr) * sizeof(pud); + pa = virt_to_phys(hv_crash_ptpgs[2]); + set_pud(pud, __pud(_PAGE_TABLE | pa)); + + pmd = hv_crash_ptpgs[2] + pmd_index(addr) * sizeof(pmd); + pa = virt_to_phys(hv_crash_ptpgs[3]); + set_pmd(pmd, __pmd(_PAGE_TABLE | pa)); + + pte = hv_crash_ptpgs[3] + pte_index(addr) * sizeof(pte); + set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); +} + +/* + * Setup trampoline for devirtualization: + * - a page below 4G, ie 32bit addr containing asm glue code that hyp jmps to + * in protected mode. + * - 4 pages for a temporary page table that asm code uses to turn paging on + * - a temporary gdt to use in the compat mode. + * + * Returns: 0 on success + */ +static int hv_crash_trampoline_setup(void) +{ + int i, rc, order; + struct page *page; + u64 trampoline_va; + gfp_t flags32 = GFP_KERNEL | GFP_DMA32 | __GFP_ZERO; + + /* page for 32bit trampoline assembly code + hv_crash_tramp_data */ + page = alloc_page(flags32); + if (page == NULL) { + pr_err("%s: failed to alloc asm stub page\n", __func__); + return -1; + } + + trampoline_va = (u64)page_to_virt(page); + trampoline_pa = (u32)page_to_phys(page); + + order = 2; /* alloc 2^2 pages */ + page = alloc_pages(flags32, order); + if (page == NULL) { + pr_err("%s: failed to alloc pt pages\n", __func__); + free_page(trampoline_va); + return -1; + } + + for (i = 0; i < 4; i++, page++) + hv_crash_ptpgs[i] = page_to_virt(page); + + hv_crash_build_tramp_pt(); + + rc = hv_crash_setup_trampdata(trampoline_va); + if (rc) + goto errout; + + return 0; + +errout: + free_page(trampoline_va); + free_pages((ulong)hv_crash_ptpgs[0], order); + + return rc; +} + +/* Setup for kdump kexec to collect hypervisor RAM when running as root */ +void hv_root_crash_init(void) +{ + int rc; + struct hv_input_get_system_property *input; + struct hv_output_get_system_property *output; + unsigned long flags; + u64 status; + union hv_pfn_range cda_info; + + if (pgtable_l5_enabled()) { + pr_err("Hyper-V: crash dump not yet supported on 5level PTs\n"); + return; + } + + rc = register_nmi_handler(NMI_LOCAL, hv_crash_nmi_local, NMI_FLAG_FIRST, + "hv_crash_nmi"); + if (rc) { + pr_err("Hyper-V: failed to register crash nmi handler\n"); + return; + } + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + memset(input, 0, sizeof(*input)); + input->property_id = HV_SYSTEM_PROPERTY_CRASHDUMPAREA; + + status = hv_do_hypercall(HVCALL_GET_SYSTEM_PROPERTY, input, output); + cda_info.as_uint64 = output->hv_cda_info.as_uint64; + local_irq_restore(flags); + + if (!hv_result_success(status)) { + pr_err("Hyper-V: %s: property:%d %s\n", __func__, + input->property_id, hv_result_to_string(status)); + goto err_out; + } + + if (cda_info.base_pfn == 0) { + pr_err("Hyper-V: hypervisor crash dump area pfn is 0\n"); + goto err_out; + } + + hv_cda = phys_to_virt(cda_info.base_pfn << HV_HYP_PAGE_SHIFT); + + rc = hv_crash_trampoline_setup(); + if (rc) + goto err_out; + + smp_ops.crash_stop_other_cpus = hv_crash_stop_other_cpus; + + crash_kexec_post_notifiers = true; + hv_crash_enabled = true; + pr_info("Hyper-V: both linux and hypervisor kdump support 
enabled\n"); + + return; + +err_out: + unregister_nmi_handler(NMI_LOCAL, "hv_crash_nmi"); + pr_err("Hyper-V: only linux root kdump support enabled\n"); +} diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 7abb09e2eeb8..14de43f4bc6c 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -1,135 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * X86 specific Hyper-V initialization code. * * Copyright (C) 2016, Microsoft, Inc. * * Author : K. Y. Srinivasan <kys@microsoft.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * */ +#define pr_fmt(fmt) "Hyper-V: " fmt + #include <linux/efi.h> #include <linux/types.h> +#include <linux/bitfield.h> +#include <linux/io.h> #include <asm/apic.h> #include <asm/desc.h> +#include <asm/e820/api.h> +#include <asm/sev.h> #include <asm/hypervisor.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> +#include <asm/msr.h> +#include <asm/idtentry.h> +#include <asm/set_memory.h> +#include <linux/kexec.h> #include <linux/version.h> #include <linux/vmalloc.h> #include <linux/mm.h> -#include <linux/clockchips.h> -#include <linux/hyperv.h> #include <linux/slab.h> +#include <linux/kernel.h> #include <linux/cpuhotplug.h> +#include <linux/syscore_ops.h> +#include <clocksource/hyperv_timer.h> +#include <linux/highmem.h> +#include <linux/export.h> -#ifdef CONFIG_HYPERV_TSCPAGE - -static struct ms_hyperv_tsc_page *tsc_pg; +void *hv_hypercall_pg; -struct ms_hyperv_tsc_page *hv_get_tsc_page(void) +#ifdef CONFIG_X86_64 +static u64 __hv_hyperfail(u64 control, u64 param1, u64 param2) { - return tsc_pg; + return U64_MAX; } -EXPORT_SYMBOL_GPL(hv_get_tsc_page); -static u64 read_hv_clock_tsc(struct clocksource *arg) +DEFINE_STATIC_CALL(__hv_hypercall, __hv_hyperfail); + +u64 hv_std_hypercall(u64 control, u64 param1, u64 param2) { - u64 current_tick = hv_read_tsc_page(tsc_pg); + u64 hv_status; - if (current_tick == U64_MAX) - rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + register u64 __r8 asm("r8") = param2; + asm volatile ("call " STATIC_CALL_TRAMP_STR(__hv_hypercall) + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (param1), "+r" (__r8) + : : "cc", "memory", "r9", "r10", "r11"); - return current_tick; + return hv_status; } -static struct clocksource hyperv_cs_tsc = { - .name = "hyperv_clocksource_tsc_page", - .rating = 400, - .read = read_hv_clock_tsc, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; -#endif +typedef u64 (*hv_hypercall_f)(u64 control, u64 param1, u64 param2); -static u64 read_hv_clock_msr(struct clocksource *arg) +static inline void hv_set_hypercall_pg(void *ptr) { - u64 current_tick; - /* - * Read the partition counter to get the current tick count. This count - * is set to 0 when the partition is created and is incremented in - * 100 nanosecond units. 
- */ - rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); - return current_tick; -} - -static struct clocksource hyperv_cs_msr = { - .name = "hyperv_clocksource_msr", - .rating = 400, - .read = read_hv_clock_msr, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; + hv_hypercall_pg = ptr; -void *hv_hypercall_pg; + if (!ptr) + ptr = &__hv_hyperfail; + static_call_update(__hv_hypercall, (hv_hypercall_f)ptr); +} +#else +static inline void hv_set_hypercall_pg(void *ptr) +{ + hv_hypercall_pg = ptr; +} EXPORT_SYMBOL_GPL(hv_hypercall_pg); -struct clocksource *hyperv_cs; -EXPORT_SYMBOL_GPL(hyperv_cs); +#endif + +union hv_ghcb * __percpu *hv_ghcb_pg; -u32 *hv_vp_index; -EXPORT_SYMBOL_GPL(hv_vp_index); +/* Storage to save the hypercall page temporarily for hibernation */ +static void *hv_hypercall_pg_saved; struct hv_vp_assist_page **hv_vp_assist_page; EXPORT_SYMBOL_GPL(hv_vp_assist_page); -void __percpu **hyperv_pcpu_input_arg; -EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg); +static int hyperv_init_ghcb(void) +{ + u64 ghcb_gpa; + void *ghcb_va; + void **ghcb_base; -u32 hv_max_vp_index; + if (!ms_hyperv.paravisor_present || !hv_isolation_type_snp()) + return 0; -static int hv_cpu_init(unsigned int cpu) -{ - u64 msr_vp_index; - struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()]; - void **input_arg; + if (!hv_ghcb_pg) + return -EINVAL; - input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - *input_arg = page_address(alloc_page(GFP_KERNEL)); + /* + * GHCB page is allocated by paravisor. The address + * returned by MSR_AMD64_SEV_ES_GHCB is above shared + * memory boundary and map it here. + */ + rdmsrq(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa); - hv_get_vp_index(msr_vp_index); + /* Mask out vTOM bit. ioremap_cache() maps decrypted */ + ghcb_gpa &= ~ms_hyperv.shared_gpa_boundary; + ghcb_va = (void *)ioremap_cache(ghcb_gpa, HV_HYP_PAGE_SIZE); + if (!ghcb_va) + return -ENOMEM; - hv_vp_index[smp_processor_id()] = msr_vp_index; + ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg); + *ghcb_base = ghcb_va; - if (msr_vp_index > hv_max_vp_index) - hv_max_vp_index = msr_vp_index; + return 0; +} - if (!hv_vp_assist_page) - return 0; +static int hv_cpu_init(unsigned int cpu) +{ + union hv_vp_assist_msr_contents msr = { 0 }; + struct hv_vp_assist_page **hvp; + int ret; - if (!*hvp) - *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + ret = hv_common_cpu_init(cpu); + if (ret) + return ret; - if (*hvp) { - u64 val; + if (!hv_vp_assist_page) + return 0; - val = vmalloc_to_pfn(*hvp); - val = (val << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) | - HV_X64_MSR_VP_ASSIST_PAGE_ENABLE; + hvp = &hv_vp_assist_page[cpu]; + if (hv_root_partition()) { + /* + * For root partition we get the hypervisor provided VP assist + * page, instead of allocating a new page. + */ + rdmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + *hvp = memremap(msr.pfn << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT, + PAGE_SIZE, MEMREMAP_WB); + } else { + /* + * The VP assist page is an "overlay" page (see Hyper-V TLFS's + * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed + * out to make sure we always write the EOI MSR in + * hv_apic_eoi_write() *after* the EOI optimization is disabled + * in hv_cpu_die(), otherwise a CPU may not be stopped in the + * case of CPU offlining and the VM will hang. + */ + if (!*hvp) { + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); + + /* + * Hyper-V should never specify a VM that is a Confidential + * VM and also running in the root partition. 
Root partition + * is blocked to run in Confidential VM. So only decrypt assist + * page in non-root partition here. + */ + if (*hvp && !ms_hyperv.paravisor_present && hv_isolation_type_snp()) { + WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1)); + memset(*hvp, 0, PAGE_SIZE); + } + } + + if (*hvp) + msr.pfn = vmalloc_to_pfn(*hvp); - wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val); + } + if (!WARN_ON(!(*hvp))) { + msr.enable = 1; + wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); } - return 0; + /* Allow Hyper-V stimer vector to be injected from Hypervisor. */ + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) + apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, true); + + return hyperv_init_ghcb(); } static void (*hv_reenlightenment_cb)(void); @@ -138,7 +183,7 @@ static void hv_reenlightenment_notify(struct work_struct *dummy) { struct hv_tsc_emulation_status emu_status; - rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + rdmsrq(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); /* Don't issue the callback if TSC accesses are not emulated */ if (hv_reenlightenment_cb && emu_status.inprogress) @@ -151,11 +196,11 @@ void hyperv_stop_tsc_emulation(void) u64 freq; struct hv_tsc_emulation_status emu_status; - rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + rdmsrq(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); emu_status.inprogress = 0; - wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + wrmsrq(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); - rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq); tsc_khz = div64_u64(freq, 1000); } EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); @@ -163,23 +208,19 @@ EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); static inline bool hv_reenlightenment_available(void) { /* - * Check for required features and priviliges to make TSC frequency + * Check for required features and privileges to make TSC frequency * change notifications work. 
*/ - return ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + return ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS && ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE && - ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT; + ms_hyperv.features & HV_ACCESS_REENLIGHTENMENT; } -__visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs) +DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_reenlightenment) { - entering_ack_irq(); - + apic_eoi(); inc_irq_stat(irq_hv_reenlightenment_count); - schedule_delayed_work(&hv_reenlightenment_work, HZ/10); - - exiting_irq(); } void set_hv_tscchange_cb(void (*cb)(void)) @@ -187,22 +228,28 @@ void set_hv_tscchange_cb(void (*cb)(void)) struct hv_reenlightenment_control re_ctrl = { .vector = HYPERV_REENLIGHTENMENT_VECTOR, .enabled = 1, - .target_vp = hv_vp_index[smp_processor_id()] }; struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; if (!hv_reenlightenment_available()) { - pr_warn("Hyper-V: reenlightenment support is unavailable\n"); + pr_warn("reenlightenment support is unavailable\n"); return; } + if (!hv_vp_index) + return; + hv_reenlightenment_cb = cb; /* Make sure callback is registered before we write to MSRs */ wmb(); - wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); - wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); + re_ctrl.target_vp = hv_vp_index[get_cpu()]; + + wrmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + wrmsrq(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); + + put_cpu(); } EXPORT_SYMBOL_GPL(set_hv_tscchange_cb); @@ -213,9 +260,9 @@ void clear_hv_tscchange_cb(void) if (!hv_reenlightenment_available()) return; - rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + rdmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); re_ctrl.enabled = 0; - wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + wrmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); hv_reenlightenment_cb = NULL; } @@ -225,30 +272,55 @@ static int hv_cpu_die(unsigned int cpu) { struct hv_reenlightenment_control re_ctrl; unsigned int new_cpu; - unsigned long flags; - void **input_arg; - void *input_pg = NULL; + void **ghcb_va; - local_irq_save(flags); - input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg); - input_pg = *input_arg; - *input_arg = NULL; - local_irq_restore(flags); - free_page((unsigned long)input_pg); + if (hv_ghcb_pg) { + ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg); + if (*ghcb_va) + iounmap(*ghcb_va); + *ghcb_va = NULL; + } - if (hv_vp_assist_page && hv_vp_assist_page[cpu]) - wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0); + if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) + apic_update_vector(cpu, HYPERV_STIMER0_VECTOR, false); + + hv_common_cpu_die(cpu); + + if (hv_vp_assist_page && hv_vp_assist_page[cpu]) { + union hv_vp_assist_msr_contents msr = { 0 }; + if (hv_root_partition()) { + /* + * For root partition the VP assist page is mapped to + * hypervisor provided page, and thus we unmap the + * page here and nullify it, so that in future we have + * correct page address mapped in hv_cpu_init. 
+ */ + memunmap(hv_vp_assist_page[cpu]); + hv_vp_assist_page[cpu] = NULL; + rdmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + msr.enable = 0; + } + wrmsrq(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64); + } if (hv_reenlightenment_cb == NULL) return 0; - rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + rdmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); if (re_ctrl.target_vp == hv_vp_index[cpu]) { - /* Reassign to some other online CPU */ + /* + * Reassign reenlightenment notifications to some other online + * CPU or just disable the feature if there are no online CPUs + * left (happens on hibernation). + */ new_cpu = cpumask_any_but(cpu_online_mask, cpu); - re_ctrl.target_vp = hv_vp_index[new_cpu]; - wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + if (new_cpu < nr_cpu_ids) + re_ctrl.target_vp = hv_vp_index[new_cpu]; + else + re_ctrl.enabled = 0; + + wrmsrq(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); } return 0; @@ -256,20 +328,118 @@ static int hv_cpu_die(unsigned int cpu) static int __init hv_pci_init(void) { - int gen2vm = efi_enabled(EFI_BOOT); + bool gen2vm = efi_enabled(EFI_BOOT); /* - * For Generation-2 VM, we exit from pci_arch_init() by returning 0. - * The purpose is to suppress the harmless warning: + * A Generation-2 VM doesn't support legacy PCI/PCIe, so both + * raw_pci_ops and raw_pci_ext_ops are NULL, and pci_subsys_init() -> + * pcibios_init() doesn't call pcibios_resource_survey() -> + * e820__reserve_resources_late(); as a result, any emulated persistent + * memory of E820_TYPE_PRAM (12) via the kernel parameter + * memmap=nn[KMG]!ss is not added into iomem_resource and hence can't be + * detected by register_e820_pmem(). Fix this by directly calling + * e820__reserve_resources_late() here: e820__reserve_resources_late() + * depends on e820__reserve_resources(), which has been called earlier + * from setup_arch(). Note: e820__reserve_resources_late() also adds + * any memory of E820_TYPE_PMEM (7) into iomem_resource, and + * acpi_nfit_register_region() -> acpi_nfit_insert_resource() -> + * region_intersects() returns REGION_INTERSECTS, so the memory of + * E820_TYPE_PMEM won't get added twice. + * + * We return 0 here so that pci_arch_init() won't print the warning: * "PCI: Fatal: No config space access function found" */ - if (gen2vm) + if (gen2vm) { + e820__reserve_resources_late(); return 0; + } /* For Generation-1 VM, we'll proceed in pci_arch_init(). */ return 1; } +static int hv_suspend(void *data) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + int ret; + + if (hv_root_partition()) + return -EPERM; + + /* + * Reset the hypercall page as it is going to be invalidated + * across hibernation. Setting hv_hypercall_pg to NULL ensures + * that any subsequent hypercall operation fails safely instead of + * crashing due to an access of an invalid page. The hypercall page + * pointer is restored on resume. 
+ */ + hv_hypercall_pg_saved = hv_hypercall_pg; + hv_set_hypercall_pg(NULL); + + /* Disable the hypercall page in the hypervisor */ + rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hypercall_msr.enable = 0; + wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + ret = hv_cpu_die(0); + return ret; +} + +static void hv_resume(void *data) +{ + union hv_x64_msr_hypercall_contents hypercall_msr; + int ret; + + ret = hv_cpu_init(0); + WARN_ON(ret); + + /* Re-enable the hypercall page */ + rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hypercall_msr.enable = 1; + hypercall_msr.guest_physical_address = + vmalloc_to_pfn(hv_hypercall_pg_saved); + wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + hv_set_hypercall_pg(hv_hypercall_pg_saved); + hv_hypercall_pg_saved = NULL; + + /* + * Reenlightenment notifications are disabled by hv_cpu_die(0), + * reenable them here if hv_reenlightenment_cb was previously set. + */ + if (hv_reenlightenment_cb) + set_hv_tscchange_cb(hv_reenlightenment_cb); +} + +/* Note: when the ops are called, only CPU0 is online and IRQs are disabled. */ +static const struct syscore_ops hv_syscore_ops = { + .suspend = hv_suspend, + .resume = hv_resume, +}; + +static struct syscore hv_syscore = { + .ops = &hv_syscore_ops, +}; + +static void (* __initdata old_setup_percpu_clockev)(void); + +static void __init hv_stimer_setup_percpu_clockev(void) +{ + /* + * Ignore any errors in setting up stimer clockevents + * as we can run with the LAPIC timer as a fallback. + */ + (void)hv_stimer_alloc(false); + + /* + * Still register the LAPIC timer, because the direct-mode STIMER is + * not supported by old versions of Hyper-V. This also allows users + * to switch to LAPIC timer via /sys, if they want to. + */ + if (old_setup_percpu_clockev) + old_setup_percpu_clockev(); +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -280,120 +450,174 @@ static int __init hv_pci_init(void) */ void __init hyperv_init(void) { - u64 guest_id, required_msrs; + u64 guest_id; union hv_x64_msr_hypercall_contents hypercall_msr; - int cpuhp, i; + int cpuhp; if (x86_hyper_type != X86_HYPER_MS_HYPERV) return; - /* Absolutely required MSRs */ - required_msrs = HV_X64_MSR_HYPERCALL_AVAILABLE | - HV_X64_MSR_VP_INDEX_AVAILABLE; - - if ((ms_hyperv.features & required_msrs) != required_msrs) + if (hv_common_init()) return; /* - * Allocate the per-CPU state for the hypercall input arg. - * If this allocation fails, we will not be able to setup - * (per-CPU) hypercall input page and thus this failure is - * fatal on Hyper-V. + * The VP assist page is useless to a TDX guest: the only use we + * would have for it is lazy EOI, which can not be used with TDX. */ - hyperv_pcpu_input_arg = alloc_percpu(void *); - - BUG_ON(hyperv_pcpu_input_arg == NULL); + if (hv_isolation_type_tdx()) + hv_vp_assist_page = NULL; + else + hv_vp_assist_page = kcalloc(nr_cpu_ids, + sizeof(*hv_vp_assist_page), + GFP_KERNEL); + if (!hv_vp_assist_page) { + ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; - /* Allocate percpu VP index */ - hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), - GFP_KERNEL); - if (!hv_vp_index) - return; + if (!hv_isolation_type_tdx()) + goto common_free; + } - for (i = 0; i < num_possible_cpus(); i++) - hv_vp_index[i] = VP_INVAL; + if (ms_hyperv.paravisor_present && hv_isolation_type_snp()) { + /* Negotiate GHCB Version. 
*/ + if (!hv_ghcb_negotiate_protocol()) + hv_ghcb_terminate(SEV_TERM_SET_GEN, + GHCB_SEV_ES_PROT_UNSUPPORTED); - hv_vp_assist_page = kcalloc(num_possible_cpus(), - sizeof(*hv_vp_assist_page), GFP_KERNEL); - if (!hv_vp_assist_page) { - ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; - goto free_vp_index; + hv_ghcb_pg = alloc_percpu(union hv_ghcb *); + if (!hv_ghcb_pg) + goto free_vp_assist_page; } - cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + cpuhp = cpuhp_setup_state(CPUHP_AP_HYPERV_ONLINE, "x86/hyperv_init:online", hv_cpu_init, hv_cpu_die); if (cpuhp < 0) - goto free_vp_assist_page; + goto free_ghcb_page; /* * Setup the hypercall page and enable hypercalls. * 1. Register the guest ID * 2. Enable the hypercall and register the hypercall page + * + * A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg: + * when the hypercall input is a page, such a VM must pass a decrypted + * page to Hyper-V, e.g. hv_post_message() uses the per-CPU page + * hyperv_pcpu_input_arg, which is decrypted if no paravisor is present. + * + * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls, + * which are handled by the paravisor and the VM must use an encrypted + * input page: in such a VM, the hyperv_pcpu_input_arg is encrypted and + * used in the hypercalls, e.g. see hv_mark_gpa_visibility() and + * hv_arch_irq_unmask(). Such a VM uses TDX GHCI for two hypercalls: + * 1. HVCALL_SIGNAL_EVENT: see vmbus_set_event() and _hv_do_fast_hypercall8(). + * 2. HVCALL_POST_MESSAGE: the input page must be a decrypted page, i.e. + * hv_post_message() in such a VM can't use the encrypted hyperv_pcpu_input_arg; + * instead, hv_post_message() uses the post_msg_page, which is decrypted + * in such a VM and is only used in such a VM. */ - guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); - wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + guest_id = hv_generate_guest_id(LINUX_VERSION_CODE); + wrmsrq(HV_X64_MSR_GUEST_OS_ID, guest_id); - hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); - if (hv_hypercall_pg == NULL) { - wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - goto remove_cpuhp_state; - } + /* With the paravisor, the VM must also write the ID via GHCB/GHCI */ + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id); - rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + /* A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg */ + if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) + goto skip_hypercall_pg_init; + + hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, MODULES_VADDR, + MODULES_END, GFP_KERNEL, PAGE_KERNEL_ROX, + VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, + __builtin_return_address(0)); + if (hv_hypercall_pg == NULL) + goto clean_guest_os_id; + + rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); - hv_apic_init(); + if (hv_root_partition()) { + struct page *pg; + void *src; + + /* + * For the root partition, the hypervisor will set up its + * hypercall page. The hypervisor guarantees it will not show + * up in the root's address space. The root can't change the + * location of the hypercall page. + * + * Order is important here. We must enable the hypercall page + * so it is populated with code, then copy the code to an + * executable page. 
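hv_generate_guest_id() above packs the Linux vendor ID and the kernel version into the 64-bit value written to HV_X64_MSR_GUEST_OS_ID. A sketch of that packing (the 0x8100 vendor ID and the field positions follow the asm-generic/mshyperv.h convention; treat them as this sketch's assumption):

#include <stdint.h>
#include <stdio.h>

#define HV_LINUX_VENDOR_ID 0x8100	/* assumed, per asm-generic/mshyperv.h */

static uint64_t generate_guest_id(uint64_t kernel_version)
{
	return ((uint64_t)HV_LINUX_VENDOR_ID << 48) | (kernel_version << 16);
}

int main(void)
{
	/* LINUX_VERSION_CODE is KERNEL_VERSION(a, b, c) = a<<16 | b<<8 | c */
	uint64_t id = generate_guest_id((6 << 16) | (17 << 8) | 0);

	printf("guest os id = %#llx\n", (unsigned long long)id);
	return 0;
}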
+ */ + wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + + pg = vmalloc_to_page(hv_hypercall_pg); + src = memremap(hypercall_msr.guest_physical_address << PAGE_SHIFT, PAGE_SIZE, + MEMREMAP_WB); + BUG_ON(!src); + memcpy_to_page(pg, 0, src, HV_HYP_PAGE_SIZE); + memunmap(src); + + hv_remap_tsc_clocksource(); + hv_root_crash_init(); + hv_sleep_notifiers_register(); + } else { + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); + wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + } - x86_init.pci.arch_init = hv_pci_init; + hv_set_hypercall_pg(hv_hypercall_pg); +skip_hypercall_pg_init: /* - * Register Hyper-V specific clocksource. + * hyperv_init() is called before LAPIC is initialized: see + * apic_intr_mode_init() -> x86_platform.apic_post_init() and + * apic_bsp_setup() -> setup_local_APIC(). The direct-mode STIMER + * depends on LAPIC, so hv_stimer_alloc() should be called from + * x86_init.timers.setup_percpu_clockev. */ -#ifdef CONFIG_HYPERV_TSCPAGE - if (ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE) { - union hv_x64_msr_hypercall_contents tsc_msr; - - tsc_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); - if (!tsc_pg) - goto register_msr_cs; + old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev; + x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev; - hyperv_cs = &hyperv_cs_tsc; - - rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + hv_apic_init(); - tsc_msr.enable = 1; - tsc_msr.guest_physical_address = vmalloc_to_pfn(tsc_pg); + x86_init.pci.arch_init = hv_pci_init; - wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + register_syscore(&hv_syscore); - hyperv_cs_tsc.archdata.vclock_mode = VCLOCK_HVCLOCK; + if (ms_hyperv.priv_high & HV_ACCESS_PARTITION_ID) + hv_get_partition_id(); - clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); - return; - } -register_msr_cs: -#endif +#ifdef CONFIG_PCI_MSI /* - * For 32 bit guests just use the MSR based mechanism for reading - * the partition counter. + * If we're running as root, we want to create our own PCI MSI domain. + * We can't set this in hv_pci_init because that would be too late. */ + if (hv_root_partition()) + x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain; +#endif - hyperv_cs = &hyperv_cs_msr; - if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE) - clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); + /* Query the VMs extended capability once, so that it can be cached. 
*/ + hv_query_ext_cap(0); + + /* Find the VTL */ + ms_hyperv.vtl = get_vtl(); + + if (ms_hyperv.vtl > 0) /* non default VTL */ + hv_vtl_early_init(); return; -remove_cpuhp_state: - cpuhp_remove_state(cpuhp); +clean_guest_os_id: + wrmsrq(HV_X64_MSR_GUEST_OS_ID, 0); + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); + cpuhp_remove_state(CPUHP_AP_HYPERV_ONLINE); +free_ghcb_page: + free_percpu(hv_ghcb_pg); free_vp_assist_page: kfree(hv_vp_assist_page); hv_vp_assist_page = NULL; -free_vp_index: - kfree(hv_vp_index); - hv_vp_index = NULL; +common_free: + hv_common_free(); } /* @@ -402,25 +626,38 @@ free_vp_index: void hyperv_cleanup(void) { union hv_x64_msr_hypercall_contents hypercall_msr; + union hv_reference_tsc_msr tsc_msr; /* Reset our OS id */ - wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + wrmsrq(HV_X64_MSR_GUEST_OS_ID, 0); + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); + + /* + * Reset hypercall page reference before reset the page, + * let hypercall operations fail safely rather than + * panic the kernel for using invalid hypercall page + */ + hv_hypercall_pg = NULL; /* Reset the hypercall page */ - hypercall_msr.as_uint64 = 0; - wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hypercall_msr.as_uint64 = hv_get_msr(HV_X64_MSR_HYPERCALL); + hypercall_msr.enable = 0; + hv_set_msr(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); /* Reset the TSC page */ - hypercall_msr.as_uint64 = 0; - wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64); + tsc_msr.as_uint64 = hv_get_msr(HV_X64_MSR_REFERENCE_TSC); + tsc_msr.enable = 0; + hv_set_msr(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); } -EXPORT_SYMBOL_GPL(hyperv_cleanup); -void hyperv_report_panic(struct pt_regs *regs, long err) +void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die) { static bool panic_reported; u64 guest_id; + if (in_die && !panic_on_oops) + return; + /* * We prefer to report panic on 'die' chain as we have proper * registers to report, but if we miss it (e.g. on BUG()) we need @@ -430,48 +667,21 @@ void hyperv_report_panic(struct pt_regs *regs, long err) return; panic_reported = true; - rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); + rdmsrq(HV_X64_MSR_GUEST_OS_ID, guest_id); - wrmsrl(HV_X64_MSR_CRASH_P0, err); - wrmsrl(HV_X64_MSR_CRASH_P1, guest_id); - wrmsrl(HV_X64_MSR_CRASH_P2, regs->ip); - wrmsrl(HV_X64_MSR_CRASH_P3, regs->ax); - wrmsrl(HV_X64_MSR_CRASH_P4, regs->sp); + wrmsrq(HV_X64_MSR_CRASH_P0, err); + wrmsrq(HV_X64_MSR_CRASH_P1, guest_id); + wrmsrq(HV_X64_MSR_CRASH_P2, regs->ip); + wrmsrq(HV_X64_MSR_CRASH_P3, regs->ax); + wrmsrq(HV_X64_MSR_CRASH_P4, regs->sp); /* * Let Hyper-V know there is crash data available */ - wrmsrl(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); + wrmsrq(HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_CRASH_NOTIFY); } EXPORT_SYMBOL_GPL(hyperv_report_panic); -/** - * hyperv_report_panic_msg - report panic message to Hyper-V - * @pa: physical address of the panic page containing the message - * @size: size of the message in the page - */ -void hyperv_report_panic_msg(phys_addr_t pa, size_t size) -{ - /* - * P3 to contain the physical address of the panic page & P4 to - * contain the size of the panic data in that page. Rest of the - * registers are no-op when the NOTIFY_MSG flag is set. - */ - wrmsrl(HV_X64_MSR_CRASH_P0, 0); - wrmsrl(HV_X64_MSR_CRASH_P1, 0); - wrmsrl(HV_X64_MSR_CRASH_P2, 0); - wrmsrl(HV_X64_MSR_CRASH_P3, pa); - wrmsrl(HV_X64_MSR_CRASH_P4, size); - - /* - * Let Hyper-V know there is crash data available along with - * the panic message. 
- */ - wrmsrl(HV_X64_MSR_CRASH_CTL, - (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG)); -} -EXPORT_SYMBOL_GPL(hyperv_report_panic_msg); - bool hv_is_hyperv_initialized(void) { union hv_x64_msr_hypercall_contents hypercall_msr; @@ -483,13 +693,49 @@ bool hv_is_hyperv_initialized(void) if (x86_hyper_type != X86_HYPER_MS_HYPERV) return false; + /* A TDX VM with no paravisor uses TDX GHCI call rather than hv_hypercall_pg */ + if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) + return true; /* * Verify that earlier initialization succeeded by checking * that the hypercall page is setup */ hypercall_msr.as_uint64 = 0; - rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); return hypercall_msr.enable; } EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized); + +int hv_apicid_to_vp_index(u32 apic_id) +{ + u64 control; + u64 status; + unsigned long irq_flags; + struct hv_get_vp_from_apic_id_in *input; + u32 *output, ret; + + local_irq_save(irq_flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + input->partition_id = HV_PARTITION_ID_SELF; + input->apic_ids[0] = apic_id; + + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_INDEX_FROM_APIC_ID; + status = hv_do_hypercall(control, input, output); + ret = output[0]; + + local_irq_restore(irq_flags); + + if (!hv_result_success(status)) { + pr_err("failed to get vp index from apic id %d, status %#llx\n", + apic_id, status); + return -EINVAL; + } + + return ret; +} +EXPORT_SYMBOL_GPL(hv_apicid_to_vp_index); diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c index a861b0456b1a..81b006601370 100644 --- a/arch/x86/hyperv/hv_spinlock.c +++ b/arch/x86/hyperv/hv_spinlock.c @@ -15,17 +15,17 @@ #include <asm/mshyperv.h> #include <asm/paravirt.h> #include <asm/apic.h> +#include <asm/msr.h> -static bool __initdata hv_pvspin = true; +static bool hv_pvspin __initdata = true; static void hv_qlock_kick(int cpu) { - apic->send_IPI(cpu, X86_PLATFORM_IPI_VECTOR); + __apic_send_IPI(cpu, X86_PLATFORM_IPI_VECTOR); } static void hv_qlock_wait(u8 *byte, u8 val) { - unsigned long msr_val; unsigned long flags; if (in_nmi()) @@ -40,33 +40,39 @@ static void hv_qlock_wait(u8 *byte, u8 val) * To prevent a race against the unlock path it is required to * disable interrupts before accessing the HV_X64_MSR_GUEST_IDLE * MSR. Otherwise, if the IPI from hv_qlock_kick() arrives between - * the lock value check and the rdmsrl() then the vCPU might be put + * the lock value check and the rdmsrq() then the vCPU might be put * into 'idle' state by the hypervisor and kept in that state for * an unspecified amount of time. */ local_irq_save(flags); /* - * Only issue the rdmsrl() when the lock state has not changed. + * Only issue the rdmsrq() when the lock state has not changed. */ - if (READ_ONCE(*byte) == val) - rdmsrl(HV_X64_MSR_GUEST_IDLE, msr_val); + if (READ_ONCE(*byte) == val) { + unsigned long msr_val; + + rdmsrq(HV_X64_MSR_GUEST_IDLE, msr_val); + + (void)msr_val; + } local_irq_restore(flags); } /* * Hyper-V does not support this so far. 
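 * Always returning false tells the core paravirt locking code that it can
 * never observe a preempted vCPU, so spin loops of the form (illustrative
 * only, modelled on the generic locking code):
 *
 *	if (vcpu_is_preempted(task_cpu(owner)))
 *		goto stop_spinning;
 *
 * behave just as they would on bare metal.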
*/ -bool hv_vcpu_is_preempted(int vcpu) +__visible bool hv_vcpu_is_preempted(int vcpu) { return false; } + PV_CALLEE_SAVE_REGS_THUNK(hv_vcpu_is_preempted); void __init hv_init_spinlocks(void) { if (!hv_pvspin || !apic || !(ms_hyperv.hints & HV_X64_CLUSTER_IPI_RECOMMENDED) || - !(ms_hyperv.features & HV_X64_MSR_GUEST_IDLE_AVAILABLE)) { + !(ms_hyperv.features & HV_MSR_GUEST_IDLE_AVAILABLE)) { pr_info("PV spinlocks disabled\n"); return; } diff --git a/arch/x86/hyperv/hv_trampoline.S b/arch/x86/hyperv/hv_trampoline.S new file mode 100644 index 000000000000..25f02ff12286 --- /dev/null +++ b/arch/x86/hyperv/hv_trampoline.S @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * X86 specific Hyper-V kdump/crash related code. + * + * Copyright (C) 2025, Microsoft, Inc. + * + */ +#include <linux/linkage.h> +#include <asm/alternative.h> +#include <asm/msr.h> +#include <asm/processor-flags.h> +#include <asm/nospec-branch.h> + +/* + * void noreturn hv_crash_asm32(arg1) + * arg1 == edi == 32bit PA of struct hv_crash_tramp_data + * + * The hypervisor jumps here upon devirtualization in protected mode. This + * code gets copied to a page in the low 4G ie, 32bit space so it can run + * in the protected mode. Hence we cannot use any compile/link time offsets or + * addresses. It restores long mode via temporary gdt and page tables and + * eventually jumps to kernel code entry at HV_CRASHDATA_OFFS_C_entry. + * + * PreCondition (ie, Hypervisor call back ABI): + * o CR0 is set to 0x0021: PE(prot mode) and NE are set, paging is disabled + * o CR4 is set to 0x0 + * o IA32_EFER is set to 0x901 (SCE and NXE are set) + * o EDI is set to the Arg passed to HVCALL_DISABLE_HYP_EX. + * o CS, DS, ES, FS, GS are all initialized with a base of 0 and limit 0xFFFF + * o IDTR, TR and GDTR are initialized with a base of 0 and limit of 0xFFFF + * o LDTR is initialized as invalid (limit of 0) + * o MSR PAT is power on default. + * o Other state/registers are cleared. All TLBs flushed. + */ + +#define HV_CRASHDATA_OFFS_TRAMPCR3 0x0 /* 0 */ +#define HV_CRASHDATA_OFFS_KERNCR3 0x8 /* 8 */ +#define HV_CRASHDATA_OFFS_GDTRLIMIT 0x12 /* 18 */ +#define HV_CRASHDATA_OFFS_CS_JMPTGT 0x28 /* 40 */ +#define HV_CRASHDATA_OFFS_C_entry 0x30 /* 48 */ + + .text + .code32 + +SYM_CODE_START(hv_crash_asm32) + UNWIND_HINT_UNDEFINED + ENDBR + movl $X86_CR4_PAE, %ecx + movl %ecx, %cr4 + + movl %edi, %ebx + add $HV_CRASHDATA_OFFS_TRAMPCR3, %ebx + movl %cs:(%ebx), %eax + movl %eax, %cr3 + + /* Setup EFER for long mode now */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Turn paging on using the temp 32bit trampoline page table */ + movl %cr0, %eax + orl $(X86_CR0_PG), %eax + movl %eax, %cr0 + + /* since kernel cr3 could be above 4G, we need to be in the long mode + * before we can load 64bits of the kernel cr3. 
We use a temp gdt for + * that with CS.L=1 and CS.D=0 */ + mov %edi, %eax + add $HV_CRASHDATA_OFFS_GDTRLIMIT, %eax + lgdtl %cs:(%eax) + + /* not done yet, restore CS now to switch to CS.L=1 */ + mov %edi, %eax + add $HV_CRASHDATA_OFFS_CS_JMPTGT, %eax + ljmp %cs:*(%eax) +SYM_CODE_END(hv_crash_asm32) + + /* we now run in full 64bit IA32-e long mode, CS.L=1 and CS.D=0 */ + .code64 + .balign 8 +SYM_CODE_START(hv_crash_asm64) + UNWIND_HINT_UNDEFINED + ENDBR + /* restore kernel page tables so we can jump to kernel code */ + mov %edi, %eax + add $HV_CRASHDATA_OFFS_KERNCR3, %eax + movq %cs:(%eax), %rbx + movq %rbx, %cr3 + + mov %edi, %eax + add $HV_CRASHDATA_OFFS_C_entry, %eax + movq %cs:(%eax), %rbx + ANNOTATE_RETPOLINE_SAFE + jmp *%rbx + + int $3 + +SYM_INNER_LABEL(hv_crash_asm_end, SYM_L_GLOBAL) +SYM_CODE_END(hv_crash_asm64) diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c new file mode 100644 index 000000000000..c0edaed0efb3 --- /dev/null +++ b/arch/x86/hyperv/hv_vtl.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023, Microsoft Corporation. + * + * Author: + * Saurabh Sengar <ssengar@microsoft.com> + */ + +#include <asm/apic.h> +#include <asm/boot.h> +#include <asm/desc.h> +#include <asm/fpu/api.h> +#include <asm/fpu/types.h> +#include <asm/i8259.h> +#include <asm/mshyperv.h> +#include <asm/msr.h> +#include <asm/realmode.h> +#include <asm/reboot.h> +#include <asm/smap.h> +#include <linux/export.h> +#include <../kernel/smpboot.h> +#include "../../kernel/fpu/legacy.h" + +extern struct boot_params boot_params; +static struct real_mode_header hv_vtl_real_mode_header; + +static bool __init hv_vtl_msi_ext_dest_id(void) +{ + return true; +} + +/* + * The `native_machine_emergency_restart` function from `reboot.c` writes + * to the physical address 0x472 to indicate the type of reboot for the + * firmware. We cannot have that in VSM as the memory composition might + * be more generic, and such write effectively corrupts the memory thus + * making diagnostics harder at the very least. + */ +static void __noreturn hv_vtl_emergency_restart(void) +{ + /* + * Cause a triple fault and the immediate reset. Here the code does not run + * on the top of any firmware, whereby cannot reach out to its services. + * The inifinite loop is for the improbable case that the triple fault does + * not work and have to preserve the state intact for debugging. + */ + for (;;) { + idt_invalidate(); + __asm__ __volatile__("int3"); + } +} + +/* + * The only way to restart in the VTL mode is to triple fault as the kernel runs + * as firmware. + */ +static void __noreturn hv_vtl_restart(char __maybe_unused *cmd) +{ + hv_vtl_emergency_restart(); +} + +void __init hv_vtl_init_platform(void) +{ + /* + * This function is a no-op if the VTL mode is not enabled. + * If it is, this function runs if and only the kernel boots in + * VTL2 which the x86 hv initialization path makes sure of. 
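+	 * (The VTL itself is discovered in hyperv_init() via get_vtl(), and
+	 * hv_vtl_early_init() is only invoked there for ms_hyperv.vtl > 0.)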
+ */ + pr_info("Linux runs in Hyper-V Virtual Trust Level %d\n", ms_hyperv.vtl); + + x86_platform.realmode_reserve = x86_init_noop; + x86_platform.realmode_init = x86_init_noop; + x86_init.irqs.pre_vector_init = x86_init_noop; + x86_init.timers.timer_init = x86_init_noop; + x86_init.resources.probe_roms = x86_init_noop; + + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; + + x86_platform.get_wallclock = get_rtc_noop; + x86_platform.set_wallclock = set_rtc_noop; + x86_platform.get_nmi_reason = hv_get_nmi_reason; + + x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT; + x86_platform.legacy.rtc = 0; + x86_platform.legacy.warm_reset = 0; + x86_platform.legacy.reserve_bios_regions = 0; + x86_platform.legacy.devices.pnpbios = 0; + + x86_init.hyper.msi_ext_dest_id = hv_vtl_msi_ext_dest_id; +} + +static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc) +{ + return ((u64)desc->base3 << 32) | ((u64)desc->base2 << 24) | + (desc->base1 << 16) | desc->base0; +} + +static inline u32 hv_vtl_system_desc_limit(struct ldttss_desc *desc) +{ + return ((u32)desc->limit1 << 16) | (u32)desc->limit0; +} + +typedef void (*secondary_startup_64_fn)(void*, void*); +static void hv_vtl_ap_entry(void) +{ + ((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params); +} + +static int hv_vtl_bringup_vcpu(u32 target_vp_index, int cpu, u64 eip_ignored) +{ + u64 status; + int ret = 0; + struct hv_enable_vp_vtl *input; + unsigned long irq_flags; + + struct desc_ptr gdt_ptr; + struct desc_ptr idt_ptr; + + struct ldttss_desc *tss; + struct ldttss_desc *ldt; + struct desc_struct *gdt; + + struct task_struct *idle = idle_thread_get(cpu); + u64 rsp = (unsigned long)idle->thread.sp; + + u64 rip = (u64)&hv_vtl_ap_entry; + + native_store_gdt(&gdt_ptr); + store_idt(&idt_ptr); + + gdt = (struct desc_struct *)((void *)(gdt_ptr.address)); + tss = (struct ldttss_desc *)(gdt + GDT_ENTRY_TSS); + ldt = (struct ldttss_desc *)(gdt + GDT_ENTRY_LDT); + + local_irq_save(irq_flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + memset(input, 0, sizeof(*input)); + + input->partition_id = HV_PARTITION_ID_SELF; + input->vp_index = target_vp_index; + input->target_vtl.target_vtl = HV_VTL_MGMT; + + /* + * The x86_64 Linux kernel follows the 16-bit -> 32-bit -> 64-bit + * mode transition sequence after waking up an AP with SIPI whose + * vector points to the 16-bit AP startup trampoline code. Here in + * VTL2, we can't perform that sequence as the AP has to start in + * the 64-bit mode. + * + * To make this happen, we tell the hypervisor to load a valid 64-bit + * context (most of which is just magic numbers from the CPU manual) + * so that AP jumps right to the 64-bit entry of the kernel, and the + * control registers are loaded with values that let the AP fetch the + * code and data and carry on with work it gets assigned. 
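+	 *
+	 * The attribute "magic numbers" below pack the usual descriptor bits
+	 * into 16-bit fields; decoding them for illustration: 0xa09b is P=1,
+	 * DPL=0, S=1, type=0xb (execute/read, accessed) with L=1 and G=1,
+	 * i.e. a 64-bit code segment, while 0xc093 is P=1, S=1, type=0x3
+	 * (read/write, accessed) with D/B=1 and G=1, i.e. a flat writable
+	 * data segment.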
+ */ + + input->vp_context.rip = rip; + input->vp_context.rsp = rsp; + input->vp_context.rflags = 0x0000000000000002; + input->vp_context.efer = native_rdmsrq(MSR_EFER); + input->vp_context.cr0 = native_read_cr0(); + input->vp_context.cr3 = __native_read_cr3(); + input->vp_context.cr4 = native_read_cr4(); + input->vp_context.msr_cr_pat = native_rdmsrq(MSR_IA32_CR_PAT); + input->vp_context.idtr.limit = idt_ptr.size; + input->vp_context.idtr.base = idt_ptr.address; + input->vp_context.gdtr.limit = gdt_ptr.size; + input->vp_context.gdtr.base = gdt_ptr.address; + + /* Non-system desc (64bit), long, code, present */ + input->vp_context.cs.selector = __KERNEL_CS; + input->vp_context.cs.base = 0; + input->vp_context.cs.limit = 0xffffffff; + input->vp_context.cs.attributes = 0xa09b; + /* Non-system desc (64bit), data, present, granularity, default */ + input->vp_context.ss.selector = __KERNEL_DS; + input->vp_context.ss.base = 0; + input->vp_context.ss.limit = 0xffffffff; + input->vp_context.ss.attributes = 0xc093; + + /* System desc (128bit), present, LDT */ + input->vp_context.ldtr.selector = GDT_ENTRY_LDT * 8; + input->vp_context.ldtr.base = hv_vtl_system_desc_base(ldt); + input->vp_context.ldtr.limit = hv_vtl_system_desc_limit(ldt); + input->vp_context.ldtr.attributes = 0x82; + + /* System desc (128bit), present, TSS, 0x8b - busy, 0x89 -- default */ + input->vp_context.tr.selector = GDT_ENTRY_TSS * 8; + input->vp_context.tr.base = hv_vtl_system_desc_base(tss); + input->vp_context.tr.limit = hv_vtl_system_desc_limit(tss); + input->vp_context.tr.attributes = 0x8b; + + status = hv_do_hypercall(HVCALL_ENABLE_VP_VTL, input, NULL); + + if (!hv_result_success(status) && + hv_result(status) != HV_STATUS_VTL_ALREADY_ENABLED) { + pr_err("HVCALL_ENABLE_VP_VTL failed for VP : %d ! [Err: %#llx\n]", + target_vp_index, status); + ret = -EINVAL; + goto free_lock; + } + + status = hv_do_hypercall(HVCALL_START_VP, input, NULL); + + if (!hv_result_success(status)) { + pr_err("HVCALL_START_VP failed for VP : %d ! [Err: %#llx]\n", + target_vp_index, status); + ret = -EINVAL; + } + +free_lock: + local_irq_restore(irq_flags); + + return ret; +} + +static int hv_vtl_wakeup_secondary_cpu(u32 apicid, unsigned long start_eip, unsigned int cpu) +{ + int vp_index; + + pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid); + vp_index = hv_apicid_to_vp_index(apicid); + + if (vp_index < 0) { + pr_err("Couldn't find CPU with APIC ID %d\n", apicid); + return -EINVAL; + } + if (vp_index > ms_hyperv.max_vp_index) { + pr_err("Invalid CPU id %d for APIC ID %d\n", vp_index, apicid); + return -EINVAL; + } + + return hv_vtl_bringup_vcpu(vp_index, cpu, start_eip); +} + +int __init hv_vtl_early_init(void) +{ + machine_ops.emergency_restart = hv_vtl_emergency_restart; + machine_ops.restart = hv_vtl_restart; + + /* + * `boot_cpu_has` returns the runtime feature support, + * and here is the earliest it can be used. 
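+	 *
+	 * One reason XSAVE cannot be supported here: the VTL return path
+	 * (mshv_vtl_return_call() below) moves the guest FPU state with the
+	 * legacy fxsave/fxrstor pair, which does not carry extended state.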
+ */ + if (cpu_feature_enabled(X86_FEATURE_XSAVE)) + panic("XSAVE has to be disabled as it is not supported by this module.\n" + "Please add 'noxsave' to the kernel command line.\n"); + + real_mode_header = &hv_vtl_real_mode_header; + apic_update_callback(wakeup_secondary_cpu_64, hv_vtl_wakeup_secondary_cpu); + + return 0; +} + +DEFINE_STATIC_CALL_NULL(__mshv_vtl_return_hypercall, void (*)(void)); + +void mshv_vtl_return_call_init(u64 vtl_return_offset) +{ + static_call_update(__mshv_vtl_return_hypercall, + (void *)((u8 *)hv_hypercall_pg + vtl_return_offset)); +} +EXPORT_SYMBOL(mshv_vtl_return_call_init); + +void mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) +{ + struct hv_vp_assist_page *hvp; + + hvp = hv_vp_assist_page[smp_processor_id()]; + hvp->vtl_ret_x64rax = vtl0->rax; + hvp->vtl_ret_x64rcx = vtl0->rcx; + + kernel_fpu_begin_mask(0); + fxrstor(&vtl0->fx_state); + __mshv_vtl_return_call(vtl0); + fxsave(&vtl0->fx_state); + kernel_fpu_end(); +} +EXPORT_SYMBOL(mshv_vtl_return_call); diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c new file mode 100644 index 000000000000..c3ba12b1bc07 --- /dev/null +++ b/arch/x86/hyperv/irqdomain.c @@ -0,0 +1,418 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Irqdomain for Linux to run as the root partition on Microsoft Hypervisor. + * + * Authors: + * Sunil Muthuswamy <sunilmut@microsoft.com> + * Wei Liu <wei.liu@kernel.org> + */ + +#include <linux/pci.h> +#include <linux/irq.h> +#include <linux/export.h> +#include <linux/irqchip/irq-msi-lib.h> +#include <asm/mshyperv.h> + +static int hv_map_interrupt(union hv_device_id device_id, bool level, + int cpu, int vector, struct hv_interrupt_entry *entry) +{ + struct hv_input_map_device_interrupt *input; + struct hv_output_map_device_interrupt *output; + struct hv_device_interrupt_descriptor *intr_desc; + unsigned long flags; + u64 status; + int nr_bank, var_size; + + local_irq_save(flags); + + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = *this_cpu_ptr(hyperv_pcpu_output_arg); + + intr_desc = &input->interrupt_descriptor; + memset(input, 0, sizeof(*input)); + input->partition_id = hv_current_partition_id; + input->device_id = device_id.as_uint64; + intr_desc->interrupt_type = HV_X64_INTERRUPT_TYPE_FIXED; + intr_desc->vector_count = 1; + intr_desc->target.vector = vector; + + if (level) + intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_LEVEL; + else + intr_desc->trigger_mode = HV_INTERRUPT_TRIGGER_MODE_EDGE; + + intr_desc->target.vp_set.valid_bank_mask = 0; + intr_desc->target.vp_set.format = HV_GENERIC_SET_SPARSE_4K; + nr_bank = cpumask_to_vpset(&(intr_desc->target.vp_set), cpumask_of(cpu)); + if (nr_bank < 0) { + local_irq_restore(flags); + pr_err("%s: unable to generate VP set\n", __func__); + return -EINVAL; + } + intr_desc->target.flags = HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET; + + /* + * var-sized hypercall, var-size starts after vp_mask (thus + * vp_set.format does not count, but vp_set.valid_bank_mask + * does). 
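+	 *
+	 * Worked example: if the target CPU falls into a single VP bank,
+	 * cpumask_to_vpset() returns nr_bank = 1 and the variable part is
+	 * the valid_bank_mask qword plus one bank qword, so var_size = 2.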
+ */ + var_size = nr_bank + 1; + + status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_INTERRUPT, 0, var_size, + input, output); + *entry = output->interrupt_entry; + + local_irq_restore(flags); + + if (!hv_result_success(status)) + hv_status_err(status, "\n"); + + return hv_result_to_errno(status); +} + +static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry) +{ + unsigned long flags; + struct hv_input_unmap_device_interrupt *input; + struct hv_interrupt_entry *intr_entry; + u64 status; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + memset(input, 0, sizeof(*input)); + intr_entry = &input->interrupt_entry; + input->partition_id = hv_current_partition_id; + input->device_id = id; + *intr_entry = *old_entry; + + status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL); + local_irq_restore(flags); + + if (!hv_result_success(status)) + hv_status_err(status, "\n"); + + return hv_result_to_errno(status); +} + +#ifdef CONFIG_PCI_MSI +struct rid_data { + struct pci_dev *bridge; + u32 rid; +}; + +static int get_rid_cb(struct pci_dev *pdev, u16 alias, void *data) +{ + struct rid_data *rd = data; + u8 bus = PCI_BUS_NUM(rd->rid); + + if (pdev->bus->number != bus || PCI_BUS_NUM(alias) != bus) { + rd->bridge = pdev; + rd->rid = alias; + } + + return 0; +} + +static union hv_device_id hv_build_pci_dev_id(struct pci_dev *dev) +{ + union hv_device_id dev_id; + struct rid_data data = { + .bridge = NULL, + .rid = PCI_DEVID(dev->bus->number, dev->devfn) + }; + + pci_for_each_dma_alias(dev, get_rid_cb, &data); + + dev_id.as_uint64 = 0; + dev_id.device_type = HV_DEVICE_TYPE_PCI; + dev_id.pci.segment = pci_domain_nr(dev->bus); + + dev_id.pci.bdf.bus = PCI_BUS_NUM(data.rid); + dev_id.pci.bdf.device = PCI_SLOT(data.rid); + dev_id.pci.bdf.function = PCI_FUNC(data.rid); + dev_id.pci.source_shadow = HV_SOURCE_SHADOW_NONE; + + if (data.bridge) { + int pos; + + /* + * Microsoft Hypervisor requires a bus range when the bridge is + * running in PCI-X mode. + * + * To distinguish conventional vs PCI-X bridge, we can check + * the bridge's PCI-X Secondary Status Register, Secondary Bus + * Mode and Frequency bits. See PCI Express to PCI/PCI-X Bridge + * Specification Revision 1.0 5.2.2.1.3. + * + * Value zero means it is in conventional mode, otherwise it is + * in PCI-X mode. + */ + + pos = pci_find_capability(data.bridge, PCI_CAP_ID_PCIX); + if (pos) { + u16 status; + + pci_read_config_word(data.bridge, pos + + PCI_X_BRIDGE_SSTATUS, &status); + + if (status & PCI_X_SSTATUS_FREQ) { + /* Non-zero, PCI-X mode */ + u8 sec_bus, sub_bus; + + dev_id.pci.source_shadow = HV_SOURCE_SHADOW_BRIDGE_BUS_RANGE; + + pci_read_config_byte(data.bridge, PCI_SECONDARY_BUS, &sec_bus); + dev_id.pci.shadow_bus_range.secondary_bus = sec_bus; + pci_read_config_byte(data.bridge, PCI_SUBORDINATE_BUS, &sub_bus); + dev_id.pci.shadow_bus_range.subordinate_bus = sub_bus; + } + } + } + + return dev_id; +} + +/** + * hv_map_msi_interrupt() - "Map" the MSI IRQ in the hypervisor. + * @data: Describes the IRQ + * @out_entry: Hypervisor (MSI) interrupt entry (can be NULL) + * + * Map the IRQ in the hypervisor by issuing a MAP_DEVICE_INTERRUPT hypercall. 
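+ * The target processor is taken from the IRQ's effective affinity mask and
+ * the vector from its irq_cfg; callers that do not care about the resulting
+ * interrupt entry may pass a NULL @out_entry.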
+ * + * Return: 0 on success, -errno on failure + */ +int hv_map_msi_interrupt(struct irq_data *data, + struct hv_interrupt_entry *out_entry) +{ + struct irq_cfg *cfg = irqd_cfg(data); + struct hv_interrupt_entry dummy; + union hv_device_id device_id; + struct msi_desc *msidesc; + struct pci_dev *dev; + int cpu; + + msidesc = irq_data_get_msi_desc(data); + dev = msi_desc_to_pci_dev(msidesc); + device_id = hv_build_pci_dev_id(dev); + cpu = cpumask_first(irq_data_get_effective_affinity_mask(data)); + + return hv_map_interrupt(device_id, false, cpu, cfg->vector, + out_entry ? out_entry : &dummy); +} +EXPORT_SYMBOL_GPL(hv_map_msi_interrupt); + +static inline void entry_to_msi_msg(struct hv_interrupt_entry *entry, struct msi_msg *msg) +{ + /* High address is always 0 */ + msg->address_hi = 0; + msg->address_lo = entry->msi_entry.address.as_uint32; + msg->data = entry->msi_entry.data.as_uint32; +} + +static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry); +static void hv_irq_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct hv_interrupt_entry *stored_entry; + struct irq_cfg *cfg = irqd_cfg(data); + struct msi_desc *msidesc; + struct pci_dev *dev; + int ret; + + msidesc = irq_data_get_msi_desc(data); + dev = msi_desc_to_pci_dev(msidesc); + + if (!cfg) { + pr_debug("%s: cfg is NULL", __func__); + return; + } + + if (data->chip_data) { + /* + * This interrupt is already mapped. Let's unmap first. + * + * We don't use retarget interrupt hypercalls here because + * Microsoft Hypervisor doesn't allow root to change the vector + * or specify VPs outside of the set that is initially used + * during mapping. + */ + stored_entry = data->chip_data; + data->chip_data = NULL; + + ret = hv_unmap_msi_interrupt(dev, stored_entry); + + kfree(stored_entry); + + if (ret) + return; + } + + stored_entry = kzalloc(sizeof(*stored_entry), GFP_ATOMIC); + if (!stored_entry) { + pr_debug("%s: failed to allocate chip data\n", __func__); + return; + } + + ret = hv_map_msi_interrupt(data, stored_entry); + if (ret) { + kfree(stored_entry); + return; + } + + data->chip_data = stored_entry; + entry_to_msi_msg(data->chip_data, msg); + + return; +} + +static int hv_unmap_msi_interrupt(struct pci_dev *dev, struct hv_interrupt_entry *old_entry) +{ + return hv_unmap_interrupt(hv_build_pci_dev_id(dev).as_uint64, old_entry); +} + +static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd) +{ + struct hv_interrupt_entry old_entry; + struct msi_msg msg; + + if (!irqd->chip_data) { + pr_debug("%s: no chip data\n!", __func__); + return; + } + + old_entry = *(struct hv_interrupt_entry *)irqd->chip_data; + entry_to_msi_msg(&old_entry, &msg); + + kfree(irqd->chip_data); + irqd->chip_data = NULL; + + (void)hv_unmap_msi_interrupt(dev, &old_entry); +} + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. 
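+ * Affinity changes funnel through irq_chip_set_affinity_parent() plus a
+ * recompose of the MSI message; as noted in hv_irq_compose_msi_msg() above,
+ * that recompose is a full unmap/remap cycle because the hypervisor does
+ * not let the root partition retarget an existing mapping.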
+ */ +static struct irq_chip hv_pci_msi_controller = { + .name = "HV-PCI-MSI", + .irq_ack = irq_chip_ack_parent, + .irq_compose_msi_msg = hv_irq_compose_msi_msg, + .irq_set_affinity = irq_chip_set_affinity_parent, +}; + +static bool hv_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) +{ + struct irq_chip *chip = info->chip; + + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) + return false; + + chip->flags |= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED; + + info->ops->msi_prepare = pci_msi_prepare; + + return true; +} + +#define HV_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX) +#define HV_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS) + +static struct msi_parent_ops hv_msi_parent_ops = { + .supported_flags = HV_MSI_FLAGS_SUPPORTED, + .required_flags = HV_MSI_FLAGS_REQUIRED, + .bus_select_token = DOMAIN_BUS_NEXUS, + .bus_select_mask = MATCH_PCI_MSI, + .chip_flags = MSI_CHIP_FLAG_SET_ACK, + .prefix = "HV-", + .init_dev_msi_info = hv_init_dev_msi_info, +}; + +static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs, + void *arg) +{ + /* + * TODO: The allocation bits of hv_irq_compose_msi_msg(), i.e. everything except + * entry_to_msi_msg() should be in here. + */ + + int ret; + + ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, arg); + if (ret) + return ret; + + for (int i = 0; i < nr_irqs; ++i) { + irq_domain_set_info(d, virq + i, 0, &hv_pci_msi_controller, NULL, + handle_edge_irq, NULL, "edge"); + } + return 0; +} + +static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs) +{ + for (int i = 0; i < nr_irqs; ++i) { + struct irq_data *irqd = irq_domain_get_irq_data(d, virq); + struct msi_desc *desc; + + desc = irq_data_get_msi_desc(irqd); + if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev))) + continue; + + hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd); + } + irq_domain_free_irqs_top(d, virq, nr_irqs); +} + +static const struct irq_domain_ops hv_msi_domain_ops = { + .select = msi_lib_irq_domain_select, + .alloc = hv_msi_domain_alloc, + .free = hv_msi_domain_free, +}; + +struct irq_domain * __init hv_create_pci_msi_domain(void) +{ + struct irq_domain *d = NULL; + + struct irq_domain_info info = { + .fwnode = irq_domain_alloc_named_fwnode("HV-PCI-MSI"), + .ops = &hv_msi_domain_ops, + .parent = x86_vector_domain, + }; + + if (info.fwnode) + d = msi_create_parent_irq_domain(&info, &hv_msi_parent_ops); + + /* No point in going further if we can't get an irq domain */ + BUG_ON(!d); + + return d; +} + +#endif /* CONFIG_PCI_MSI */ + +int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry) +{ + union hv_device_id device_id; + + device_id.as_uint64 = 0; + device_id.device_type = HV_DEVICE_TYPE_IOAPIC; + device_id.ioapic.ioapic_id = (u8)ioapic_id; + + return hv_unmap_interrupt(device_id.as_uint64, entry); +} +EXPORT_SYMBOL_GPL(hv_unmap_ioapic_interrupt); + +int hv_map_ioapic_interrupt(int ioapic_id, bool level, int cpu, int vector, + struct hv_interrupt_entry *entry) +{ + union hv_device_id device_id; + + device_id.as_uint64 = 0; + device_id.device_type = HV_DEVICE_TYPE_IOAPIC; + device_id.ioapic.ioapic_id = (u8)ioapic_id; + + return hv_map_interrupt(device_id, level, cpu, vector, entry); +} +EXPORT_SYMBOL_GPL(hv_map_ioapic_interrupt); diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c new file mode 100644 index 000000000000..651771534cae 
--- /dev/null +++ b/arch/x86/hyperv/ivm.c @@ -0,0 +1,945 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hyper-V Isolation VM interface with paravisor and hypervisor + * + * Author: + * Tianyu Lan <Tianyu.Lan@microsoft.com> + */ + +#include <linux/bitfield.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/export.h> +#include <asm/svm.h> +#include <asm/sev.h> +#include <asm/io.h> +#include <asm/coco.h> +#include <asm/mem_encrypt.h> +#include <asm/set_memory.h> +#include <asm/mshyperv.h> +#include <asm/hypervisor.h> +#include <asm/mtrr.h> +#include <asm/io_apic.h> +#include <asm/realmode.h> +#include <asm/e820/api.h> +#include <asm/desc.h> +#include <asm/msr.h> +#include <uapi/asm/vmx.h> + +#ifdef CONFIG_AMD_MEM_ENCRYPT + +#define GHCB_USAGE_HYPERV_CALL 1 + +union hv_ghcb { + struct ghcb ghcb; + struct { + u64 hypercalldata[509]; + u64 outputgpa; + union { + union { + struct { + u32 callcode : 16; + u32 isfast : 1; + u32 reserved1 : 14; + u32 isnested : 1; + u32 countofelements : 12; + u32 reserved2 : 4; + u32 repstartindex : 12; + u32 reserved3 : 4; + }; + u64 asuint64; + } hypercallinput; + union { + struct { + u16 callstatus; + u16 reserved1; + u32 elementsprocessed : 12; + u32 reserved2 : 20; + }; + u64 asunit64; + } hypercalloutput; + }; + u64 reserved2; + } hypercall; +} __packed __aligned(HV_HYP_PAGE_SIZE); + +/* Only used in an SNP VM with the paravisor */ +static u16 hv_ghcb_version __ro_after_init; + +/* Functions only used in an SNP VM with the paravisor go here. */ +u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size) +{ + union hv_ghcb *hv_ghcb; + void **ghcb_base; + unsigned long flags; + u64 status; + + if (!hv_ghcb_pg) + return -EFAULT; + + WARN_ON(in_nmi()); + + local_irq_save(flags); + ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg); + hv_ghcb = (union hv_ghcb *)*ghcb_base; + if (!hv_ghcb) { + local_irq_restore(flags); + return -EFAULT; + } + + hv_ghcb->ghcb.protocol_version = GHCB_PROTOCOL_MAX; + hv_ghcb->ghcb.ghcb_usage = GHCB_USAGE_HYPERV_CALL; + + hv_ghcb->hypercall.outputgpa = (u64)output; + hv_ghcb->hypercall.hypercallinput.asuint64 = 0; + hv_ghcb->hypercall.hypercallinput.callcode = control; + + if (input_size) + memcpy(hv_ghcb->hypercall.hypercalldata, input, input_size); + + VMGEXIT(); + + hv_ghcb->ghcb.ghcb_usage = 0xffffffff; + memset(hv_ghcb->ghcb.save.valid_bitmap, 0, + sizeof(hv_ghcb->ghcb.save.valid_bitmap)); + + status = hv_ghcb->hypercall.hypercalloutput.callstatus; + + local_irq_restore(flags); + + return status; +} + +static inline u64 rd_ghcb_msr(void) +{ + return native_rdmsrq(MSR_AMD64_SEV_ES_GHCB); +} + +static inline void wr_ghcb_msr(u64 val) +{ + native_wrmsrq(MSR_AMD64_SEV_ES_GHCB, val); +} + +static enum es_result hv_ghcb_hv_call(struct ghcb *ghcb, u64 exit_code, + u64 exit_info_1, u64 exit_info_2) +{ + /* Fill in protocol and format specifiers */ + ghcb->protocol_version = hv_ghcb_version; + ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; + + ghcb_set_sw_exit_code(ghcb, exit_code); + ghcb_set_sw_exit_info_1(ghcb, exit_info_1); + ghcb_set_sw_exit_info_2(ghcb, exit_info_2); + + VMGEXIT(); + + if (ghcb->save.sw_exit_info_1 & GENMASK_ULL(31, 0)) + return ES_VMM_ERROR; + else + return ES_OK; +} + +void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason) +{ + u64 val = GHCB_MSR_TERM_REQ; + + /* Tell the hypervisor what went wrong. 
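+	 * Per the GHCB MSR protocol, the reason-set goes in bits 15:12 and
+	 * the reason code in bits 23:16, alongside the termination request
+	 * command in the low bits; GHCB_SEV_TERM_REASON() packs exactly that.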
*/ + val |= GHCB_SEV_TERM_REASON(set, reason); + + /* Request Guest Termination from Hypervisor */ + wr_ghcb_msr(val); + VMGEXIT(); + + while (true) + asm volatile("hlt\n" : : : "memory"); +} + +bool hv_ghcb_negotiate_protocol(void) +{ + u64 ghcb_gpa; + u64 val; + + /* Save ghcb page gpa. */ + ghcb_gpa = rd_ghcb_msr(); + + /* Do the GHCB protocol version negotiation */ + wr_ghcb_msr(GHCB_MSR_SEV_INFO_REQ); + VMGEXIT(); + val = rd_ghcb_msr(); + + if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) + return false; + + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) + return false; + + hv_ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), + GHCB_PROTOCOL_MAX); + + /* Write ghcb page back after negotiating protocol. */ + wr_ghcb_msr(ghcb_gpa); + VMGEXIT(); + + return true; +} + +static void hv_ghcb_msr_write(u64 msr, u64 value) +{ + union hv_ghcb *hv_ghcb; + void **ghcb_base; + unsigned long flags; + + if (!hv_ghcb_pg) + return; + + WARN_ON(in_nmi()); + + local_irq_save(flags); + ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg); + hv_ghcb = (union hv_ghcb *)*ghcb_base; + if (!hv_ghcb) { + local_irq_restore(flags); + return; + } + + ghcb_set_rcx(&hv_ghcb->ghcb, msr); + ghcb_set_rax(&hv_ghcb->ghcb, lower_32_bits(value)); + ghcb_set_rdx(&hv_ghcb->ghcb, upper_32_bits(value)); + + if (hv_ghcb_hv_call(&hv_ghcb->ghcb, SVM_EXIT_MSR, 1, 0)) + pr_warn("Fail to write msr via ghcb %llx.\n", msr); + + local_irq_restore(flags); +} + +static void hv_ghcb_msr_read(u64 msr, u64 *value) +{ + union hv_ghcb *hv_ghcb; + void **ghcb_base; + unsigned long flags; + + /* Check size of union hv_ghcb here. */ + BUILD_BUG_ON(sizeof(union hv_ghcb) != HV_HYP_PAGE_SIZE); + + if (!hv_ghcb_pg) + return; + + WARN_ON(in_nmi()); + + local_irq_save(flags); + ghcb_base = (void **)this_cpu_ptr(hv_ghcb_pg); + hv_ghcb = (union hv_ghcb *)*ghcb_base; + if (!hv_ghcb) { + local_irq_restore(flags); + return; + } + + ghcb_set_rcx(&hv_ghcb->ghcb, msr); + if (hv_ghcb_hv_call(&hv_ghcb->ghcb, SVM_EXIT_MSR, 0, 0)) + pr_warn("Fail to read msr via ghcb %llx.\n", msr); + else + *value = (u64)lower_32_bits(hv_ghcb->ghcb.save.rax) + | ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32); + local_irq_restore(flags); +} + +/* Only used in a fully enlightened SNP VM, i.e. without the paravisor */ +static u8 ap_start_input_arg[PAGE_SIZE] __bss_decrypted __aligned(PAGE_SIZE); +static u8 ap_start_stack[PAGE_SIZE] __aligned(PAGE_SIZE); +static DEFINE_PER_CPU(struct sev_es_save_area *, hv_sev_vmsa); + +/* Functions only used in an SNP VM without the paravisor go here. */ + +#define hv_populate_vmcb_seg(seg, gdtr_base) \ +do { \ + if (seg.selector) { \ + seg.base = 0; \ + seg.limit = HV_AP_SEGMENT_LIMIT; \ + seg.attrib = *(u16 *)(gdtr_base + seg.selector + 5); \ + seg.attrib = (seg.attrib & 0xFF) | ((seg.attrib >> 4) & 0xF00); \ + } \ +} while (0) \ + +static int snp_set_vmsa(void *va, bool vmsa) +{ + u64 attrs; + + /* + * Running at VMPL0 allows the kernel to change the VMSA bit for a page + * using the RMPADJUST instruction. However, for the instruction to + * succeed it must target the permissions of a lesser privileged + * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST + * instruction in the AMD64 APM Volume 3). 
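+	 *
+	 * Concretely, the low byte of @attrs selects the target VMPL (1,
+	 * i.e. VMPL1) and RMPADJUST_VMSA_PAGE_BIT (bit 16) marks the page as
+	 * a VMSA, which is all the two assignments below encode.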
+ */ + attrs = 1; + if (vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) +{ + int err; + + err = snp_set_vmsa(vmsa, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + +int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu) +{ + struct sev_es_save_area *vmsa = (struct sev_es_save_area *) + __get_free_page(GFP_KERNEL | __GFP_ZERO); + struct sev_es_save_area *cur_vmsa; + struct desc_ptr gdtr; + u64 ret, retry = 5; + struct hv_enable_vp_vtl *start_vp_input; + unsigned long flags; + int vp_index; + + if (!vmsa) + return -ENOMEM; + + /* Find the Hyper-V VP index which might be not the same as APIC ID */ + vp_index = hv_apicid_to_vp_index(apic_id); + if (vp_index < 0 || vp_index > ms_hyperv.max_vp_index) + return -EINVAL; + + native_store_gdt(&gdtr); + + vmsa->gdtr.base = gdtr.address; + vmsa->gdtr.limit = gdtr.size; + + asm volatile("movl %%es, %%eax;" : "=a" (vmsa->es.selector)); + hv_populate_vmcb_seg(vmsa->es, vmsa->gdtr.base); + + asm volatile("movl %%cs, %%eax;" : "=a" (vmsa->cs.selector)); + hv_populate_vmcb_seg(vmsa->cs, vmsa->gdtr.base); + + asm volatile("movl %%ss, %%eax;" : "=a" (vmsa->ss.selector)); + hv_populate_vmcb_seg(vmsa->ss, vmsa->gdtr.base); + + asm volatile("movl %%ds, %%eax;" : "=a" (vmsa->ds.selector)); + hv_populate_vmcb_seg(vmsa->ds, vmsa->gdtr.base); + + vmsa->efer = native_read_msr(MSR_EFER); + + vmsa->cr4 = native_read_cr4(); + vmsa->cr3 = __native_read_cr3(); + vmsa->cr0 = native_read_cr0(); + + vmsa->xcr0 = 1; + vmsa->g_pat = HV_AP_INIT_GPAT_DEFAULT; + vmsa->rip = (u64)secondary_startup_64_no_verify; + vmsa->rsp = (u64)&ap_start_stack[PAGE_SIZE]; + + /* + * Set the SNP-specific fields for this VMSA: + * VMPL level + * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) + */ + vmsa->vmpl = 0; + vmsa->sev_features = sev_status >> 2; + + ret = snp_set_vmsa(vmsa, true); + if (ret) { + pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret); + free_page((u64)vmsa); + return ret; + } + + local_irq_save(flags); + start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg; + memset(start_vp_input, 0, sizeof(*start_vp_input)); + start_vp_input->partition_id = -1; + start_vp_input->vp_index = vp_index; + start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl; + *(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1; + + do { + ret = hv_do_hypercall(HVCALL_START_VP, + start_vp_input, NULL); + } while (hv_result(ret) == HV_STATUS_TIME_OUT && retry--); + + local_irq_restore(flags); + + if (!hv_result_success(ret)) { + pr_err("HvCallStartVirtualProcessor failed: %llx\n", ret); + snp_cleanup_vmsa(vmsa); + vmsa = NULL; + } + + cur_vmsa = per_cpu(hv_sev_vmsa, cpu); + /* Free up any previous VMSA page */ + if (cur_vmsa) + snp_cleanup_vmsa(cur_vmsa); + + /* Record the current VMSA page */ + per_cpu(hv_sev_vmsa, cpu) = vmsa; + + return ret; +} + +u64 hv_snp_hypercall(u64 control, u64 param1, u64 param2) +{ + u64 hv_status; + + register u64 __r8 asm("r8") = param2; + asm volatile("vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (param1), "+r" (__r8) + : : "cc", "memory", "r9", "r10", "r11"); + + return hv_status; +} + +#else +static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} +static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} +u64 hv_snp_hypercall(u64 control, u64 param1, u64 param2) { return U64_MAX; } +#endif /* 
CONFIG_AMD_MEM_ENCRYPT */ + +#ifdef CONFIG_INTEL_TDX_GUEST +static void hv_tdx_msr_write(u64 msr, u64 val) +{ + struct tdx_module_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_MSR_WRITE, + .r12 = msr, + .r13 = val, + }; + + u64 ret = __tdx_hypercall(&args); + + WARN_ONCE(ret, "Failed to emulate MSR write: %lld\n", ret); +} + +static void hv_tdx_msr_read(u64 msr, u64 *val) +{ + struct tdx_module_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_MSR_READ, + .r12 = msr, + }; + + u64 ret = __tdx_hypercall(&args); + + if (WARN_ONCE(ret, "Failed to emulate MSR read: %lld\n", ret)) + *val = 0; + else + *val = args.r11; +} + +u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2) +{ + struct tdx_module_args args = { }; + + args.r10 = control; + args.rdx = param1; + args.r8 = param2; + + (void)__tdx_hypercall(&args); + + return args.r11; +} + +#else +static inline void hv_tdx_msr_write(u64 msr, u64 value) {} +static inline void hv_tdx_msr_read(u64 msr, u64 *value) {} +u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2) { return U64_MAX; } +#endif /* CONFIG_INTEL_TDX_GUEST */ + +#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) +void hv_ivm_msr_write(u64 msr, u64 value) +{ + if (!ms_hyperv.paravisor_present) + return; + + if (hv_isolation_type_tdx()) + hv_tdx_msr_write(msr, value); + else if (hv_isolation_type_snp()) + hv_ghcb_msr_write(msr, value); +} + +void hv_ivm_msr_read(u64 msr, u64 *value) +{ + if (!ms_hyperv.paravisor_present) + return; + + if (hv_isolation_type_tdx()) + hv_tdx_msr_read(msr, value); + else if (hv_isolation_type_snp()) + hv_ghcb_msr_read(msr, value); +} + +/* + * Keep track of the PFN regions which were shared with the host. The access + * must be revoked upon kexec/kdump (see hv_ivm_clear_host_access()). + */ +struct hv_enc_pfn_region { + struct list_head list; + u64 pfn; + int count; +}; + +static LIST_HEAD(hv_list_enc); +static DEFINE_RAW_SPINLOCK(hv_list_enc_lock); + +static int hv_list_enc_add(const u64 *pfn_list, int count) +{ + struct hv_enc_pfn_region *ent; + unsigned long flags; + u64 pfn; + int i; + + for (i = 0; i < count; i++) { + pfn = pfn_list[i]; + + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); + /* Check if the PFN already exists in some region first */ + list_for_each_entry(ent, &hv_list_enc, list) { + if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 >= pfn)) + /* Nothing to do - pfn is already in the list */ + goto unlock_done; + } + + /* + * Check if the PFN is adjacent to an existing region. Growing + * a region can make it adjacent to another one but merging is + * not (yet) implemented for simplicity. A PFN cannot be added + * to two regions to keep the logic in hv_list_enc_remove() + * correct. 
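+		 *
+		 * Worked example: sharing PFNs 10, 11 and 12 one at a time
+		 * ends up as a single region {.pfn = 10, .count = 3};
+		 * sharing PFN 9 afterwards grows it down to
+		 * {.pfn = 9, .count = 4}.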
+ */ + list_for_each_entry(ent, &hv_list_enc, list) { + if (ent->pfn + ent->count == pfn) { + /* Grow existing region up */ + ent->count++; + goto unlock_done; + } else if (pfn + 1 == ent->pfn) { + /* Grow existing region down */ + ent->pfn--; + ent->count++; + goto unlock_done; + } + } + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + + /* No adjacent region found -- create a new one */ + ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL); + if (!ent) + return -ENOMEM; + + ent->pfn = pfn; + ent->count = 1; + + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); + list_add(&ent->list, &hv_list_enc); + +unlock_done: + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + } + + return 0; +} + +static int hv_list_enc_remove(const u64 *pfn_list, int count) +{ + struct hv_enc_pfn_region *ent, *t; + struct hv_enc_pfn_region new_region; + unsigned long flags; + u64 pfn; + int i; + + for (i = 0; i < count; i++) { + pfn = pfn_list[i]; + + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); + list_for_each_entry_safe(ent, t, &hv_list_enc, list) { + if (pfn == ent->pfn + ent->count - 1) { + /* Removing tail pfn */ + ent->count--; + if (!ent->count) { + list_del(&ent->list); + kfree(ent); + } + goto unlock_done; + } else if (pfn == ent->pfn) { + /* Removing head pfn */ + ent->count--; + ent->pfn++; + if (!ent->count) { + list_del(&ent->list); + kfree(ent); + } + goto unlock_done; + } else if (pfn > ent->pfn && pfn < ent->pfn + ent->count - 1) { + /* + * Removing a pfn in the middle. Cut off the tail + * of the existing region and create a template for + * the new one. + */ + new_region.pfn = pfn + 1; + new_region.count = ent->count - (pfn - ent->pfn + 1); + ent->count = pfn - ent->pfn; + goto unlock_split; + } + + } +unlock_done: + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + continue; + +unlock_split: + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + + ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL); + if (!ent) + return -ENOMEM; + + ent->pfn = new_region.pfn; + ent->count = new_region.count; + + raw_spin_lock_irqsave(&hv_list_enc_lock, flags); + list_add(&ent->list, &hv_list_enc); + raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags); + } + + return 0; +} + +/* Stop new private<->shared conversions */ +static void hv_vtom_kexec_begin(void) +{ + if (!IS_ENABLED(CONFIG_KEXEC_CORE)) + return; + + /* + * Crash kernel reaches here with interrupts disabled: can't wait for + * conversions to finish. + * + * If race happened, just report and proceed. 
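+	 * Whatever was still shared at this point remains recorded in
+	 * hv_list_enc, so hv_vtom_kexec_finish() below can still revoke the
+	 * host's access to those pages.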
+ */ + if (!set_memory_enc_stop_conversion()) + pr_warn("Failed to stop shared<->private conversions\n"); +} + +static void hv_vtom_kexec_finish(void) +{ + struct hv_gpa_range_for_visibility *input; + struct hv_enc_pfn_region *ent; + unsigned long flags; + u64 hv_status; + int cur, i; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + if (unlikely(!input)) + goto out; + + list_for_each_entry(ent, &hv_list_enc, list) { + for (i = 0, cur = 0; i < ent->count; i++) { + input->gpa_page_list[cur] = ent->pfn + i; + cur++; + + if (cur == HV_MAX_MODIFY_GPA_REP_COUNT || i == ent->count - 1) { + input->partition_id = HV_PARTITION_ID_SELF; + input->host_visibility = VMBUS_PAGE_NOT_VISIBLE; + input->reserved0 = 0; + input->reserved1 = 0; + hv_status = hv_do_rep_hypercall( + HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, + cur, 0, input, NULL); + WARN_ON_ONCE(!hv_result_success(hv_status)); + cur = 0; + } + } + + } + +out: + local_irq_restore(flags); +} + +/* + * hv_mark_gpa_visibility - Set pages visible to host via hvcall. + * + * In Isolation VM, all guest memory is encrypted from host and guest + * needs to set memory visible to host via hvcall before sharing memory + * with host. + */ +static int hv_mark_gpa_visibility(u16 count, const u64 pfn[], + enum hv_mem_host_visibility visibility) +{ + struct hv_gpa_range_for_visibility *input; + u64 hv_status; + unsigned long flags; + int ret; + + /* no-op if partition isolation is not enabled */ + if (!hv_is_isolation_supported()) + return 0; + + if (count > HV_MAX_MODIFY_GPA_REP_COUNT) { + pr_err("Hyper-V: GPA count:%d exceeds supported:%lu\n", count, + HV_MAX_MODIFY_GPA_REP_COUNT); + return -EINVAL; + } + + if (visibility == VMBUS_PAGE_NOT_VISIBLE) + ret = hv_list_enc_remove(pfn, count); + else + ret = hv_list_enc_add(pfn, count); + if (ret) + return ret; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + + if (unlikely(!input)) { + local_irq_restore(flags); + return -EINVAL; + } + + input->partition_id = HV_PARTITION_ID_SELF; + input->host_visibility = visibility; + input->reserved0 = 0; + input->reserved1 = 0; + memcpy((void *)input->gpa_page_list, pfn, count * sizeof(*pfn)); + hv_status = hv_do_rep_hypercall( + HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, count, + 0, input, NULL); + local_irq_restore(flags); + + if (hv_result_success(hv_status)) + return 0; + + if (visibility == VMBUS_PAGE_NOT_VISIBLE) + ret = hv_list_enc_add(pfn, count); + else + ret = hv_list_enc_remove(pfn, count); + /* + * There's no good way to recover from -ENOMEM here, the accounting is + * wrong either way. + */ + WARN_ON_ONCE(ret); + + return -EFAULT; +} + +/* + * When transitioning memory between encrypted and decrypted, the caller + * of set_memory_encrypted() or set_memory_decrypted() is responsible for + * ensuring that the memory isn't in use and isn't referenced while the + * transition is in progress. The transition has multiple steps, and the + * memory is in an inconsistent state until all steps are complete. A + * reference while the state is inconsistent could result in an exception + * that can't be cleanly fixed up. + * + * But the Linux kernel load_unaligned_zeropad() mechanism could cause a + * stray reference that can't be prevented by the caller, so Linux has + * specific code to handle this case. But when the #VC and #VE exceptions + * routed to a paravisor, the specific code doesn't work. To avoid this + * problem, mark the pages as "not present" while the transition is in + * progress. 
If load_unaligned_zeropad() causes a stray reference, a normal + * page fault is generated instead of #VC or #VE, and the page-fault-based + * handlers for load_unaligned_zeropad() resolve the reference. When the + * transition is complete, hv_vtom_set_host_visibility() marks the pages + * as "present" again. + */ +static int hv_vtom_clear_present(unsigned long kbuffer, int pagecount, bool enc) +{ + return set_memory_np(kbuffer, pagecount); +} + +/* + * hv_vtom_set_host_visibility - Set specified memory visible to host. + * + * In Isolation VM, all guest memory is encrypted from host and guest + * needs to set memory visible to host via hvcall before sharing memory + * with host. This function works as wrap of hv_mark_gpa_visibility() + * with memory base and size. + */ +static int hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bool enc) +{ + enum hv_mem_host_visibility visibility = enc ? + VMBUS_PAGE_NOT_VISIBLE : VMBUS_PAGE_VISIBLE_READ_WRITE; + u64 *pfn_array; + phys_addr_t paddr; + int i, pfn, err; + void *vaddr; + int ret = 0; + + pfn_array = kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); + if (!pfn_array) { + ret = -ENOMEM; + goto err_set_memory_p; + } + + for (i = 0, pfn = 0; i < pagecount; i++) { + /* + * Use slow_virt_to_phys() because the PRESENT bit has been + * temporarily cleared in the PTEs. slow_virt_to_phys() works + * without the PRESENT bit while virt_to_hvpfn() or similar + * does not. + */ + vaddr = (void *)kbuffer + (i * HV_HYP_PAGE_SIZE); + paddr = slow_virt_to_phys(vaddr); + pfn_array[pfn] = paddr >> HV_HYP_PAGE_SHIFT; + pfn++; + + if (pfn == HV_MAX_MODIFY_GPA_REP_COUNT || i == pagecount - 1) { + ret = hv_mark_gpa_visibility(pfn, pfn_array, + visibility); + if (ret) + goto err_free_pfn_array; + pfn = 0; + } + } + +err_free_pfn_array: + kfree(pfn_array); + +err_set_memory_p: + /* + * Set the PTE PRESENT bits again to revert what hv_vtom_clear_present() + * did. Do this even if there is an error earlier in this function in + * order to avoid leaving the memory range in a "broken" state. Setting + * the PRESENT bits shouldn't fail, but return an error if it does. + */ + err = set_memory_p(kbuffer, pagecount); + if (err && !ret) + ret = err; + + return ret; +} + +static bool hv_vtom_tlb_flush_required(bool private) +{ + /* + * Since hv_vtom_clear_present() marks the PTEs as "not present" + * and flushes the TLB, they can't be in the TLB. That makes the + * flush controlled by this function redundant, so return "false". + */ + return false; +} + +static bool hv_vtom_cache_flush_required(void) +{ + return false; +} + +static bool hv_is_private_mmio(u64 addr) +{ + /* + * Hyper-V always provides a single IO-APIC in a guest VM. + * When a paravisor is used, it is emulated by the paravisor + * in the guest context and must be mapped private. + */ + if (addr >= HV_IOAPIC_BASE_ADDRESS && + addr < (HV_IOAPIC_BASE_ADDRESS + PAGE_SIZE)) + return true; + + /* Same with a vTPM */ + if (addr >= VTPM_BASE_ADDRESS && + addr < (VTPM_BASE_ADDRESS + PAGE_SIZE)) + return true; + + return false; +} + +void __init hv_vtom_init(void) +{ + enum hv_isolation_type type = hv_get_isolation_type(); + + switch (type) { + case HV_ISOLATION_TYPE_VBS: + fallthrough; + /* + * By design, a VM using vTOM doesn't see the SEV setting, + * so SEV initialization is bypassed and sev_status isn't set. + * Set it here to indicate a vTOM VM. + * + * Note: if CONFIG_AMD_MEM_ENCRYPT is not set, sev_status is + * defined as 0ULL, to which we can't assigned a value. 
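+	 * That is why the SNP case below (and the sev_status assignment) is
+	 * compiled only under CONFIG_AMD_MEM_ENCRYPT, with the VBS case
+	 * falling through into it when that option is built in.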
+ */ +#ifdef CONFIG_AMD_MEM_ENCRYPT + case HV_ISOLATION_TYPE_SNP: + sev_status = MSR_AMD64_SNP_VTOM; + cc_vendor = CC_VENDOR_AMD; + break; +#endif + + case HV_ISOLATION_TYPE_TDX: + cc_vendor = CC_VENDOR_INTEL; + break; + + default: + panic("hv_vtom_init: unsupported isolation type %d\n", type); + } + + cc_set_mask(ms_hyperv.shared_gpa_boundary); + physical_mask &= ms_hyperv.shared_gpa_boundary - 1; + + x86_platform.hyper.is_private_mmio = hv_is_private_mmio; + x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required; + x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required; + x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present; + x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility; + x86_platform.guest.enc_kexec_begin = hv_vtom_kexec_begin; + x86_platform.guest.enc_kexec_finish = hv_vtom_kexec_finish; + + /* Set WB as the default cache mode. */ + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); +} + +#endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */ + +enum hv_isolation_type hv_get_isolation_type(void) +{ + if (!(ms_hyperv.priv_high & HV_ISOLATION)) + return HV_ISOLATION_TYPE_NONE; + return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b); +} +EXPORT_SYMBOL_GPL(hv_get_isolation_type); + +/* + * hv_is_isolation_supported - Check system runs in the Hyper-V + * isolation VM. + */ +bool hv_is_isolation_supported(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + return false; + + if (!hypervisor_is_type(X86_HYPER_MS_HYPERV)) + return false; + + return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE; +} + +DEFINE_STATIC_KEY_FALSE(isolation_type_snp); + +/* + * hv_isolation_type_snp - Check if the system runs in an AMD SEV-SNP based + * isolation VM. + */ +bool hv_isolation_type_snp(void) +{ + return static_branch_unlikely(&isolation_type_snp); +} + +DEFINE_STATIC_KEY_FALSE(isolation_type_tdx); +/* + * hv_isolation_type_tdx - Check if the system runs in an Intel TDX based + * isolated VM. + */ +bool hv_isolation_type_tdx(void) +{ + return static_branch_unlikely(&isolation_type_tdx); +} diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index e65d7fe6489f..cfcb60468b01 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -1,6 +1,5 @@ #define pr_fmt(fmt) "Hyper-V: " fmt -#include <linux/hyperv.h> #include <linux/log2.h> #include <linux/slab.h> #include <linux/types.h> @@ -37,12 +36,14 @@ static inline int fill_gva_list(u64 gva_list[], int offset, * Lower 12 bits encode the number of additional * pages to flush (in addition to the 'cur' page). 
*/ - if (diff >= HV_TLB_FLUSH_UNIT) + if (diff >= HV_TLB_FLUSH_UNIT) { gva_list[gva_n] |= ~PAGE_MASK; - else if (diff) + cur += HV_TLB_FLUSH_UNIT; + } else if (diff) { gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT; + cur = end; + } - cur += HV_TLB_FLUSH_UNIT; gva_n++; } while (cur < end); @@ -50,29 +51,28 @@ static inline int fill_gva_list(u64 gva_list[], int offset, return gva_n - offset; } -static void hyperv_flush_tlb_others(const struct cpumask *cpus, - const struct flush_tlb_info *info) +static bool cpu_is_lazy(int cpu) +{ + return per_cpu(cpu_tlbstate_shared.is_lazy, cpu); +} + +static void hyperv_flush_tlb_multi(const struct cpumask *cpus, + const struct flush_tlb_info *info) { int cpu, vcpu, gva_n, max_gvas; - struct hv_tlb_flush **flush_pcpu; struct hv_tlb_flush *flush; - u64 status = U64_MAX; + u64 status; unsigned long flags; + bool do_lazy = !info->freed_tables; - trace_hyperv_mmu_flush_tlb_others(cpus, info); + trace_hyperv_mmu_flush_tlb_multi(cpus, info); if (!hv_hypercall_pg) goto do_native; - if (cpumask_empty(cpus)) - return; - local_irq_save(flags); - flush_pcpu = (struct hv_tlb_flush **) - this_cpu_ptr(hyperv_pcpu_input_arg); - - flush = *flush_pcpu; + flush = *this_cpu_ptr(hyperv_pcpu_input_arg); if (unlikely(!flush)) { local_irq_restore(flags); @@ -107,10 +107,14 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, * must. We will also check all VP numbers when walking the * supplied CPU set to remain correct in all cases. */ - if (hv_cpu_number_to_vp_number(cpumask_last(cpus)) >= 64) + cpu = cpumask_last(cpus); + + if (cpu < nr_cpumask_bits && hv_cpu_number_to_vp_number(cpu) >= 64) goto do_ex_hypercall; for_each_cpu(cpu, cpus) { + if (do_lazy && cpu_is_lazy(cpu)) + continue; vcpu = hv_cpu_number_to_vp_number(cpu); if (vcpu == VP_INVAL) { local_irq_restore(flags); @@ -123,6 +127,12 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus, __set_bit(vcpu, (unsigned long *) &flush->processor_mask); } + + /* nothing to flush if 'processor_mask' ends up being empty */ + if (!flush->processor_mask) { + local_irq_restore(flags); + return; + } } /* @@ -153,27 +163,23 @@ do_ex_hypercall: check_status: local_irq_restore(flags); - if (!(status & HV_HYPERCALL_RESULT_MASK)) + if (hv_result_success(status)) return; do_native: - native_flush_tlb_others(cpus, info); + native_flush_tlb_multi(cpus, info); } static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, const struct flush_tlb_info *info) { int nr_bank = 0, max_gvas, gva_n; - struct hv_tlb_flush_ex **flush_pcpu; struct hv_tlb_flush_ex *flush; u64 status; if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) - return U64_MAX; - - flush_pcpu = (struct hv_tlb_flush_ex **) - this_cpu_ptr(hyperv_pcpu_input_arg); + return HV_STATUS_INVALID_PARAMETER; - flush = *flush_pcpu; + flush = *this_cpu_ptr(hyperv_pcpu_input_arg); if (info->mm) { /* @@ -191,13 +197,18 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus, flush->hv_vp_set.valid_bank_mask = 0; flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K; - nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus); + nr_bank = cpumask_to_vpset_skip(&flush->hv_vp_set, cpus, + info->freed_tables ? NULL : cpu_is_lazy); if (nr_bank < 0) - return U64_MAX; + return HV_STATUS_INVALID_PARAMETER; /* * We can flush not more than max_gvas with one hypercall. Flush the * whole address space if we were asked to do more. + * + * For these hypercalls, Hyper-V treats the valid_bank_mask field + * of flush->hv_vp_set as part of the fixed size input header. 
+ * So the variable input header size is equal to nr_bank. */ max_gvas = (PAGE_SIZE - sizeof(*flush) - nr_bank * @@ -231,6 +242,5 @@ void hyperv_setup_mmu_ops(void) return; pr_info("Using hypercall for remote TLB flush\n"); - pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; } diff --git a/arch/x86/hyperv/mshv-asm-offsets.c b/arch/x86/hyperv/mshv-asm-offsets.c new file mode 100644 index 000000000000..882c1db6df16 --- /dev/null +++ b/arch/x86/hyperv/mshv-asm-offsets.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + * + * Copyright (c) 2025, Microsoft Corporation. + * + * Author: + * Naman Jain <namjain@microsoft.com> + */ +#define COMPILE_OFFSETS + +#include <linux/kbuild.h> +#include <asm/mshyperv.h> + +static void __used common(void) +{ + if (IS_ENABLED(CONFIG_HYPERV_VTL_MODE)) { + OFFSET(MSHV_VTL_CPU_CONTEXT_rax, mshv_vtl_cpu_context, rax); + OFFSET(MSHV_VTL_CPU_CONTEXT_rcx, mshv_vtl_cpu_context, rcx); + OFFSET(MSHV_VTL_CPU_CONTEXT_rdx, mshv_vtl_cpu_context, rdx); + OFFSET(MSHV_VTL_CPU_CONTEXT_rbx, mshv_vtl_cpu_context, rbx); + OFFSET(MSHV_VTL_CPU_CONTEXT_rbp, mshv_vtl_cpu_context, rbp); + OFFSET(MSHV_VTL_CPU_CONTEXT_rsi, mshv_vtl_cpu_context, rsi); + OFFSET(MSHV_VTL_CPU_CONTEXT_rdi, mshv_vtl_cpu_context, rdi); + OFFSET(MSHV_VTL_CPU_CONTEXT_r8, mshv_vtl_cpu_context, r8); + OFFSET(MSHV_VTL_CPU_CONTEXT_r9, mshv_vtl_cpu_context, r9); + OFFSET(MSHV_VTL_CPU_CONTEXT_r10, mshv_vtl_cpu_context, r10); + OFFSET(MSHV_VTL_CPU_CONTEXT_r11, mshv_vtl_cpu_context, r11); + OFFSET(MSHV_VTL_CPU_CONTEXT_r12, mshv_vtl_cpu_context, r12); + OFFSET(MSHV_VTL_CPU_CONTEXT_r13, mshv_vtl_cpu_context, r13); + OFFSET(MSHV_VTL_CPU_CONTEXT_r14, mshv_vtl_cpu_context, r14); + OFFSET(MSHV_VTL_CPU_CONTEXT_r15, mshv_vtl_cpu_context, r15); + OFFSET(MSHV_VTL_CPU_CONTEXT_cr2, mshv_vtl_cpu_context, cr2); + } +} diff --git a/arch/x86/hyperv/mshv_vtl_asm.S b/arch/x86/hyperv/mshv_vtl_asm.S new file mode 100644 index 000000000000..f595eefad9ab --- /dev/null +++ b/arch/x86/hyperv/mshv_vtl_asm.S @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Assembly level code for mshv_vtl VTL transition + * + * Copyright (c) 2025, Microsoft Corporation. + * + * Author: + * Naman Jain <namjain@microsoft.com> + */ + +#include <linux/linkage.h> +#include <linux/static_call_types.h> +#include <asm/asm.h> +#include <asm/asm-offsets.h> +#include <asm/frame.h> +#include "mshv-asm-offsets.h" + + .text + .section .noinstr.text, "ax" +/* + * void __mshv_vtl_return_call(struct mshv_vtl_cpu_context *vtl0) + * + * This function is used to context switch between different Virtual Trust Levels. + * It is marked as 'noinstr' to prevent against instrumentation and debugging facilities. + * NMIs aren't a problem because the NMI handler saves/restores CR2 specifically to guard + * against #PFs in NMI context clobbering the guest state. 
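+ *
+ * On entry %rdi holds the mshv_vtl_cpu_context to run. rax and rcx are
+ * special-cased: they are handed to VTL0 through the VP assist page (see
+ * mshv_vtl_return_call() in hv_vtl.c), while the remaining GPRs and CR2
+ * are swapped in and out below via the MSHV_VTL_CPU_CONTEXT_* offsets.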
+ */
+SYM_FUNC_START(__mshv_vtl_return_call)
+	/* Push callee-save registers */
+	pushq	%rbp
+	mov	%rsp, %rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	pushq	%rbx
+
+	/* register switch to VTL0 clobbers all registers except rax/rcx */
+	mov	%_ASM_ARG1, %rax
+
+	/* grab rbx/rbp/rsi/rdi/r8-r15 */
+	mov	MSHV_VTL_CPU_CONTEXT_rbx(%rax), %rbx
+	mov	MSHV_VTL_CPU_CONTEXT_rbp(%rax), %rbp
+	mov	MSHV_VTL_CPU_CONTEXT_rsi(%rax), %rsi
+	mov	MSHV_VTL_CPU_CONTEXT_rdi(%rax), %rdi
+	mov	MSHV_VTL_CPU_CONTEXT_r8(%rax), %r8
+	mov	MSHV_VTL_CPU_CONTEXT_r9(%rax), %r9
+	mov	MSHV_VTL_CPU_CONTEXT_r10(%rax), %r10
+	mov	MSHV_VTL_CPU_CONTEXT_r11(%rax), %r11
+	mov	MSHV_VTL_CPU_CONTEXT_r12(%rax), %r12
+	mov	MSHV_VTL_CPU_CONTEXT_r13(%rax), %r13
+	mov	MSHV_VTL_CPU_CONTEXT_r14(%rax), %r14
+	mov	MSHV_VTL_CPU_CONTEXT_r15(%rax), %r15
+
+	mov	MSHV_VTL_CPU_CONTEXT_cr2(%rax), %rdx
+	mov	%rdx, %cr2
+	mov	MSHV_VTL_CPU_CONTEXT_rdx(%rax), %rdx
+
+	/* stash host registers on stack */
+	pushq	%rax
+	pushq	%rcx
+
+	xor	%ecx, %ecx
+
+	/* make a hypercall to switch VTL */
+	call	STATIC_CALL_TRAMP_STR(__mshv_vtl_return_hypercall)
+
+	/* stash guest registers on stack, restore saved host copies */
+	pushq	%rax
+	pushq	%rcx
+	mov	16(%rsp), %rcx
+	mov	24(%rsp), %rax
+
+	mov	%rdx, MSHV_VTL_CPU_CONTEXT_rdx(%rax)
+	mov	%cr2, %rdx
+	mov	%rdx, MSHV_VTL_CPU_CONTEXT_cr2(%rax)
+	pop	MSHV_VTL_CPU_CONTEXT_rcx(%rax)
+	pop	MSHV_VTL_CPU_CONTEXT_rax(%rax)
+	add	$16, %rsp
+
+	/* save rbx/rbp/rsi/rdi/r8-r15 */
+	mov	%rbx, MSHV_VTL_CPU_CONTEXT_rbx(%rax)
+	mov	%rbp, MSHV_VTL_CPU_CONTEXT_rbp(%rax)
+	mov	%rsi, MSHV_VTL_CPU_CONTEXT_rsi(%rax)
+	mov	%rdi, MSHV_VTL_CPU_CONTEXT_rdi(%rax)
+	mov	%r8, MSHV_VTL_CPU_CONTEXT_r8(%rax)
+	mov	%r9, MSHV_VTL_CPU_CONTEXT_r9(%rax)
+	mov	%r10, MSHV_VTL_CPU_CONTEXT_r10(%rax)
+	mov	%r11, MSHV_VTL_CPU_CONTEXT_r11(%rax)
+	mov	%r12, MSHV_VTL_CPU_CONTEXT_r12(%rax)
+	mov	%r13, MSHV_VTL_CPU_CONTEXT_r13(%rax)
+	mov	%r14, MSHV_VTL_CPU_CONTEXT_r14(%rax)
+	mov	%r15, MSHV_VTL_CPU_CONTEXT_r15(%rax)
+
+	/* pop callee-save registers r12-r15, rbx */
+	pop	%rbx
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+
+	pop	%rbp
+	RET
+SYM_FUNC_END(__mshv_vtl_return_call)
+/*
+ * Make sure the static_call key symbol __SCK____mshv_vtl_return_hypercall is
+ * accessible here. The code below is modeled on the __ADDRESSABLE(sym) macro.
+ * The symbol name is kept simple to avoid the generated name the macro would
+ * otherwise produce, e.g.
+ * "__UNIQUE_ID_addressable___SCK____mshv_vtl_return_hypercall_662.0".
+ */
+	.section .discard.addressable,"aw"
+	.align 8
+	.type mshv_vtl_return_sym, @object
+	.size mshv_vtl_return_sym, 8
+mshv_vtl_return_sym:
+	.quad __SCK____mshv_vtl_return_hypercall
diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
index dd0a843f766d..8ccbb7c4fc27 100644
--- a/arch/x86/hyperv/nested.c
+++ b/arch/x86/hyperv/nested.c
@@ -11,7 +11,8 @@
 
 #include <linux/types.h>
-#include <asm/hyperv-tlfs.h>
+#include <linux/export.h>
+#include <hyperv/hvhdk.h>
 #include <asm/mshyperv.h>
 #include <asm/tlbflush.h>
@@ -19,7 +20,6 @@
 
 int hyperv_flush_guest_mapping(u64 as)
 {
-	struct hv_guest_mapping_flush **flush_pcpu;
 	struct hv_guest_mapping_flush *flush;
 	u64 status;
 	unsigned long flags;
@@ -30,10 +30,7 @@ int hyperv_flush_guest_mapping(u64 as)
 
 	local_irq_save(flags);
 
-	flush_pcpu = (struct hv_guest_mapping_flush **)
-		this_cpu_ptr(hyperv_pcpu_input_arg);
-
-	flush = *flush_pcpu;
+	flush = *this_cpu_ptr(hyperv_pcpu_input_arg);
 
 	if (unlikely(!flush)) {
 		local_irq_restore(flags);
@@ -47,7 +44,7 @@ int hyperv_flush_guest_mapping(u64 as)
 				 flush, NULL);
 	local_irq_restore(flags);
 
-	if (!(status & HV_HYPERCALL_RESULT_MASK))
+	if (hv_result_success(status))
 		ret = 0;
 
fault:
@@ -90,9 +87,8 @@ EXPORT_SYMBOL_GPL(hyperv_fill_flush_guest_mapping_list);
 int hyperv_flush_guest_mapping_range(u64 as,
 		hyperv_fill_flush_list_func fill_flush_list_func, void *data)
 {
-	struct hv_guest_mapping_flush_list **flush_pcpu;
 	struct hv_guest_mapping_flush_list *flush;
-	u64 status = 0;
+	u64 status;
 	unsigned long flags;
 	int ret = -ENOTSUPP;
 	int gpa_n = 0;
@@ -102,10 +98,8 @@ int hyperv_flush_guest_mapping_range(u64 as,
 
 	local_irq_save(flags);
 
-	flush_pcpu = (struct hv_guest_mapping_flush_list **)
-		this_cpu_ptr(hyperv_pcpu_input_arg);
+	flush = *this_cpu_ptr(hyperv_pcpu_input_arg);
 
-	flush = *flush_pcpu;
 	if (unlikely(!flush)) {
 		local_irq_restore(flags);
 		goto fault;
@@ -125,10 +119,10 @@ int hyperv_flush_guest_mapping_range(u64 as,
 
 	local_irq_restore(flags);
 
-	if (!(status & HV_HYPERCALL_RESULT_MASK))
+	if (hv_result_success(status))
 		ret = 0;
 	else
-		ret = status;
+		ret = hv_result(status);
 
 fault:
 	trace_hyperv_nested_flush_guest_mapping_range(as, ret);
 	return ret;
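Note on the status handling in nested.c above: the old '!(status & HV_HYPERCALL_RESULT_MASK)' test and the new hv_result_success() call check the same condition, since the low 16 bits of the 64-bit hypercall status word carry the result code and HV_STATUS_SUCCESS is zero. A minimal standalone sketch of the helpers the patch switches to follows; the mask value mirrors HV_HYPERCALL_RESULT_MASK (GENMASK_ULL(15, 0)) from the Hyper-V headers, while the userspace types and framing are illustrative only, not the kernel definitions.

#include <stdbool.h>
#include <stdint.h>

#define HV_HYPERCALL_RESULT_MASK	0xffffull	/* low 16 bits: result code */
#define HV_STATUS_SUCCESS		0

/* Extract the 16-bit result code from the 64-bit hypercall status word. */
static inline unsigned int hv_result(uint64_t status)
{
	return status & HV_HYPERCALL_RESULT_MASK;
}

static inline bool hv_result_success(uint64_t status)
{
	return hv_result(status) == HV_STATUS_SUCCESS;
}

Beyond readability, the change tightens hyperv_flush_guest_mapping_range(): 'ret = hv_result(status)' returns only the result code, where the old 'ret = status' truncated a raw 64-bit word whose upper bits hold other fields such as the reps-completed count.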

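For reference, the reworked fill_gva_list() hunk at the top of this section encodes each 64-bit entry of the flush list as a page-aligned base address whose low 12 bits hold the number of additional pages to flush, so one entry covers at most HV_TLB_FLUSH_UNIT bytes; advancing 'cur' only in the full-unit branch, and jumping straight to 'end' otherwise, also avoids the unsigned overflow of 'cur' (and the resulting infinite loop) when a flush range reaches the top of the address space. Below is a self-contained userspace sketch of that encoding; PAGE_SHIFT and the 4096-pages-per-entry unit mirror the x86 definitions visible in the diff, while main() and the example addresses are illustrative assumptions.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
/* One entry names at most 4096 pages: the low 12 bits hold "extra pages". */
#define HV_TLB_FLUSH_UNIT	(4096 * PAGE_SIZE)

static int fill_gva_list(uint64_t gva_list[], int offset,
			 unsigned long start, unsigned long end)
{
	int gva_n = offset;
	unsigned long cur = start, diff;

	do {
		diff = end > cur ? end - cur : 0;

		gva_list[gva_n] = cur & PAGE_MASK;
		if (diff >= HV_TLB_FLUSH_UNIT) {
			/* Full unit: 0xfff extra pages, 4096 pages total. */
			gva_list[gva_n] |= ~PAGE_MASK;
			cur += HV_TLB_FLUSH_UNIT;
		} else if (diff) {
			/* Partial unit: encode the remaining page count. */
			gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT;
			cur = end;
		}
		gva_n++;
	} while (cur < end);

	return gva_n - offset;
}

int main(void)
{
	uint64_t list[8];
	/* Flush 3 pages at 0x7f0000001000: one entry, low bits = 2 extras. */
	int n = fill_gva_list(list, 0, 0x7f0000001000UL, 0x7f0000004000UL);

	/* Prints: n=1 first entry=0x7f0000001002 */
	printf("n=%d first entry=%#llx\n", n, (unsigned long long)list[0]);
	return 0;
}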