summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-09-04 11:26:29 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-09-04 11:26:29 -0700
commit0b90c5637dfea8a08f87db5dd16000eb679013a3 (patch)
tree1e64c72282c411c563758a26d34cfe023a56668a
parente4f1b8202fb59c56a3de7642d50326923670513f (diff)
parent284930a0146ade1ce0250a1d3bae7a675af4bb3b (diff)
Merge tag 'hyperv-next-signed-20230902' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux
Pull hyperv updates from Wei Liu: - Support for SEV-SNP guests on Hyper-V (Tianyu Lan) - Support for TDX guests on Hyper-V (Dexuan Cui) - Use SBRM API in Hyper-V balloon driver (Mitchell Levy) - Avoid dereferencing ACPI root object handle in VMBus driver (Maciej Szmigiero) - A few misecllaneous fixes (Jiapeng Chong, Nathan Chancellor, Saurabh Sengar) * tag 'hyperv-next-signed-20230902' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (24 commits) x86/hyperv: Remove duplicate include x86/hyperv: Move the code in ivm.c around to avoid unnecessary ifdef's x86/hyperv: Remove hv_isolation_type_en_snp x86/hyperv: Use TDX GHCI to access some MSRs in a TDX VM with the paravisor Drivers: hv: vmbus: Bring the post_msg_page back for TDX VMs with the paravisor x86/hyperv: Introduce a global variable hyperv_paravisor_present Drivers: hv: vmbus: Support >64 VPs for a fully enlightened TDX/SNP VM x86/hyperv: Fix serial console interrupts for fully enlightened TDX guests Drivers: hv: vmbus: Support fully enlightened TDX guests x86/hyperv: Support hypercalls for fully enlightened TDX guests x86/hyperv: Add hv_isolation_type_tdx() to detect TDX guests x86/hyperv: Fix undefined reference to isolation_type_en_snp without CONFIG_HYPERV x86/hyperv: Add missing 'inline' to hv_snp_boot_ap() stub hv: hyperv.h: Replace one-element array with flexible-array member Drivers: hv: vmbus: Don't dereference ACPI root object handle x86/hyperv: Add hyperv-specific handling for VMMCALL under SEV-ES x86/hyperv: Add smp support for SEV-SNP guest clocksource: hyper-v: Mark hyperv tsc page unencrypted in sev-snp enlightened guest x86/hyperv: Use vmmcall to implement Hyper-V hypercall in sev-snp enlightened guest drivers: hv: Mark percpu hvcall input arg page unencrypted in SEV-SNP enlightened guest ...
-rw-r--r--arch/x86/hyperv/hv_apic.c15
-rw-r--r--arch/x86/hyperv/hv_init.c105
-rw-r--r--arch/x86/hyperv/ivm.c263
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h10
-rw-r--r--arch/x86/include/asm/mshyperv.h71
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c91
-rw-r--r--drivers/clocksource/hyperv_timer.c2
-rw-r--r--drivers/hv/connection.c16
-rw-r--r--drivers/hv/hv.c131
-rw-r--r--drivers/hv/hv_balloon.c82
-rw-r--r--drivers/hv/hv_common.c48
-rw-r--r--drivers/hv/hyperv_vmbus.h11
-rw-r--r--drivers/hv/vmbus_drv.c3
-rw-r--r--include/asm-generic/hyperv-tlfs.h1
-rw-r--r--include/asm-generic/mshyperv.h17
-rw-r--r--include/linux/hyperv.h6
16 files changed, 759 insertions, 113 deletions
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 187e13b15e9a..97bfe5f0531f 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -175,8 +175,11 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector,
(exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask)))
return true;
- if (!hv_hypercall_pg)
- return false;
+ /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */
+ if (!hv_hypercall_pg) {
+ if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx())
+ return false;
+ }
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return false;
@@ -229,9 +232,15 @@ static bool __send_ipi_one(int cpu, int vector)
trace_hyperv_send_ipi_one(cpu, vector);
- if (!hv_hypercall_pg || (vp == VP_INVAL))
+ if (vp == VP_INVAL)
return false;
+ /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */
+ if (!hv_hypercall_pg) {
+ if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx())
+ return false;
+ }
+
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return false;
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 507d98331e7c..783ed339f341 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -19,6 +19,7 @@
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/idtentry.h>
+#include <asm/set_memory.h>
#include <linux/kexec.h>
#include <linux/version.h>
#include <linux/vmalloc.h>
@@ -52,7 +53,7 @@ static int hyperv_init_ghcb(void)
void *ghcb_va;
void **ghcb_base;
- if (!hv_isolation_type_snp())
+ if (!ms_hyperv.paravisor_present || !hv_isolation_type_snp())
return 0;
if (!hv_ghcb_pg)
@@ -80,7 +81,7 @@ static int hyperv_init_ghcb(void)
static int hv_cpu_init(unsigned int cpu)
{
union hv_vp_assist_msr_contents msr = { 0 };
- struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu];
+ struct hv_vp_assist_page **hvp;
int ret;
ret = hv_common_cpu_init(cpu);
@@ -90,6 +91,7 @@ static int hv_cpu_init(unsigned int cpu)
if (!hv_vp_assist_page)
return 0;
+ hvp = &hv_vp_assist_page[cpu];
if (hv_root_partition) {
/*
* For root partition we get the hypervisor provided VP assist
@@ -107,8 +109,21 @@ static int hv_cpu_init(unsigned int cpu)
* in hv_cpu_die(), otherwise a CPU may not be stopped in the
* case of CPU offlining and the VM will hang.
*/
- if (!*hvp)
+ if (!*hvp) {
*hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
+
+ /*
+ * Hyper-V should never specify a VM that is a Confidential
+ * VM and also running in the root partition. Root partition
+ * is blocked to run in Confidential VM. So only decrypt assist
+ * page in non-root partition here.
+ */
+ if (*hvp && !ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
+ WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1));
+ memset(*hvp, 0, PAGE_SIZE);
+ }
+ }
+
if (*hvp)
msr.pfn = vmalloc_to_pfn(*hvp);
@@ -379,6 +394,36 @@ static void __init hv_get_partition_id(void)
local_irq_restore(flags);
}
+static u8 __init get_vtl(void)
+{
+ u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS;
+ struct hv_get_vp_registers_input *input;
+ struct hv_get_vp_registers_output *output;
+ unsigned long flags;
+ u64 ret;
+
+ local_irq_save(flags);
+ input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ output = (struct hv_get_vp_registers_output *)input;
+
+ memset(input, 0, struct_size(input, element, 1));
+ input->header.partitionid = HV_PARTITION_ID_SELF;
+ input->header.vpindex = HV_VP_INDEX_SELF;
+ input->header.inputvtl = 0;
+ input->element[0].name0 = HV_X64_REGISTER_VSM_VP_STATUS;
+
+ ret = hv_do_hypercall(control, input, output);
+ if (hv_result_success(ret)) {
+ ret = output->as64.low & HV_X64_VTL_MASK;
+ } else {
+ pr_err("Failed to get VTL(%lld) and set VTL to zero by default.\n", ret);
+ ret = 0;
+ }
+
+ local_irq_restore(flags);
+ return ret;
+}
+
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -399,14 +444,24 @@ void __init hyperv_init(void)
if (hv_common_init())
return;
- hv_vp_assist_page = kcalloc(num_possible_cpus(),
- sizeof(*hv_vp_assist_page), GFP_KERNEL);
+ /*
+ * The VP assist page is useless to a TDX guest: the only use we
+ * would have for it is lazy EOI, which can not be used with TDX.
+ */
+ if (hv_isolation_type_tdx())
+ hv_vp_assist_page = NULL;
+ else
+ hv_vp_assist_page = kcalloc(num_possible_cpus(),
+ sizeof(*hv_vp_assist_page),
+ GFP_KERNEL);
if (!hv_vp_assist_page) {
ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
- goto common_free;
+
+ if (!hv_isolation_type_tdx())
+ goto common_free;
}
- if (hv_isolation_type_snp()) {
+ if (ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
/* Negotiate GHCB Version. */
if (!hv_ghcb_negotiate_protocol())
hv_ghcb_terminate(SEV_TERM_SET_GEN,
@@ -426,12 +481,32 @@ void __init hyperv_init(void)
* Setup the hypercall page and enable hypercalls.
* 1. Register the guest ID
* 2. Enable the hypercall and register the hypercall page
+ *
+ * A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg:
+ * when the hypercall input is a page, such a VM must pass a decrypted
+ * page to Hyper-V, e.g. hv_post_message() uses the per-CPU page
+ * hyperv_pcpu_input_arg, which is decrypted if no paravisor is present.
+ *
+ * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls,
+ * which are handled by the paravisor and the VM must use an encrypted
+ * input page: in such a VM, the hyperv_pcpu_input_arg is encrypted and
+ * used in the hypercalls, e.g. see hv_mark_gpa_visibility() and
+ * hv_arch_irq_unmask(). Such a VM uses TDX GHCI for two hypercalls:
+ * 1. HVCALL_SIGNAL_EVENT: see vmbus_set_event() and _hv_do_fast_hypercall8().
+ * 2. HVCALL_POST_MESSAGE: the input page must be a decrypted page, i.e.
+ * hv_post_message() in such a VM can't use the encrypted hyperv_pcpu_input_arg;
+ * instead, hv_post_message() uses the post_msg_page, which is decrypted
+ * in such a VM and is only used in such a VM.
*/
guest_id = hv_generate_guest_id(LINUX_VERSION_CODE);
wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
- /* Hyper-V requires to write guest os id via ghcb in SNP IVM. */
- hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id);
+ /* With the paravisor, the VM must also write the ID via GHCB/GHCI */
+ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
+ /* A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg */
+ if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present)
+ goto skip_hypercall_pg_init;
hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START,
VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX,
@@ -472,6 +547,7 @@ void __init hyperv_init(void)
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
}
+skip_hypercall_pg_init:
/*
* Some versions of Hyper-V that provide IBT in guest VMs have a bug
* in that there's no ENDBR64 instruction at the entry to the
@@ -527,11 +603,15 @@ void __init hyperv_init(void)
/* Query the VMs extended capability once, so that it can be cached. */
hv_query_ext_cap(0);
+ /* Find the VTL */
+ if (!ms_hyperv.paravisor_present && hv_isolation_type_snp())
+ ms_hyperv.vtl = get_vtl();
+
return;
clean_guest_os_id:
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
- hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
+ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
cpuhp_remove_state(cpuhp);
free_ghcb_page:
free_percpu(hv_ghcb_pg);
@@ -552,7 +632,7 @@ void hyperv_cleanup(void)
/* Reset our OS id */
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
- hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
+ hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0);
/*
* Reset hypercall page reference before reset the page,
@@ -615,6 +695,9 @@ bool hv_is_hyperv_initialized(void)
if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return false;
+ /* A TDX VM with no paravisor uses TDX GHCI call rather than hv_hypercall_pg */
+ if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present)
+ return true;
/*
* Verify that earlier initialization succeeded by checking
* that the hypercall page is setup
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 28be6df88063..8c6bf07f7d2b 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -18,6 +18,11 @@
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/mtrr.h>
+#include <asm/io_apic.h>
+#include <asm/realmode.h>
+#include <asm/e820/api.h>
+#include <asm/desc.h>
+#include <uapi/asm/vmx.h>
#ifdef CONFIG_AMD_MEM_ENCRYPT
@@ -56,8 +61,10 @@ union hv_ghcb {
} hypercall;
} __packed __aligned(HV_HYP_PAGE_SIZE);
+/* Only used in an SNP VM with the paravisor */
static u16 hv_ghcb_version __ro_after_init;
+/* Functions only used in an SNP VM with the paravisor go here. */
u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size)
{
union hv_ghcb *hv_ghcb;
@@ -175,7 +182,7 @@ bool hv_ghcb_negotiate_protocol(void)
return true;
}
-void hv_ghcb_msr_write(u64 msr, u64 value)
+static void hv_ghcb_msr_write(u64 msr, u64 value)
{
union hv_ghcb *hv_ghcb;
void **ghcb_base;
@@ -203,9 +210,8 @@ void hv_ghcb_msr_write(u64 msr, u64 value)
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(hv_ghcb_msr_write);
-void hv_ghcb_msr_read(u64 msr, u64 *value)
+static void hv_ghcb_msr_read(u64 msr, u64 *value)
{
union hv_ghcb *hv_ghcb;
void **ghcb_base;
@@ -235,7 +241,217 @@ void hv_ghcb_msr_read(u64 msr, u64 *value)
| ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32);
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(hv_ghcb_msr_read);
+
+/* Only used in a fully enlightened SNP VM, i.e. without the paravisor */
+static u8 ap_start_input_arg[PAGE_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static u8 ap_start_stack[PAGE_SIZE] __aligned(PAGE_SIZE);
+static DEFINE_PER_CPU(struct sev_es_save_area *, hv_sev_vmsa);
+
+/* Functions only used in an SNP VM without the paravisor go here. */
+
+#define hv_populate_vmcb_seg(seg, gdtr_base) \
+do { \
+ if (seg.selector) { \
+ seg.base = 0; \
+ seg.limit = HV_AP_SEGMENT_LIMIT; \
+ seg.attrib = *(u16 *)(gdtr_base + seg.selector + 5); \
+ seg.attrib = (seg.attrib & 0xFF) | ((seg.attrib >> 4) & 0xF00); \
+ } \
+} while (0) \
+
+static int snp_set_vmsa(void *va, bool vmsa)
+{
+ u64 attrs;
+
+ /*
+ * Running at VMPL0 allows the kernel to change the VMSA bit for a page
+ * using the RMPADJUST instruction. However, for the instruction to
+ * succeed it must target the permissions of a lesser privileged
+ * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST
+ * instruction in the AMD64 APM Volume 3).
+ */
+ attrs = 1;
+ if (vmsa)
+ attrs |= RMPADJUST_VMSA_PAGE_BIT;
+
+ return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs);
+}
+
+static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa)
+{
+ int err;
+
+ err = snp_set_vmsa(vmsa, false);
+ if (err)
+ pr_err("clear VMSA page failed (%u), leaking page\n", err);
+ else
+ free_page((unsigned long)vmsa);
+}
+
+int hv_snp_boot_ap(int cpu, unsigned long start_ip)
+{
+ struct sev_es_save_area *vmsa = (struct sev_es_save_area *)
+ __get_free_page(GFP_KERNEL | __GFP_ZERO);
+ struct sev_es_save_area *cur_vmsa;
+ struct desc_ptr gdtr;
+ u64 ret, retry = 5;
+ struct hv_enable_vp_vtl *start_vp_input;
+ unsigned long flags;
+
+ if (!vmsa)
+ return -ENOMEM;
+
+ native_store_gdt(&gdtr);
+
+ vmsa->gdtr.base = gdtr.address;
+ vmsa->gdtr.limit = gdtr.size;
+
+ asm volatile("movl %%es, %%eax;" : "=a" (vmsa->es.selector));
+ hv_populate_vmcb_seg(vmsa->es, vmsa->gdtr.base);
+
+ asm volatile("movl %%cs, %%eax;" : "=a" (vmsa->cs.selector));
+ hv_populate_vmcb_seg(vmsa->cs, vmsa->gdtr.base);
+
+ asm volatile("movl %%ss, %%eax;" : "=a" (vmsa->ss.selector));
+ hv_populate_vmcb_seg(vmsa->ss, vmsa->gdtr.base);
+
+ asm volatile("movl %%ds, %%eax;" : "=a" (vmsa->ds.selector));
+ hv_populate_vmcb_seg(vmsa->ds, vmsa->gdtr.base);
+
+ vmsa->efer = native_read_msr(MSR_EFER);
+
+ asm volatile("movq %%cr4, %%rax;" : "=a" (vmsa->cr4));
+ asm volatile("movq %%cr3, %%rax;" : "=a" (vmsa->cr3));
+ asm volatile("movq %%cr0, %%rax;" : "=a" (vmsa->cr0));
+
+ vmsa->xcr0 = 1;
+ vmsa->g_pat = HV_AP_INIT_GPAT_DEFAULT;
+ vmsa->rip = (u64)secondary_startup_64_no_verify;
+ vmsa->rsp = (u64)&ap_start_stack[PAGE_SIZE];
+
+ /*
+ * Set the SNP-specific fields for this VMSA:
+ * VMPL level
+ * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits)
+ */
+ vmsa->vmpl = 0;
+ vmsa->sev_features = sev_status >> 2;
+
+ ret = snp_set_vmsa(vmsa, true);
+ if (!ret) {
+ pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret);
+ free_page((u64)vmsa);
+ return ret;
+ }
+
+ local_irq_save(flags);
+ start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg;
+ memset(start_vp_input, 0, sizeof(*start_vp_input));
+ start_vp_input->partition_id = -1;
+ start_vp_input->vp_index = cpu;
+ start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl;
+ *(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1;
+
+ do {
+ ret = hv_do_hypercall(HVCALL_START_VP,
+ start_vp_input, NULL);
+ } while (hv_result(ret) == HV_STATUS_TIME_OUT && retry--);
+
+ local_irq_restore(flags);
+
+ if (!hv_result_success(ret)) {
+ pr_err("HvCallStartVirtualProcessor failed: %llx\n", ret);
+ snp_cleanup_vmsa(vmsa);
+ vmsa = NULL;
+ }
+
+ cur_vmsa = per_cpu(hv_sev_vmsa, cpu);
+ /* Free up any previous VMSA page */
+ if (cur_vmsa)
+ snp_cleanup_vmsa(cur_vmsa);
+
+ /* Record the current VMSA page */
+ per_cpu(hv_sev_vmsa, cpu) = vmsa;
+
+ return ret;
+}
+
+#else
+static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
+static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
+#ifdef CONFIG_INTEL_TDX_GUEST
+static void hv_tdx_msr_write(u64 msr, u64 val)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = EXIT_REASON_MSR_WRITE,
+ .r12 = msr,
+ .r13 = val,
+ };
+
+ u64 ret = __tdx_hypercall(&args);
+
+ WARN_ONCE(ret, "Failed to emulate MSR write: %lld\n", ret);
+}
+
+static void hv_tdx_msr_read(u64 msr, u64 *val)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = EXIT_REASON_MSR_READ,
+ .r12 = msr,
+ };
+
+ u64 ret = __tdx_hypercall_ret(&args);
+
+ if (WARN_ONCE(ret, "Failed to emulate MSR read: %lld\n", ret))
+ *val = 0;
+ else
+ *val = args.r11;
+}
+
+u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
+{
+ struct tdx_hypercall_args args = { };
+
+ args.r10 = control;
+ args.rdx = param1;
+ args.r8 = param2;
+
+ (void)__tdx_hypercall_ret(&args);
+
+ return args.r11;
+}
+
+#else
+static inline void hv_tdx_msr_write(u64 msr, u64 value) {}
+static inline void hv_tdx_msr_read(u64 msr, u64 *value) {}
+#endif /* CONFIG_INTEL_TDX_GUEST */
+
+#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
+void hv_ivm_msr_write(u64 msr, u64 value)
+{
+ if (!ms_hyperv.paravisor_present)
+ return;
+
+ if (hv_isolation_type_tdx())
+ hv_tdx_msr_write(msr, value);
+ else if (hv_isolation_type_snp())
+ hv_ghcb_msr_write(msr, value);
+}
+
+void hv_ivm_msr_read(u64 msr, u64 *value)
+{
+ if (!ms_hyperv.paravisor_present)
+ return;
+
+ if (hv_isolation_type_tdx())
+ hv_tdx_msr_read(msr, value);
+ else if (hv_isolation_type_snp())
+ hv_ghcb_msr_read(msr, value);
+}
/*
* hv_mark_gpa_visibility - Set pages visible to host via hvcall.
@@ -358,13 +574,34 @@ static bool hv_is_private_mmio(u64 addr)
void __init hv_vtom_init(void)
{
+ enum hv_isolation_type type = hv_get_isolation_type();
+
+ switch (type) {
+ case HV_ISOLATION_TYPE_VBS:
+ fallthrough;
/*
* By design, a VM using vTOM doesn't see the SEV setting,
* so SEV initialization is bypassed and sev_status isn't set.
* Set it here to indicate a vTOM VM.
+ *
+ * Note: if CONFIG_AMD_MEM_ENCRYPT is not set, sev_status is
+ * defined as 0ULL, to which we can't assigned a value.
*/
- sev_status = MSR_AMD64_SNP_VTOM;
- cc_vendor = CC_VENDOR_AMD;
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ case HV_ISOLATION_TYPE_SNP:
+ sev_status = MSR_AMD64_SNP_VTOM;
+ cc_vendor = CC_VENDOR_AMD;
+ break;
+#endif
+
+ case HV_ISOLATION_TYPE_TDX:
+ cc_vendor = CC_VENDOR_INTEL;
+ break;
+
+ default:
+ panic("hv_vtom_init: unsupported isolation type %d\n", type);
+ }
+
cc_set_mask(ms_hyperv.shared_gpa_boundary);
physical_mask &= ms_hyperv.shared_gpa_boundary - 1;
@@ -377,7 +614,7 @@ void __init hv_vtom_init(void)
mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK);
}
-#endif /* CONFIG_AMD_MEM_ENCRYPT */
+#endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */
enum hv_isolation_type hv_get_isolation_type(void)
{
@@ -405,10 +642,20 @@ bool hv_is_isolation_supported(void)
DEFINE_STATIC_KEY_FALSE(isolation_type_snp);
/*
- * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based
+ * hv_isolation_type_snp - Check if the system runs in an AMD SEV-SNP based
* isolation VM.
*/
bool hv_isolation_type_snp(void)
{
return static_branch_unlikely(&isolation_type_snp);
}
+
+DEFINE_STATIC_KEY_FALSE(isolation_type_tdx);
+/*
+ * hv_isolation_type_tdx - Check if the system runs in an Intel TDX based
+ * isolated VM.
+ */
+bool hv_isolation_type_tdx(void)
+{
+ return static_branch_unlikely(&isolation_type_tdx);
+}
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index cea95dcd27c2..2ff26f53cd62 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -169,7 +169,8 @@
enum hv_isolation_type {
HV_ISOLATION_TYPE_NONE = 0,
HV_ISOLATION_TYPE_VBS = 1,
- HV_ISOLATION_TYPE_SNP = 2
+ HV_ISOLATION_TYPE_SNP = 2,
+ HV_ISOLATION_TYPE_TDX = 3
};
/* Hyper-V specific model specific registers (MSRs) */
@@ -301,6 +302,13 @@ enum hv_isolation_type {
#define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT
#define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC
+/*
+ * Registers are only accessible via HVCALL_GET_VP_REGISTERS hvcall and
+ * there is not associated MSR address.
+ */
+#define HV_X64_REGISTER_VSM_VP_STATUS 0x000D0003
+#define HV_X64_VTL_MASK GENMASK(3, 0)
+
/* Hyper-V memory host visibility */
enum hv_mem_host_visibility {
VMBUS_PAGE_NOT_VISIBLE = 0,
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index fa83d88e4c99..033b53f993c6 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -26,6 +26,7 @@
union hv_ghcb;
DECLARE_STATIC_KEY_FALSE(isolation_type_snp);
+DECLARE_STATIC_KEY_FALSE(isolation_type_tdx);
typedef int (*hyperv_fill_flush_list_func)(
struct hv_guest_mapping_flush_list *flush,
@@ -40,6 +41,7 @@ static inline unsigned char hv_get_nmi_reason(void)
#if IS_ENABLED(CONFIG_HYPERV)
extern int hyperv_init_cpuhp;
+extern bool hyperv_paravisor_present;
extern void *hv_hypercall_pg;
@@ -47,10 +49,25 @@ extern u64 hv_current_partition_id;
extern union hv_ghcb * __percpu *hv_ghcb_pg;
+bool hv_isolation_type_snp(void);
+bool hv_isolation_type_tdx(void);
+u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2);
+
+/*
+ * DEFAULT INIT GPAT and SEGMENT LIMIT value in struct VMSA
+ * to start AP in enlightened SEV guest.
+ */
+#define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL
+#define HV_AP_SEGMENT_LIMIT 0xffffffff
+
int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages);
int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id);
int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags);
+/*
+ * If the hypercall involves no input or output parameters, the hypervisor
+ * ignores the corresponding GPA pointer.
+ */
static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
{
u64 input_address = input ? virt_to_phys(input) : 0;
@@ -58,6 +75,19 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
u64 hv_status;
#ifdef CONFIG_X86_64
+ if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
+ return hv_tdx_hypercall(control, input_address, output_address);
+
+ if (hv_isolation_type_snp() && !hyperv_paravisor_present) {
+ __asm__ __volatile__("mov %4, %%r8\n"
+ "vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input_address)
+ : "r" (output_address)
+ : "cc", "memory", "r8", "r9", "r10", "r11");
+ return hv_status;
+ }
+
if (!hv_hypercall_pg)
return U64_MAX;
@@ -101,7 +131,16 @@ static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1)
u64 hv_status;
#ifdef CONFIG_X86_64
- {
+ if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
+ return hv_tdx_hypercall(control, input1, 0);
+
+ if (hv_isolation_type_snp() && !hyperv_paravisor_present) {
+ __asm__ __volatile__(
+ "vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ :: "cc", "r8", "r9", "r10", "r11");
+ } else {
__asm__ __volatile__(CALL_NOSPEC
: "=a" (hv_status), ASM_CALL_CONSTRAINT,
"+c" (control), "+d" (input1)
@@ -146,7 +185,17 @@ static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2)
u64 hv_status;
#ifdef CONFIG_X86_64
- {
+ if (hv_isolation_type_tdx() && !hyperv_paravisor_present)
+ return hv_tdx_hypercall(control, input1, input2);
+
+ if (hv_isolation_type_snp() && !hyperv_paravisor_present) {
+ __asm__ __volatile__("mov %4, %%r8\n"
+ "vmmcall"
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+ "+c" (control), "+d" (input1)
+ : "r" (input2)
+ : "cc", "r8", "r9", "r10", "r11");
+ } else {
__asm__ __volatile__("mov %4, %%r8\n"
CALL_NOSPEC
: "=a" (hv_status), ASM_CALL_CONSTRAINT,
@@ -225,20 +274,24 @@ int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector,
int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
#ifdef CONFIG_AMD_MEM_ENCRYPT
-void hv_ghcb_msr_write(u64 msr, u64 value);
-void hv_ghcb_msr_read(u64 msr, u64 *value);
bool hv_ghcb_negotiate_protocol(void);
void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason);
-void hv_vtom_init(void);
+int hv_snp_boot_ap(int cpu, unsigned long start_ip);
#else
-static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
-static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
static inline bool hv_ghcb_negotiate_protocol(void) { return false; }
static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {}
-static inline void hv_vtom_init(void) {}
+static inline int hv_snp_boot_ap(int cpu, unsigned long start_ip) { return 0; }
#endif
-extern bool hv_isolation_type_snp(void);
+#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
+void hv_vtom_init(void);
+void hv_ivm_msr_write(u64 msr, u64 value);
+void hv_ivm_msr_read(u64 msr, u64 *value);
+#else
+static inline void hv_vtom_init(void) {}
+static inline void hv_ivm_msr_write(u64 msr, u64 value) {}
+static inline void hv_ivm_msr_read(u64 msr, u64 *value) {}
+#endif
static inline bool hv_is_synic_reg(unsigned int reg)
{
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 0100468e72ca..e6bba12c759c 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -32,6 +32,7 @@
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
#include <asm/numa.h>
+#include <asm/svm.h>
/* Is Linux running as the root partition? */
bool hv_root_partition;
@@ -39,6 +40,10 @@ bool hv_root_partition;
bool hv_nested;
struct ms_hyperv_info ms_hyperv;
+/* Used in modules via hv_do_hypercall(): see arch/x86/include/asm/mshyperv.h */
+bool hyperv_paravisor_present __ro_after_init;
+EXPORT_SYMBOL_GPL(hyperv_paravisor_present);
+
#if IS_ENABLED(CONFIG_HYPERV)
static inline unsigned int hv_get_nested_reg(unsigned int reg)
{
@@ -65,8 +70,8 @@ u64 hv_get_non_nested_register(unsigned int reg)
{
u64 value;
- if (hv_is_synic_reg(reg) && hv_isolation_type_snp())
- hv_ghcb_msr_read(reg, &value);
+ if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present)
+ hv_ivm_msr_read(reg, &value);
else
rdmsrl(reg, value);
return value;
@@ -75,8 +80,8 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_register);
void hv_set_non_nested_register(unsigned int reg, u64 value)
{
- if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) {
- hv_ghcb_msr_write(reg, value);
+ if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) {
+ hv_ivm_msr_write(reg, value);
/* Write proxy bit via wrmsl instruction */
if (hv_is_sint_reg(reg))
@@ -295,6 +300,15 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
native_smp_prepare_cpus(max_cpus);
+ /*
+ * Override wakeup_secondary_cpu_64 callback for SEV-SNP
+ * enlightened guest.
+ */
+ if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) {
+ apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap;
+ return;
+ }
+
#ifdef CONFIG_X86_64
for_each_present_cpu(i) {
if (i == 0)
@@ -313,6 +327,26 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
}
#endif
+/*
+ * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the
+ * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0
+ * interrupts can't work because request_irq() -> ... -> irq_to_desc() returns
+ * NULL for ttyS0. This happens because mp_config_acpi_legacy_irqs() sees a
+ * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and
+ * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs
+ * from the array and hence doesn't create the necessary irq description info.
+ *
+ * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here,
+ * except don't change 'legacy_pic', which keeps its default value
+ * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero
+ * nr_legacy_irqs() and eventually serial console interrupts works properly.
+ */
+static void __init reduced_hw_init(void)
+{
+ x86_init.timers.timer_init = x86_init_noop;
+ x86_init.irqs.pre_vector_init = x86_init_noop;
+}
+
static void __init ms_hyperv_init_platform(void)
{
int hv_max_functions_eax;
@@ -399,11 +433,33 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.shared_gpa_boundary =
BIT_ULL(ms_hyperv.shared_gpa_boundary_bits);
+ hyperv_paravisor_present = !!ms_hyperv.paravisor_present;
+
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
- if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)
+
+ if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) {
static_branch_enable(&isolation_type_snp);
+ } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) {
+ static_branch_enable(&isolation_type_tdx);
+
+ /* A TDX VM must use x2APIC and doesn't use lazy EOI. */
+ ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED;
+
+ if (!ms_hyperv.paravisor_present) {
+ /* To be supported: more work is required. */
+ ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE;
+
+ /* HV_REGISTER_CRASH_CTL is unsupported. */
+ ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE;
+
+ /* Don't trust Hyper-V's TLB-flushing hypercalls. */
+ ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
+
+ x86_init.acpi.reduced_hw_early_init = reduced_hw_init;
+ }
+ }
}
if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) {
@@ -473,7 +529,7 @@ static void __init ms_hyperv_init_platform(void)
#if IS_ENABLED(CONFIG_HYPERV)
if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) ||
- (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP))
+ ms_hyperv.paravisor_present)
hv_vtom_init();
/*
* Setup the hook to get control post apic initialization.
@@ -497,7 +553,8 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
- if (hv_root_partition)
+ if (hv_root_partition ||
+ (!ms_hyperv.paravisor_present && hv_isolation_type_snp()))
smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
@@ -560,6 +617,22 @@ static bool __init ms_hyperv_msi_ext_dest_id(void)
return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
}
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* RAX and CPL are already in the GHCB */
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_r8(ghcb, regs->r8);
+}
+
+static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* No checking of the return state needed */
+ return true;
+}
+#endif
+
const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
@@ -567,4 +640,8 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.init.x2apic_available = ms_hyperv_x2apic_available,
.init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
.init.init_platform = ms_hyperv_init_platform,
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish,
+#endif
};
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index e56307a81f4d..8ff7cd4e20bb 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -390,7 +390,7 @@ static __always_inline u64 read_hv_clock_msr(void)
static union {
struct ms_hyperv_tsc_page page;
u8 reserved[PAGE_SIZE];
-} tsc_pg __aligned(PAGE_SIZE);
+} tsc_pg __bss_decrypted __aligned(PAGE_SIZE);
static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page;
static unsigned long tsc_pfn;
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index ebf15f31d97e..3cabeeabb1ca 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -98,6 +98,7 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
*/
if (version >= VERSION_WIN10_V5) {
msg->msg_sint = VMBUS_MESSAGE_SINT;
+ msg->msg_vtl = ms_hyperv.vtl;
vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID_4;
} else {
msg->interrupt_page = virt_to_phys(vmbus_connection.int_page);
@@ -482,10 +483,17 @@ void vmbus_set_event(struct vmbus_channel *channel)
++channel->sig_events;
- if (hv_isolation_type_snp())
- hv_ghcb_hypercall(HVCALL_SIGNAL_EVENT, &channel->sig_event,
- NULL, sizeof(channel->sig_event));
- else
+ if (ms_hyperv.paravisor_present) {
+ if (hv_isolation_type_snp())
+ hv_ghcb_hypercall(HVCALL_SIGNAL_EVENT, &channel->sig_event,
+ NULL, sizeof(channel->sig_event));
+ else if (hv_isolation_type_tdx())
+ hv_tdx_hypercall(HVCALL_SIGNAL_EVENT | HV_HYPERCALL_FAST_BIT,
+ channel->sig_event, 0);
+ else
+ WARN_ON_ONCE(1);
+ } else {
hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, channel->sig_event);
+ }
}
EXPORT_SYMBOL_GPL(vmbus_set_event);
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index de6708dbe0df..51e5018ac9b2 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -20,6 +20,7 @@
#include <linux/interrupt.h>
#include <clocksource/hyperv_timer.h>
#include <asm/mshyperv.h>
+#include <linux/set_memory.h>
#include "hyperv_vmbus.h"
/* The one and only */
@@ -56,20 +57,37 @@ int hv_post_message(union hv_connection_id connection_id,
local_irq_save(flags);
- aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg);
+ /*
+ * A TDX VM with the paravisor must use the decrypted post_msg_page: see
+ * the comment in struct hv_per_cpu_context. A SNP VM with the paravisor
+ * can use the encrypted hyperv_pcpu_input_arg because it copies the
+ * input into the GHCB page, which has been decrypted by the paravisor.
+ */
+ if (hv_isolation_type_tdx() && ms_hyperv.paravisor_present)
+ aligned_msg = this_cpu_ptr(hv_context.cpu_context)->post_msg_page;
+ else
+ aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
aligned_msg->connectionid = connection_id;
aligned_msg->reserved = 0;
aligned_msg->message_type = message_type;
aligned_msg->payload_size = payload_size;
memcpy((void *)aligned_msg->payload, payload, payload_size);
- if (hv_isolation_type_snp())
- status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE,
- (void *)aligned_msg, NULL,
- sizeof(*aligned_msg));
- else
+ if (ms_hyperv.paravisor_present) {
+ if (hv_isolation_type_tdx())
+ status = hv_tdx_hypercall(HVCALL_POST_MESSAGE,
+ virt_to_phys(aligned_msg), 0);
+ else if (hv_isolation_type_snp())
+ status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE,
+ aligned_msg, NULL,
+ sizeof(*aligned_msg));
+ else
+ status = HV_STATUS_INVALID_PARAMETER;
+ } else {
status = hv_do_hypercall(HVCALL_POST_MESSAGE,
aligned_msg, NULL);
+ }
local_irq_restore(flags);
@@ -78,7 +96,7 @@ int hv_post_message(union hv_connection_id connection_id,
int hv_synic_alloc(void)
{
- int cpu;
+ int cpu, ret = -ENOMEM;
struct hv_per_cpu_context *hv_cpu;
/*
@@ -104,11 +122,29 @@ int hv_synic_alloc(void)
tasklet_init(&hv_cpu->msg_dpc,
vmbus_on_msg_dpc, (unsigned long) hv_cpu);
+ if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
+ hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC);
+ if (hv_cpu->post_msg_page == NULL) {
+ pr_err("Unable to allocate post msg page\n");
+ goto err;
+ }
+
+ ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1);
+ if (ret) {
+ pr_err("Failed to decrypt post msg page: %d\n", ret);
+ /* Just leak the page, as it's unsafe to free the page. */
+ hv_cpu->post_msg_page = NULL;
+ goto err;
+ }
+
+ memset(hv_cpu->post_msg_page, 0, PAGE_SIZE);
+ }
+
/*
* Synic message and event pages are allocated by paravisor.
* Skip these pages allocation here.
*/
- if (!hv_isolation_type_snp() && !hv_root_partition) {
+ if (!ms_hyperv.paravisor_present && !hv_root_partition) {
hv_cpu->synic_message_page =
(void *)get_zeroed_page(GFP_ATOMIC);
if (hv_cpu->synic_message_page == NULL) {
@@ -120,29 +156,96 @@ int hv_synic_alloc(void)
(void *)get_zeroed_page(GFP_ATOMIC);
if (hv_cpu->synic_event_page == NULL) {
pr_err("Unable to allocate SYNIC event page\n");
+
+ free_page((unsigned long)hv_cpu->synic_message_page);
+ hv_cpu->synic_message_page = NULL;
goto err;
}
}
+
+ if (!ms_hyperv.paravisor_present &&
+ (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
+ ret = set_memory_decrypted((unsigned long)
+ hv_cpu->synic_message_page, 1);
+ if (ret) {
+ pr_err("Failed to decrypt SYNIC msg page: %d\n", ret);
+ hv_cpu->synic_message_page = NULL;
+
+ /*
+ * Free the event page here so that hv_synic_free()
+ * won't later try to re-encrypt it.
+ */
+ free_page((unsigned long)hv_cpu->synic_event_page);
+ hv_cpu->synic_event_page = NULL;
+ goto err;
+ }
+
+ ret = set_memory_decrypted((unsigned long)
+ hv_cpu->synic_event_page, 1);
+ if (ret) {
+ pr_err("Failed to decrypt SYNIC event page: %d\n", ret);
+ hv_cpu->synic_event_page = NULL;
+ goto err;
+ }
+
+ memset(hv_cpu->synic_message_page, 0, PAGE_SIZE);
+ memset(hv_cpu->synic_event_page, 0, PAGE_SIZE);
+ }
}
return 0;
+
err:
/*
* Any memory allocations that succeeded will be freed when
* the caller cleans up by calling hv_synic_free()
*/
- return -ENOMEM;
+ return ret;
}
void hv_synic_free(void)
{
- int cpu;
+ int cpu, ret;
for_each_present_cpu(cpu) {
struct hv_per_cpu_context *hv_cpu
= per_cpu_ptr(hv_context.cpu_context, cpu);
+ /* It's better to leak the page if the encryption fails. */
+ if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) {
+ if (hv_cpu->post_msg_page) {
+ ret = set_memory_encrypted((unsigned long)
+ hv_cpu->post_msg_page, 1);
+ if (ret) {
+ pr_err("Failed to encrypt post msg page: %d\n", ret);
+ hv_cpu->post_msg_page = NULL;
+ }
+ }
+ }
+
+ if (!ms_hyperv.paravisor_present &&
+ (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
+ if (hv_cpu->synic_message_page) {
+ ret = set_memory_encrypted((unsigned long)
+ hv_cpu->synic_message_page, 1);
+ if (ret) {
+ pr_err("Failed to encrypt SYNIC msg page: %d\n", ret);
+ hv_cpu->synic_message_page = NULL;
+ }
+ }
+
+ if (hv_cpu->synic_event_page) {
+ ret = set_memory_encrypted((unsigned long)
+ hv_cpu->synic_event_page, 1);
+ if (ret) {
+ pr_err("Failed to encrypt SYNIC event page: %d\n", ret);
+ hv_cpu->synic_event_page = NULL;
+ }
+ }
+ }
+
+ free_page((unsigned long)hv_cpu->post_msg_page);
free_page((unsigned long)hv_cpu->synic_event_page);
free_page((unsigned long)hv_cpu->synic_message_page);
}
@@ -170,7 +273,7 @@ void hv_synic_enable_regs(unsigned int cpu)
simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP);
simp.simp_enabled = 1;
- if (hv_isolation_type_snp() || hv_root_partition) {
+ if (ms_hyperv.paravisor_present || hv_root_partition) {
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
@@ -189,7 +292,7 @@ void hv_synic_enable_regs(unsigned int cpu)
siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP);
siefp.siefp_enabled = 1;
- if (hv_isolation_type_snp() || hv_root_partition) {
+ if (ms_hyperv.paravisor_present || hv_root_partition) {
/* Mask out vTOM bit. ioremap_cache() maps decrypted */
u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) &
~ms_hyperv.shared_gpa_boundary;
@@ -272,7 +375,7 @@ void hv_synic_disable_regs(unsigned int cpu)
* addresses.
*/
simp.simp_enabled = 0;
- if (hv_isolation_type_snp() || hv_root_partition) {
+ if (ms_hyperv.paravisor_present || hv_root_partition) {
iounmap(hv_cpu->synic_message_page);
hv_cpu->synic_message_page = NULL;
} else {
@@ -284,7 +387,7 @@ void hv_synic_disable_regs(unsigned int cpu)
siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP);
siefp.siefp_enabled = 0;
- if (hv_isolation_type_snp() || hv_root_partition) {
+ if (ms_hyperv.paravisor_present || hv_root_partition) {
iounmap(hv_cpu->synic_event_page);
hv_cpu->synic_event_page = NULL;
} else {
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 0d7a3ba66396..e000fa3b9f97 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -8,6 +8,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/cleanup.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mman.h>
@@ -646,7 +647,7 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
void *v)
{
struct memory_notify *mem = (struct memory_notify *)v;
- unsigned long flags, pfn_count;
+ unsigned long pfn_count;
switch (val) {
case MEM_ONLINE:
@@ -655,21 +656,22 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
break;
case MEM_OFFLINE:
- spin_lock_irqsave(&dm_device.ha_lock, flags);
- pfn_count = hv_page_offline_check(mem->start_pfn,
- mem->nr_pages);
- if (pfn_count <= dm_device.num_pages_onlined) {
- dm_device.num_pages_onlined -= pfn_count;
- } else {
- /*
- * We're offlining more pages than we managed to online.
- * This is unexpected. In any case don't let
- * num_pages_onlined wrap around zero.
- */
- WARN_ON_ONCE(1);
- dm_device.num_pages_onlined = 0;
+ scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
+ pfn_count = hv_page_offline_check(mem->start_pfn,
+ mem->nr_pages);
+ if (pfn_count <= dm_device.num_pages_onlined) {
+ dm_device.num_pages_onlined -= pfn_count;
+ } else {
+ /*
+ * We're offlining more pages than we
+ * managed to online. This is
+ * unexpected. In any case don't let
+ * num_pages_onlined wrap around zero.
+ */
+ WARN_ON_ONCE(1);
+ dm_device.num_pages_onlined = 0;
+ }
}
- spin_unlock_irqrestore(&dm_device.ha_lock, flags);
break;
case MEM_GOING_ONLINE:
case MEM_GOING_OFFLINE:
@@ -721,24 +723,23 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
unsigned long start_pfn;
unsigned long processed_pfn;
unsigned long total_pfn = pfn_count;
- unsigned long flags;
for (i = 0; i < (size/HA_CHUNK); i++) {
start_pfn = start + (i * HA_CHUNK);
- spin_lock_irqsave(&dm_device.ha_lock, flags);
- has->ha_end_pfn += HA_CHUNK;
+ scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
+ has->ha_end_pfn += HA_CHUNK;
- if (total_pfn > HA_CHUNK) {
- processed_pfn = HA_CHUNK;
- total_pfn -= HA_CHUNK;
- } else {
- processed_pfn = total_pfn;
- total_pfn = 0;
- }
+ if (total_pfn > HA_CHUNK) {
+ processed_pfn = HA_CHUNK;
+ total_pfn -= HA_CHUNK;
+ } else {
+ processed_pfn = total_pfn;
+ total_pfn = 0;
+ }
- has->covered_end_pfn += processed_pfn;
- spin_unlock_irqrestore(&dm_device.ha_lock, flags);
+ has->covered_end_pfn += processed_pfn;
+ }
reinit_completion(&dm_device.ol_waitevent);
@@ -758,10 +759,10 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
*/
do_hot_add = false;
}
- spin_lock_irqsave(&dm_device.ha_lock, flags);
- has->ha_end_pfn -= HA_CHUNK;
- has->covered_end_pfn -= processed_pfn;
- spin_unlock_irqrestore(&dm_device.ha_lock, flags);
+ scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
+ has->ha_end_pfn -= HA_CHUNK;
+ has->covered_end_pfn -= processed_pfn;
+ }
break;
}
@@ -781,10 +782,9 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
static void hv_online_page(struct page *pg, unsigned int order)
{
struct hv_hotadd_state *has;
- unsigned long flags;
unsigned long pfn = page_to_pfn(pg);
- spin_lock_irqsave(&dm_device.ha_lock, flags);
+ guard(spinlock_irqsave)(&dm_device.ha_lock);
list_for_each_entry(has, &dm_device.ha_region_list, list) {
/* The page belongs to a different HAS. */
if ((pfn < has->start_pfn) ||
@@ -794,7 +794,6 @@ static void hv_online_page(struct page *pg, unsigned int order)
hv_bring_pgs_online(has, pfn, 1UL << order);
break;
}
- spin_unlock_irqrestore(&dm_device.ha_lock, flags);
}
static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
@@ -803,9 +802,8 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
struct hv_hotadd_gap *gap;
unsigned long residual, new_inc;
int ret = 0;
- unsigned long flags;
- spin_lock_irqsave(&dm_device.ha_lock, flags);
+ guard(spinlock_irqsave)(&dm_device.ha_lock);
list_for_each_entry(has, &dm_device.ha_region_list, list) {
/*
* If the pfn range we are dealing with is not in the current
@@ -852,7 +850,6 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
ret = 1;
break;
}
- spin_unlock_irqrestore(&dm_device.ha_lock, flags);
return ret;
}
@@ -947,7 +944,6 @@ static unsigned long process_hot_add(unsigned long pg_start,
{
struct hv_hotadd_state *ha_region = NULL;
int covered;
- unsigned long flags;
if (pfn_cnt == 0)
return 0;
@@ -979,9 +975,9 @@ static unsigned long process_hot_add(unsigned long pg_start,
ha_region->covered_end_pfn = pg_start;
ha_region->end_pfn = rg_start + rg_size;
- spin_lock_irqsave(&dm_device.ha_lock, flags);
- list_add_tail(&ha_region->list, &dm_device.ha_region_list);
- spin_unlock_irqrestore(&dm_device.ha_lock, flags);
+ scoped_guard(spinlock_irqsave, &dm_device.ha_lock) {
+ list_add_tail(&ha_region->list, &dm_device.ha_region_list);
+ }
}
do_pg_range:
@@ -2047,7 +2043,6 @@ static void balloon_remove(struct hv_device *dev)
struct hv_dynmem_device *dm = hv_get_drvdata(dev);
struct hv_hotadd_state *has, *tmp;
struct hv_hotadd_gap *gap, *tmp_gap;
- unsigned long flags;
if (dm->num_pages_ballooned != 0)
pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);
@@ -2073,7 +2068,7 @@ static void balloon_remove(struct hv_device *dev)
#endif
}
- spin_lock_irqsave(&dm_device.ha_lock, flags);
+ guard(spinlock_irqsave)(&dm_device.ha_lock);
list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) {
list_del(&gap->list);
@@ -2082,7 +2077,6 @@ static void balloon_remove(struct hv_device *dev)
list_del(&has->list);
kfree(has);
}
- spin_unlock_irqrestore(&dm_device.ha_lock, flags);
}
static int balloon_suspend(struct hv_device *hv_dev)
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index 6a2258fef1fe..ccad7bca3fd3 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -24,6 +24,7 @@
#include <linux/kmsg_dump.h>
#include <linux/slab.h>
#include <linux/dma-map-ops.h>
+#include <linux/set_memory.h>
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
@@ -359,6 +360,8 @@ int hv_common_cpu_init(unsigned int cpu)
u64 msr_vp_index;
gfp_t flags;
int pgcount = hv_root_partition ? 2 : 1;
+ void *mem;
+ int ret;
/* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
flags = irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL;
@@ -370,14 +373,41 @@ int hv_common_cpu_init(unsigned int cpu)
* allocated if this CPU was previously online and then taken offline
*/
if (!*inputarg) {
- *inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
- if (!(*inputarg))
+ mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags);
+ if (!mem)
return -ENOMEM;
if (hv_root_partition) {
outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
- *outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE;
+ *outputarg = (char *)mem + HV_HYP_PAGE_SIZE;
+ }
+
+ if (!ms_hyperv.paravisor_present &&
+ (hv_isolation_type_snp() || hv_isolation_type_tdx())) {
+ ret = set_memory_decrypted((unsigned long)mem, pgcount);
+ if (ret) {
+ /* It may be unsafe to free 'mem' */
+ return ret;
+ }
+
+ memset(mem, 0x00, pgcount * HV_HYP_PAGE_SIZE);
}
+
+ /*
+ * In a fully enlightened TDX/SNP VM with more than 64 VPs, if
+ * hyperv_pcpu_input_arg is not NULL, set_memory_decrypted() ->
+ * ... -> cpa_flush()-> ... -> __send_ipi_mask_ex() tries to
+ * use hyperv_pcpu_input_arg as the hypercall input page, which
+ * must be a decrypted page in such a VM, but the page is still
+ * encrypted before set_memory_decrypted() returns. Fix this by
+ * setting *inputarg after the above set_memory_decrypted(): if
+ * hyperv_pcpu_input_arg is NULL, __send_ipi_mask_ex() returns
+ * HV_STATUS_INVALID_PARAMETER immediately, and the function
+ * hv_send_ipi_mask() falls back to orig_apic.send_IPI_mask(),
+ * which may be slightly slower than the hypercall, but still
+ * works correctly in such a VM.
+ */
+ *inputarg = mem;
}
msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX);
@@ -502,6 +532,12 @@ bool __weak hv_isolation_type_snp(void)
}
EXPORT_SYMBOL_GPL(hv_isolation_type_snp);
+bool __weak hv_isolation_type_tdx(void)
+{
+ return false;
+}
+EXPORT_SYMBOL_GPL(hv_isolation_type_tdx);
+
void __weak hv_setup_vmbus_handler(void (*handler)(void))
{
}
@@ -542,3 +578,9 @@ u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_s
return HV_STATUS_INVALID_PARAMETER;
}
EXPORT_SYMBOL_GPL(hv_ghcb_hypercall);
+
+u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
+{
+ return HV_STATUS_INVALID_PARAMETER;
+}
+EXPORT_SYMBOL_GPL(hv_tdx_hypercall);
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 55f2086841ae..f6b1e710f805 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -124,6 +124,17 @@ struct hv_per_cpu_context {
void *synic_event_page;
/*
+ * The page is only used in hv_post_message() for a TDX VM (with the
+ * paravisor) to post a messages to Hyper-V: when such a VM calls
+ * HVCALL_POST_MESSAGE, it can't use the hyperv_pcpu_input_arg (which
+ * is encrypted in such a VM) as the hypercall input page, because
+ * the input page for HVCALL_POST_MESSAGE must be decrypted in such a
+ * VM, so post_msg_page (which is decrypted in hv_synic_alloc()) is
+ * introduced for this purpose. See hyperv_init() for more comments.
+ */
+ void *post_msg_page;
+
+ /*
* Starting with win8, we can take channel interrupts on any CPU;
* we will manage the tasklet that handles events messages on a per CPU
* basis.
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 67f95a29aeca..edbb38f6956b 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -2287,7 +2287,8 @@ static int vmbus_acpi_add(struct platform_device *pdev)
* Some ancestor of the vmbus acpi device (Gen1 or Gen2
* firmware) is the VMOD that has the mmio ranges. Get that.
*/
- for (ancestor = acpi_dev_parent(device); ancestor;
+ for (ancestor = acpi_dev_parent(device);
+ ancestor && ancestor->handle != ACPI_ROOT_OBJECT;
ancestor = acpi_dev_parent(ancestor)) {
result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS,
vmbus_walk_resources, NULL);
diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h
index f4e4cc4f965f..fdac4a1714ec 100644
--- a/include/asm-generic/hyperv-tlfs.h
+++ b/include/asm-generic/hyperv-tlfs.h
@@ -223,6 +223,7 @@ enum HV_GENERIC_SET_FORMAT {
#define HV_STATUS_INVALID_PORT_ID 17
#define HV_STATUS_INVALID_CONNECTION_ID 18
#define HV_STATUS_INSUFFICIENT_BUFFERS 19
+#define HV_STATUS_TIME_OUT 120
#define HV_STATUS_VTL_ALREADY_ENABLED 134
/*
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index a8f4b653ef4e..cecd2b7bd033 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -36,18 +36,25 @@ struct ms_hyperv_info {
u32 nested_features;
u32 max_vp_index;
u32 max_lp_index;
- u32 isolation_config_a;
+ union {
+ u32 isolation_config_a;
+ struct {
+ u32 paravisor_present : 1;
+ u32 reserved_a1 : 31;
+ };
+ };
union {
u32 isolation_config_b;
struct {
u32 cvm_type : 4;
- u32 reserved1 : 1;
+ u32 reserved_b1 : 1;
u32 shared_gpa_boundary_active : 1;
u32 shared_gpa_boundary_bits : 6;
- u32 reserved2 : 20;
+ u32 reserved_b2 : 20;
};
};
u64 shared_gpa_boundary;
+ u8 vtl;
};
extern struct ms_hyperv_info ms_hyperv;
extern bool hv_nested;
@@ -57,7 +64,8 @@ extern void * __percpu *hyperv_pcpu_output_arg;
extern u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr);
extern u64 hv_do_fast_hypercall8(u16 control, u64 input8);
-extern bool hv_isolation_type_snp(void);
+bool hv_isolation_type_snp(void);
+bool hv_isolation_type_tdx(void);
/* Helper functions that provide a consistent pattern for checking Hyper-V hypercall status. */
static inline int hv_result(u64 status)
@@ -274,6 +282,7 @@ enum hv_isolation_type hv_get_isolation_type(void);
bool hv_is_isolation_supported(void);
bool hv_isolation_type_snp(void);
u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size);
+u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2);
void hyperv_cleanup(void);
bool hv_query_ext_cap(u64 cap_query);
void hv_setup_dma_ops(struct device *dev, bool coherent);
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 3ac3974b3c78..2b00faf98017 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -348,7 +348,7 @@ struct vmtransfer_page_packet_header {
u8 sender_owns_set;
u8 reserved;
u32 range_cnt;
- struct vmtransfer_page_range ranges[1];
+ struct vmtransfer_page_range ranges[];
} __packed;
struct vmgpadl_packet_header {
@@ -665,8 +665,8 @@ struct vmbus_channel_initiate_contact {
u64 interrupt_page;
struct {
u8 msg_sint;
- u8 padding1[3];
- u32 padding2;
+ u8 msg_vtl;
+ u8 reserved[6];
};
};
u64 monitor_page1;