summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r--arch/x86/kvm/vmx/capabilities.h10
-rw-r--r--arch/x86/kvm/vmx/common.h182
-rw-r--r--arch/x86/kvm/vmx/hyperv.c1
-rw-r--r--arch/x86/kvm/vmx/hyperv.h2
-rw-r--r--arch/x86/kvm/vmx/hyperv_evmcs.h2
-rw-r--r--arch/x86/kvm/vmx/main.c1113
-rw-r--r--arch/x86/kvm/vmx/nested.c480
-rw-r--r--arch/x86/kvm/vmx/nested.h30
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c115
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.h28
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c97
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h101
-rw-r--r--arch/x86/kvm/vmx/sgx.c30
-rw-r--r--arch/x86/kvm/vmx/tdx.c3526
-rw-r--r--arch/x86/kvm/vmx/tdx.h204
-rw-r--r--arch/x86/kvm/vmx/tdx_arch.h167
-rw-r--r--arch/x86/kvm/vmx/tdx_errno.h40
-rw-r--r--arch/x86/kvm/vmx/vmcs.h5
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h14
-rw-r--r--arch/x86/kvm/vmx/vmx.c1570
-rw-r--r--arch/x86/kvm/vmx/vmx.h171
-rw-r--r--arch/x86/kvm/vmx/vmx_onhyperv.h10
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h36
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h233
24 files changed, 6859 insertions, 1308 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 41a4533f9989..cb6588238f46 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -54,9 +54,7 @@ struct nested_vmx_msrs {
};
struct vmcs_config {
- int size;
- u32 basic_cap;
- u32 revision_id;
+ u64 basic;
u32 pin_based_exec_ctrl;
u32 cpu_based_exec_ctrl;
u32 cpu_based_2nd_exec_ctrl;
@@ -76,7 +74,7 @@ extern struct vmx_capability vmx_capability __ro_after_init;
static inline bool cpu_has_vmx_basic_inout(void)
{
- return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+ return vmcs_config.basic & VMX_BASIC_INOUT;
}
static inline bool cpu_has_virtual_nmis(void)
@@ -225,7 +223,7 @@ static inline bool cpu_has_vmx_vmfunc(void)
static inline bool cpu_has_vmx_shadow_vmcs(void)
{
/* check if the cpu supports writing r/o exit information fields */
- if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
+ if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
return false;
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -367,7 +365,7 @@ static inline bool cpu_has_vmx_invvpid_global(void)
static inline bool cpu_has_vmx_intel_pt(void)
{
- return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) &&
+ return (vmcs_config.misc & VMX_MISC_INTEL_PT) &&
(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) &&
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
}
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
new file mode 100644
index 000000000000..8f46a06e2c44
--- /dev/null
+++ b/arch/x86/kvm/vmx/common.h
@@ -0,0 +1,182 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __KVM_X86_VMX_COMMON_H
+#define __KVM_X86_VMX_COMMON_H
+
+#include <linux/kvm_host.h>
+#include <asm/posted_intr.h>
+
+#include "mmu.h"
+
+union vmx_exit_reason {
+ struct {
+ u32 basic : 16;
+ u32 reserved16 : 1;
+ u32 reserved17 : 1;
+ u32 reserved18 : 1;
+ u32 reserved19 : 1;
+ u32 reserved20 : 1;
+ u32 reserved21 : 1;
+ u32 reserved22 : 1;
+ u32 reserved23 : 1;
+ u32 reserved24 : 1;
+ u32 reserved25 : 1;
+ u32 bus_lock_detected : 1;
+ u32 enclave_mode : 1;
+ u32 smi_pending_mtf : 1;
+ u32 smi_from_vmx_root : 1;
+ u32 reserved30 : 1;
+ u32 failed_vmentry : 1;
+ };
+ u32 full;
+};
+
+struct vcpu_vt {
+ /* Posted interrupt descriptor */
+ struct pi_desc pi_desc;
+
+ /* Used if this vCPU is waiting for PI notification wakeup. */
+ struct list_head pi_wakeup_list;
+
+ union vmx_exit_reason exit_reason;
+
+ unsigned long exit_qualification;
+ u32 exit_intr_info;
+
+ /*
+ * If true, guest state has been loaded into hardware, and host state
+ * saved into vcpu_{vt,vmx,tdx}. If false, host state is loaded into
+ * hardware.
+ */
+ bool guest_state_loaded;
+ bool emulation_required;
+
+#ifdef CONFIG_X86_64
+ u64 msr_host_kernel_gs_base;
+#endif
+
+ unsigned long host_debugctlmsr;
+};
+
+#ifdef CONFIG_KVM_INTEL_TDX
+
+static __always_inline bool is_td(struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_TDX_VM;
+}
+
+static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu)
+{
+ return is_td(vcpu->kvm);
+}
+
+#else
+
+static inline bool is_td(struct kvm *kvm) { return false; }
+static inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; }
+
+#endif
+
+static inline bool vt_is_tdx_private_gpa(struct kvm *kvm, gpa_t gpa)
+{
+ /* For TDX the direct mask is the shared mask. */
+ return !kvm_is_addr_direct(kvm, gpa);
+}
+
+static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
+ unsigned long exit_qualification)
+{
+ u64 error_code;
+
+ /* Is it a read fault? */
+ error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
+ ? PFERR_USER_MASK : 0;
+ /* Is it a write fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
+ ? PFERR_WRITE_MASK : 0;
+ /* Is it a fetch fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
+ ? PFERR_FETCH_MASK : 0;
+ /* ept page table entry is present? */
+ error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
+ ? PFERR_PRESENT_MASK : 0;
+
+ if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+ error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+
+ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
+ return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+}
+
+static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+ int pi_vec)
+{
+#ifdef CONFIG_SMP
+ if (vcpu->mode == IN_GUEST_MODE) {
+ /*
+ * The vector of the virtual has already been set in the PIR.
+ * Send a notification event to deliver the virtual interrupt
+ * unless the vCPU is the currently running vCPU, i.e. the
+ * event is being sent from a fastpath VM-Exit handler, in
+ * which case the PIR will be synced to the vIRR before
+ * re-entering the guest.
+ *
+ * When the target is not the running vCPU, the following
+ * possibilities emerge:
+ *
+ * Case 1: vCPU stays in non-root mode. Sending a notification
+ * event posts the interrupt to the vCPU.
+ *
+ * Case 2: vCPU exits to root mode and is still runnable. The
+ * PIR will be synced to the vIRR before re-entering the guest.
+ * Sending a notification event is ok as the host IRQ handler
+ * will ignore the spurious event.
+ *
+ * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
+ * has already synced PIR to vIRR and never blocks the vCPU if
+ * the vIRR is not empty. Therefore, a blocked vCPU here does
+ * not wait for any requested interrupts in PIR, and sending a
+ * notification event also results in a benign, spurious event.
+ */
+
+ if (vcpu != kvm_get_running_vcpu())
+ __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+ return;
+ }
+#endif
+ /*
+ * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
+ * otherwise do nothing as KVM will grab the highest priority pending
+ * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+ */
+ kvm_vcpu_wake_up(vcpu);
+}
+
+/*
+ * Post an interrupt to a vCPU's PIR and trigger the vCPU to process the
+ * interrupt if necessary.
+ */
+static inline void __vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu,
+ struct pi_desc *pi_desc, int vector)
+{
+ if (pi_test_and_set_pir(vector, pi_desc))
+ return;
+
+ /* If a previous notification has sent the IPI, nothing to do. */
+ if (pi_test_and_set_on(pi_desc))
+ return;
+
+ /*
+ * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
+ * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
+ * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
+ * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
+ */
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
+}
+
+noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu);
+
+#endif /* __KVM_X86_VMX_COMMON_H */
diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
index fab6a1ad98dc..fa41d036acd4 100644
--- a/arch/x86/kvm/vmx/hyperv.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -4,6 +4,7 @@
#include <linux/errno.h>
#include <linux/smp.h>
+#include "x86.h"
#include "../cpuid.h"
#include "hyperv.h"
#include "nested.h"
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index a87407412615..11a339009781 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -42,7 +42,7 @@ static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx)
return vmx->nested.hv_evmcs;
}
-static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu)
+static inline bool guest_cpu_cap_has_evmcs(struct kvm_vcpu *vcpu)
{
/*
* eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and
diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h
index a543fccfc574..6536290f4274 100644
--- a/arch/x86/kvm/vmx/hyperv_evmcs.h
+++ b/arch/x86/kvm/vmx/hyperv_evmcs.h
@@ -6,7 +6,7 @@
#ifndef __KVM_X86_VMX_HYPERV_EVMCS_H
#define __KVM_X86_VMX_HYPERV_EVMCS_H
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include "capabilities.h"
#include "vmcs12.h"
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
new file mode 100644
index 000000000000..94d5d907d37b
--- /dev/null
+++ b/arch/x86/kvm/vmx/main.c
@@ -0,0 +1,1113 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/moduleparam.h>
+
+#include "x86_ops.h"
+#include "vmx.h"
+#include "mmu.h"
+#include "nested.h"
+#include "pmu.h"
+#include "posted_intr.h"
+#include "tdx.h"
+#include "tdx_arch.h"
+
+#ifdef CONFIG_KVM_INTEL_TDX
+static_assert(offsetof(struct vcpu_vmx, vt) == offsetof(struct vcpu_tdx, vt));
+#endif
+
+static void vt_disable_virtualization_cpu(void)
+{
+ /* Note, TDX *and* VMX need to be disabled if TDX is enabled. */
+ if (enable_tdx)
+ tdx_disable_virtualization_cpu();
+ vmx_disable_virtualization_cpu();
+}
+
+static __init int vt_hardware_setup(void)
+{
+ int ret;
+
+ ret = vmx_hardware_setup();
+ if (ret)
+ return ret;
+
+ /*
+ * Update vt_x86_ops::vm_size here so it is ready before
+ * kvm_ops_update() is called in kvm_x86_vendor_init().
+ *
+ * Note, the actual bringing up of TDX must be done after
+ * kvm_ops_update() because enabling TDX requires enabling
+ * hardware virtualization first, i.e., all online CPUs must
+ * be in post-VMXON state. This means the @vm_size here
+ * may be updated to TDX's size but TDX may fail to enable
+ * at later time.
+ *
+ * The VMX/VT code could update kvm_x86_ops::vm_size again
+ * after bringing up TDX, but this would require exporting
+ * either kvm_x86_ops or kvm_ops_update() from the base KVM
+ * module, which looks overkill. Anyway, the worst case here
+ * is KVM may allocate couple of more bytes than needed for
+ * each VM.
+ */
+ if (enable_tdx) {
+ vt_x86_ops.vm_size = max_t(unsigned int, vt_x86_ops.vm_size,
+ sizeof(struct kvm_tdx));
+ /*
+ * Note, TDX may fail to initialize in a later time in
+ * vt_init(), in which case it is not necessary to setup
+ * those callbacks. But making them valid here even
+ * when TDX fails to init later is fine because those
+ * callbacks won't be called if the VM isn't TDX guest.
+ */
+ vt_x86_ops.link_external_spt = tdx_sept_link_private_spt;
+ vt_x86_ops.set_external_spte = tdx_sept_set_private_spte;
+ vt_x86_ops.free_external_spt = tdx_sept_free_private_spt;
+ vt_x86_ops.remove_external_spte = tdx_sept_remove_private_spte;
+ vt_x86_ops.protected_apic_has_interrupt = tdx_protected_apic_has_interrupt;
+ }
+
+ return 0;
+}
+
+static int vt_vm_init(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return tdx_vm_init(kvm);
+
+ return vmx_vm_init(kvm);
+}
+
+static void vt_vm_pre_destroy(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return tdx_mmu_release_hkid(kvm);
+}
+
+static void vt_vm_destroy(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return tdx_vm_destroy(kvm);
+
+ vmx_vm_destroy(kvm);
+}
+
+static int vt_vcpu_precreate(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return 0;
+
+ return vmx_vcpu_precreate(kvm);
+}
+
+static int vt_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_create(vcpu);
+
+ return vmx_vcpu_create(vcpu);
+}
+
+static void vt_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_free(vcpu);
+ return;
+ }
+
+ vmx_vcpu_free(vcpu);
+}
+
+static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_reset(vcpu, init_event);
+ return;
+ }
+
+ vmx_vcpu_reset(vcpu, init_event);
+}
+
+static void vt_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_load(vcpu, cpu);
+ return;
+ }
+
+ vmx_vcpu_load(vcpu, cpu);
+}
+
+static void vt_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Basic TDX does not support feature PML. KVM does not enable PML in
+ * TD's VMCS, nor does it allocate or flush PML buffer for TDX.
+ */
+ if (WARN_ON_ONCE(is_td_vcpu(vcpu)))
+ return;
+
+ vmx_update_cpu_dirty_logging(vcpu);
+}
+
+static void vt_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_prepare_switch_to_guest(vcpu);
+ return;
+ }
+
+ vmx_prepare_switch_to_guest(vcpu);
+}
+
+static void vt_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_vcpu_put(vcpu);
+ return;
+ }
+
+ vmx_vcpu_put(vcpu);
+}
+
+static int vt_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_pre_run(vcpu);
+
+ return vmx_vcpu_pre_run(vcpu);
+}
+
+static fastpath_t vt_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_run(vcpu, force_immediate_exit);
+
+ return vmx_vcpu_run(vcpu, force_immediate_exit);
+}
+
+static int vt_handle_exit(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion fastpath)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_handle_exit(vcpu, fastpath);
+
+ return vmx_handle_exit(vcpu, fastpath);
+}
+
+static int vt_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ if (unlikely(is_td_vcpu(vcpu)))
+ return tdx_set_msr(vcpu, msr_info);
+
+ return vmx_set_msr(vcpu, msr_info);
+}
+
+/*
+ * The kvm parameter can be NULL (module initialization, or invocation before
+ * VM creation). Be sure to check the kvm parameter before using it.
+ */
+static bool vt_has_emulated_msr(struct kvm *kvm, u32 index)
+{
+ if (kvm && is_td(kvm))
+ return tdx_has_emulated_msr(index);
+
+ return vmx_has_emulated_msr(kvm, index);
+}
+
+static int vt_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ if (unlikely(is_td_vcpu(vcpu)))
+ return tdx_get_msr(vcpu, msr_info);
+
+ return vmx_get_msr(vcpu, msr_info);
+}
+
+static void vt_msr_filter_changed(struct kvm_vcpu *vcpu)
+{
+ /*
+ * TDX doesn't allow VMM to configure interception of MSR accesses.
+ * TDX guest requests MSR accesses by calling TDVMCALL. The MSR
+ * filters will be applied when handling the TDVMCALL for RDMSR/WRMSR
+ * if the userspace has set any.
+ */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_msr_filter_changed(vcpu);
+}
+
+static int vt_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_complete_emulated_msr(vcpu, err);
+
+ return kvm_complete_insn_gp(vcpu, err);
+}
+
+#ifdef CONFIG_KVM_SMM
+static int vt_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm))
+ return 0;
+
+ return vmx_smi_allowed(vcpu, for_injection);
+}
+
+static int vt_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+{
+ if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm))
+ return 0;
+
+ return vmx_enter_smm(vcpu, smram);
+}
+
+static int vt_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+{
+ if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm))
+ return 0;
+
+ return vmx_leave_smm(vcpu, smram);
+}
+
+static void vt_enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ if (KVM_BUG_ON(is_td_vcpu(vcpu), vcpu->kvm))
+ return;
+
+ /* RSM will cause a vmexit anyway. */
+ vmx_enable_smi_window(vcpu);
+}
+#endif
+
+static int vt_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ /*
+ * For TDX, this can only be triggered for MMIO emulation. Let the
+ * guest retry after installing the SPTE with suppress #VE bit cleared,
+ * so that the guest will receive #VE when retry. The guest is expected
+ * to call TDG.VP.VMCALL<MMIO> to request VMM to do MMIO emulation on
+ * #VE.
+ */
+ if (is_td_vcpu(vcpu))
+ return X86EMUL_RETRY_INSTR;
+
+ return vmx_check_emulate_instruction(vcpu, emul_type, insn, insn_len);
+}
+
+static bool vt_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+ /*
+ * INIT and SIPI are always blocked for TDX, i.e., INIT handling and
+ * the OP vcpu_deliver_sipi_vector() won't be called.
+ */
+ if (is_td_vcpu(vcpu))
+ return true;
+
+ return vmx_apic_init_signal_blocked(vcpu);
+}
+
+static void vt_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
+{
+ /* Only x2APIC mode is supported for TD. */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ return vmx_set_virtual_apic_mode(vcpu);
+}
+
+static void vt_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi = vcpu_to_pi_desc(vcpu);
+
+ pi_clear_on(pi);
+ memset(pi->pir, 0, sizeof(pi->pir));
+}
+
+static void vt_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ return vmx_hwapic_isr_update(vcpu, max_isr);
+}
+
+static int vt_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return -1;
+
+ return vmx_sync_pir_to_irr(vcpu);
+}
+
+static void vt_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ if (is_td_vcpu(apic->vcpu)) {
+ tdx_deliver_interrupt(apic, delivery_mode, trig_mode,
+ vector);
+ return;
+ }
+
+ vmx_deliver_interrupt(apic, delivery_mode, trig_mode, vector);
+}
+
+static void vt_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_vcpu_after_set_cpuid(vcpu);
+}
+
+static void vt_update_exception_bitmap(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_update_exception_bitmap(vcpu);
+}
+
+static u64 vt_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_segment_base(vcpu, seg);
+}
+
+static void vt_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var,
+ int seg)
+{
+ if (is_td_vcpu(vcpu)) {
+ memset(var, 0, sizeof(*var));
+ return;
+ }
+
+ vmx_get_segment(vcpu, var, seg);
+}
+
+static void vt_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var,
+ int seg)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_segment(vcpu, var, seg);
+}
+
+static int vt_get_cpl(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_cpl(vcpu);
+}
+
+static int vt_get_cpl_no_cache(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_cpl_no_cache(vcpu);
+}
+
+static void vt_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+{
+ if (is_td_vcpu(vcpu)) {
+ *db = 0;
+ *l = 0;
+ return;
+ }
+
+ vmx_get_cs_db_l_bits(vcpu, db, l);
+}
+
+static bool vt_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ if (is_td_vcpu(vcpu))
+ return true;
+
+ return vmx_is_valid_cr0(vcpu, cr0);
+}
+
+static void vt_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_cr0(vcpu, cr0);
+}
+
+static bool vt_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ if (is_td_vcpu(vcpu))
+ return true;
+
+ return vmx_is_valid_cr4(vcpu, cr4);
+}
+
+static void vt_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_cr4(vcpu, cr4);
+}
+
+static int vt_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_set_efer(vcpu, efer);
+}
+
+static void vt_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ if (is_td_vcpu(vcpu)) {
+ memset(dt, 0, sizeof(*dt));
+ return;
+ }
+
+ vmx_get_idt(vcpu, dt);
+}
+
+static void vt_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_idt(vcpu, dt);
+}
+
+static void vt_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ if (is_td_vcpu(vcpu)) {
+ memset(dt, 0, sizeof(*dt));
+ return;
+ }
+
+ vmx_get_gdt(vcpu, dt);
+}
+
+static void vt_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_gdt(vcpu, dt);
+}
+
+static void vt_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_dr6(vcpu, val);
+}
+
+static void vt_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_dr7(vcpu, val);
+}
+
+static void vt_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+{
+ /*
+ * MOV-DR exiting is always cleared for TD guest, even in debug mode.
+ * Thus KVM_DEBUGREG_WONT_EXIT can never be set and it should never
+ * reach here for TD vcpu.
+ */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_sync_dirty_debug_regs(vcpu);
+}
+
+static void vt_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+ if (WARN_ON_ONCE(is_td_vcpu(vcpu)))
+ return;
+
+ vmx_cache_reg(vcpu, reg);
+}
+
+static unsigned long vt_get_rflags(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_rflags(vcpu);
+}
+
+static void vt_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_rflags(vcpu, rflags);
+}
+
+static bool vt_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return false;
+
+ return vmx_get_if_flag(vcpu);
+}
+
+static void vt_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_flush_tlb_all(vcpu);
+ return;
+ }
+
+ vmx_flush_tlb_all(vcpu);
+}
+
+static void vt_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_flush_tlb_current(vcpu);
+ return;
+ }
+
+ vmx_flush_tlb_current(vcpu);
+}
+
+static void vt_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_flush_tlb_gva(vcpu, addr);
+}
+
+static void vt_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_flush_tlb_guest(vcpu);
+}
+
+static void vt_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_inject_nmi(vcpu);
+ return;
+ }
+
+ vmx_inject_nmi(vcpu);
+}
+
+static int vt_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ /*
+ * The TDX module manages NMI windows and NMI reinjection, and hides NMI
+ * blocking, all KVM can do is throw an NMI over the wall.
+ */
+ if (is_td_vcpu(vcpu))
+ return true;
+
+ return vmx_nmi_allowed(vcpu, for_injection);
+}
+
+static bool vt_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+ /*
+ * KVM can't get NMI blocking status for TDX guest, assume NMIs are
+ * always unmasked.
+ */
+ if (is_td_vcpu(vcpu))
+ return false;
+
+ return vmx_get_nmi_mask(vcpu);
+}
+
+static void vt_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_nmi_mask(vcpu, masked);
+}
+
+static void vt_enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ /* Refer to the comments in tdx_inject_nmi(). */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_enable_nmi_window(vcpu);
+}
+
+static void vt_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
+ int pgd_level)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_load_mmu_pgd(vcpu, root_hpa, pgd_level);
+ return;
+ }
+
+ vmx_load_mmu_pgd(vcpu, root_hpa, pgd_level);
+}
+
+static void vt_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_interrupt_shadow(vcpu, mask);
+}
+
+static u32 vt_get_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_interrupt_shadow(vcpu);
+}
+
+static void vt_patch_hypercall(struct kvm_vcpu *vcpu,
+ unsigned char *hypercall)
+{
+ /*
+ * Because guest memory is protected, guest can't be patched. TD kernel
+ * is modified to use TDG.VP.VMCALL for hypercall.
+ */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_patch_hypercall(vcpu, hypercall);
+}
+
+static void vt_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_inject_irq(vcpu, reinjected);
+}
+
+static void vt_inject_exception(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_inject_exception(vcpu);
+}
+
+static void vt_cancel_injection(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_cancel_injection(vcpu);
+}
+
+static int vt_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_interrupt_allowed(vcpu);
+
+ return vmx_interrupt_allowed(vcpu, for_injection);
+}
+
+static void vt_enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_enable_irq_window(vcpu);
+}
+
+static void vt_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
+{
+ *intr_info = 0;
+ *error_code = 0;
+
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_get_entry_info(vcpu, intr_info, error_code);
+}
+
+static void vt_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
+{
+ if (is_td_vcpu(vcpu)) {
+ tdx_get_exit_info(vcpu, reason, info1, info2, intr_info,
+ error_code);
+ return;
+ }
+
+ vmx_get_exit_info(vcpu, reason, info1, info2, intr_info, error_code);
+}
+
+static void vt_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_update_cr8_intercept(vcpu, tpr, irr);
+}
+
+static void vt_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_set_apic_access_page_addr(vcpu);
+}
+
+static void vt_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu)) {
+ KVM_BUG_ON(!kvm_vcpu_apicv_active(vcpu), vcpu->kvm);
+ return;
+ }
+
+ vmx_refresh_apicv_exec_ctrl(vcpu);
+}
+
+static void vt_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+}
+
+static int vt_set_tss_addr(struct kvm *kvm, unsigned int addr)
+{
+ if (is_td(kvm))
+ return 0;
+
+ return vmx_set_tss_addr(kvm, addr);
+}
+
+static int vt_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+{
+ if (is_td(kvm))
+ return 0;
+
+ return vmx_set_identity_map_addr(kvm, ident_addr);
+}
+
+static u64 vt_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ /* TDX doesn't support L2 guest at the moment. */
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_l2_tsc_offset(vcpu);
+}
+
+static u64 vt_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ /* TDX doesn't support L2 guest at the moment. */
+ if (is_td_vcpu(vcpu))
+ return 0;
+
+ return vmx_get_l2_tsc_multiplier(vcpu);
+}
+
+static void vt_write_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ /* In TDX, tsc offset can't be changed. */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_write_tsc_offset(vcpu);
+}
+
+static void vt_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+{
+ /* In TDX, tsc multiplier can't be changed. */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_write_tsc_multiplier(vcpu);
+}
+
+#ifdef CONFIG_X86_64
+static int vt_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
+{
+ /* VMX-preemption timer isn't available for TDX. */
+ if (is_td_vcpu(vcpu))
+ return -EINVAL;
+
+ return vmx_set_hv_timer(vcpu, guest_deadline_tsc, expired);
+}
+
+static void vt_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+ /* VMX-preemption timer can't be set. See vt_set_hv_timer(). */
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_cancel_hv_timer(vcpu);
+}
+#endif
+
+static void vt_setup_mce(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return;
+
+ vmx_setup_mce(vcpu);
+}
+
+static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
+{
+ if (!is_td(kvm))
+ return -ENOTTY;
+
+ return tdx_vm_ioctl(kvm, argp);
+}
+
+static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ if (!is_td_vcpu(vcpu))
+ return -EINVAL;
+
+ return tdx_vcpu_ioctl(vcpu, argp);
+}
+
+static int vt_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ if (is_td(kvm))
+ return tdx_gmem_private_max_mapping_level(kvm, pfn);
+
+ return 0;
+}
+
+#define VMX_REQUIRED_APICV_INHIBITS \
+ (BIT(APICV_INHIBIT_REASON_DISABLED) | \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED))
+
+struct kvm_x86_ops vt_x86_ops __initdata = {
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = vmx_check_processor_compat,
+
+ .hardware_unsetup = vmx_hardware_unsetup,
+
+ .enable_virtualization_cpu = vmx_enable_virtualization_cpu,
+ .disable_virtualization_cpu = vt_disable_virtualization_cpu,
+ .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,
+
+ .has_emulated_msr = vt_has_emulated_msr,
+
+ .vm_size = sizeof(struct kvm_vmx),
+
+ .vm_init = vt_vm_init,
+ .vm_pre_destroy = vt_vm_pre_destroy,
+ .vm_destroy = vt_vm_destroy,
+
+ .vcpu_precreate = vt_vcpu_precreate,
+ .vcpu_create = vt_vcpu_create,
+ .vcpu_free = vt_vcpu_free,
+ .vcpu_reset = vt_vcpu_reset,
+
+ .prepare_switch_to_guest = vt_prepare_switch_to_guest,
+ .vcpu_load = vt_vcpu_load,
+ .vcpu_put = vt_vcpu_put,
+
+ .update_exception_bitmap = vt_update_exception_bitmap,
+ .get_feature_msr = vmx_get_feature_msr,
+ .get_msr = vt_get_msr,
+ .set_msr = vt_set_msr,
+
+ .get_segment_base = vt_get_segment_base,
+ .get_segment = vt_get_segment,
+ .set_segment = vt_set_segment,
+ .get_cpl = vt_get_cpl,
+ .get_cpl_no_cache = vt_get_cpl_no_cache,
+ .get_cs_db_l_bits = vt_get_cs_db_l_bits,
+ .is_valid_cr0 = vt_is_valid_cr0,
+ .set_cr0 = vt_set_cr0,
+ .is_valid_cr4 = vt_is_valid_cr4,
+ .set_cr4 = vt_set_cr4,
+ .set_efer = vt_set_efer,
+ .get_idt = vt_get_idt,
+ .set_idt = vt_set_idt,
+ .get_gdt = vt_get_gdt,
+ .set_gdt = vt_set_gdt,
+ .set_dr6 = vt_set_dr6,
+ .set_dr7 = vt_set_dr7,
+ .sync_dirty_debug_regs = vt_sync_dirty_debug_regs,
+ .cache_reg = vt_cache_reg,
+ .get_rflags = vt_get_rflags,
+ .set_rflags = vt_set_rflags,
+ .get_if_flag = vt_get_if_flag,
+
+ .flush_tlb_all = vt_flush_tlb_all,
+ .flush_tlb_current = vt_flush_tlb_current,
+ .flush_tlb_gva = vt_flush_tlb_gva,
+ .flush_tlb_guest = vt_flush_tlb_guest,
+
+ .vcpu_pre_run = vt_vcpu_pre_run,
+ .vcpu_run = vt_vcpu_run,
+ .handle_exit = vt_handle_exit,
+ .skip_emulated_instruction = vmx_skip_emulated_instruction,
+ .update_emulated_instruction = vmx_update_emulated_instruction,
+ .set_interrupt_shadow = vt_set_interrupt_shadow,
+ .get_interrupt_shadow = vt_get_interrupt_shadow,
+ .patch_hypercall = vt_patch_hypercall,
+ .inject_irq = vt_inject_irq,
+ .inject_nmi = vt_inject_nmi,
+ .inject_exception = vt_inject_exception,
+ .cancel_injection = vt_cancel_injection,
+ .interrupt_allowed = vt_interrupt_allowed,
+ .nmi_allowed = vt_nmi_allowed,
+ .get_nmi_mask = vt_get_nmi_mask,
+ .set_nmi_mask = vt_set_nmi_mask,
+ .enable_nmi_window = vt_enable_nmi_window,
+ .enable_irq_window = vt_enable_irq_window,
+ .update_cr8_intercept = vt_update_cr8_intercept,
+
+ .x2apic_icr_is_split = false,
+ .set_virtual_apic_mode = vt_set_virtual_apic_mode,
+ .set_apic_access_page_addr = vt_set_apic_access_page_addr,
+ .refresh_apicv_exec_ctrl = vt_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = vt_load_eoi_exitmap,
+ .apicv_pre_state_restore = vt_apicv_pre_state_restore,
+ .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
+ .hwapic_isr_update = vt_hwapic_isr_update,
+ .sync_pir_to_irr = vt_sync_pir_to_irr,
+ .deliver_interrupt = vt_deliver_interrupt,
+ .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
+
+ .set_tss_addr = vt_set_tss_addr,
+ .set_identity_map_addr = vt_set_identity_map_addr,
+ .get_mt_mask = vmx_get_mt_mask,
+
+ .get_exit_info = vt_get_exit_info,
+ .get_entry_info = vt_get_entry_info,
+
+ .vcpu_after_set_cpuid = vt_vcpu_after_set_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .get_l2_tsc_offset = vt_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = vt_get_l2_tsc_multiplier,
+ .write_tsc_offset = vt_write_tsc_offset,
+ .write_tsc_multiplier = vt_write_tsc_multiplier,
+
+ .load_mmu_pgd = vt_load_mmu_pgd,
+
+ .check_intercept = vmx_check_intercept,
+ .handle_exit_irqoff = vmx_handle_exit_irqoff,
+
+ .update_cpu_dirty_logging = vt_update_cpu_dirty_logging,
+
+ .nested_ops = &vmx_nested_ops,
+
+ .pi_update_irte = vmx_pi_update_irte,
+ .pi_start_assignment = vmx_pi_start_assignment,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vt_set_hv_timer,
+ .cancel_hv_timer = vt_cancel_hv_timer,
+#endif
+
+ .setup_mce = vt_setup_mce,
+
+#ifdef CONFIG_KVM_SMM
+ .smi_allowed = vt_smi_allowed,
+ .enter_smm = vt_enter_smm,
+ .leave_smm = vt_leave_smm,
+ .enable_smi_window = vt_enable_smi_window,
+#endif
+
+ .check_emulate_instruction = vt_check_emulate_instruction,
+ .apic_init_signal_blocked = vt_apic_init_signal_blocked,
+ .migrate_timers = vmx_migrate_timers,
+
+ .msr_filter_changed = vt_msr_filter_changed,
+ .complete_emulated_msr = vt_complete_emulated_msr,
+
+ .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
+
+ .get_untagged_addr = vmx_get_untagged_addr,
+
+ .mem_enc_ioctl = vt_mem_enc_ioctl,
+ .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
+
+ .private_max_mapping_level = vt_gmem_private_max_mapping_level
+};
+
+struct kvm_x86_init_ops vt_init_ops __initdata = {
+ .hardware_setup = vt_hardware_setup,
+ .handle_intel_pt_intr = NULL,
+
+ .runtime_ops = &vt_x86_ops,
+ .pmu_ops = &intel_pmu_ops,
+};
+
+static void __exit vt_exit(void)
+{
+ kvm_exit();
+ tdx_cleanup();
+ vmx_exit();
+}
+module_exit(vt_exit);
+
+static int __init vt_init(void)
+{
+ unsigned vcpu_size, vcpu_align;
+ int r;
+
+ r = vmx_init();
+ if (r)
+ return r;
+
+ /* tdx_init() has been taken */
+ r = tdx_bringup();
+ if (r)
+ goto err_tdx_bringup;
+
+ /*
+ * TDX and VMX have different vCPU structures. Calculate the
+ * maximum size/align so that kvm_init() can use the larger
+ * values to create the kmem_vcpu_cache.
+ */
+ vcpu_size = sizeof(struct vcpu_vmx);
+ vcpu_align = __alignof__(struct vcpu_vmx);
+ if (enable_tdx) {
+ vcpu_size = max_t(unsigned, vcpu_size,
+ sizeof(struct vcpu_tdx));
+ vcpu_align = max_t(unsigned, vcpu_align,
+ __alignof__(struct vcpu_tdx));
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_TDX_VM);
+ }
+
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(vcpu_size, vcpu_align, THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
+ return 0;
+
+err_kvm_init:
+ tdx_cleanup();
+err_tdx_bringup:
+ vmx_exit();
+ return r;
+}
+module_init(vt_init);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d05ddf751491..71701e2414a4 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6,16 +6,18 @@
#include <asm/debugreg.h>
#include <asm/mmu_context.h>
+#include <asm/msr.h>
+#include "x86.h"
#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
+#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
-#include "x86.h"
#include "smm.h"
static bool __read_mostly enable_shadow_vmcs = 1;
@@ -230,11 +232,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (nested_vmx_is_evmptr12_valid(vmx)) {
- kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
- vmx->nested.hv_evmcs = NULL;
- }
-
+ kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
+ vmx->nested.hv_evmcs = NULL;
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
if (hv_vcpu) {
@@ -259,7 +258,7 @@ static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
* state. It is possible that the area will stay mapped as
* vmx->nested.hv_evmcs but this shouldn't be a problem.
*/
- if (!guest_cpuid_has_evmcs(vcpu) ||
+ if (!guest_cpu_cap_has_evmcs(vcpu) ||
!evmptr_is_valid(nested_get_evmptr(vcpu)))
return false;
@@ -277,7 +276,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
{
struct vmcs_host_state *dest, *src;
- if (unlikely(!vmx->guest_state_loaded))
+ if (unlikely(!vmx->vt.guest_state_loaded))
return;
src = &prev->host_state;
@@ -316,6 +315,16 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
vcpu->arch.regs_dirty = 0;
}
+static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
+ vmx->nested.pi_desc = NULL;
+}
+
/*
* Free whatever needs to be freed from vmx->nested when L1 goes down, or
* just stops using VMX.
@@ -348,15 +357,8 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->nested.cached_vmcs12 = NULL;
kfree(vmx->nested.cached_shadow_vmcs12);
vmx->nested.cached_shadow_vmcs12 = NULL;
- /*
- * Unpin physical memory we referred to in the vmcs02. The APIC access
- * page's backing page (yeah, confusing) shouldn't actually be accessed,
- * and if it is written, the contents are irrelevant.
- */
- kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
- kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
- kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
- vmx->nested.pi_desc = NULL;
+
+ nested_put_vmcs12_pages(vcpu);
kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
@@ -409,18 +411,40 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qualification;
u32 vm_exit_reason;
- unsigned long exit_qualification = vcpu->arch.exit_qualification;
if (vmx->nested.pml_full) {
vm_exit_reason = EXIT_REASON_PML_FULL;
vmx->nested.pml_full = false;
- exit_qualification &= INTR_INFO_UNBLOCK_NMI;
+
+ /*
+ * It should be impossible to trigger a nested PML Full VM-Exit
+ * for anything other than an EPT Violation from L2. KVM *can*
+ * trigger nEPT page fault injection in response to an EPT
+ * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
+ * tables also changed, but KVM should not treat EPT Misconfig
+ * VM-Exits as writes.
+ */
+ WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION);
+
+ /*
+ * PML Full and EPT Violation VM-Exits both use bit 12 to report
+ * "NMI unblocking due to IRET", i.e. the bit can be propagated
+ * as-is from the original EXIT_QUALIFICATION.
+ */
+ exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
} else {
- if (fault->error_code & PFERR_RSVD_MASK)
+ if (fault->error_code & PFERR_RSVD_MASK) {
vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
- else
+ exit_qualification = 0;
+ } else {
+ exit_qualification = fault->exit_qualification;
+ exit_qualification |= vmx_get_exit_qual(vcpu) &
+ (EPT_VIOLATION_GVA_IS_VALID |
+ EPT_VIOLATION_GVA_TRANSLATED);
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ }
/*
* Although the caller (kvm_inject_emulated_page_fault) would
@@ -601,7 +625,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
int msr;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
- struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
+ struct kvm_host_map map;
/* Nothing to do if the MSR bitmap is not in use. */
if (!cpu_has_vmx_msr_bitmap() ||
@@ -624,10 +648,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
return true;
}
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
+ if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
return false;
- msr_bitmap_l1 = (unsigned long *)map->hva;
+ msr_bitmap_l1 = (unsigned long *)map.hva;
/*
* To keep the control flow simple, pay eight 8-byte writes (sixteen
@@ -691,7 +715,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
- kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
+ kvm_vcpu_unmap(vcpu, &map);
vmx->nested.force_msr_bitmap_recalc = false;
@@ -958,7 +982,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
__func__, i, e.index, e.reserved);
goto fail;
}
- if (kvm_set_msr(vcpu, e.index, e.value)) {
+ if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
pr_debug_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
__func__, i, e.index, e.value);
@@ -994,7 +1018,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
}
}
- if (kvm_get_msr(vcpu, msr_index, data)) {
+ if (kvm_get_msr_with_filter(vcpu, msr_index, data)) {
pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
msr_index);
return false;
@@ -1089,9 +1113,9 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
/*
* Emulated VMEntry does not fail here. Instead a less
* accurate value will be returned by
- * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
- * instead of reading the value from the vmcs02 VMExit
- * MSR-store area.
+ * nested_vmx_get_vmexit_msr_value() by reading KVM's
+ * internal MSR state instead of reading the value from
+ * the vmcs02 VMExit MSR-store area.
*/
pr_warn_ratelimited(
"Not enough msr entries in msr_autostore. Can't add msr %x\n",
@@ -1174,11 +1198,14 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);
/*
- * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
- * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
- * full TLB flush from the guest's perspective. This is required even
- * if VPID is disabled in the host as KVM may need to synchronize the
- * MMU in response to the guest TLB flush.
+ * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
+ * same VPID as the host, and so architecturally, linear and combined
+ * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM
+ * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
+ * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This
+ * is required if VPID is disabled in KVM, as a TLB flush (there are no
+ * VPIDs) still occurs from L1's perspective, and KVM may need to
+ * synchronize the MMU in response to the guest TLB flush.
*
* Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
* EPT is a special snowflake, as guest-physical mappings aren't
@@ -1228,21 +1255,32 @@ static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
- const u64 feature_and_reserved =
- /* feature (except bit 48; see below) */
- BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
- /* reserved */
- BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
+ const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
+ VMX_BASIC_INOUT |
+ VMX_BASIC_TRUE_CTLS;
+
+ const u64 reserved_bits = GENMASK_ULL(63, 56) |
+ GENMASK_ULL(47, 45) |
+ BIT_ULL(31);
+
u64 vmx_basic = vmcs_config.nested.basic;
- if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
+ BUILD_BUG_ON(feature_bits & reserved_bits);
+
+ /*
+ * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
+ * inverted polarity), the incoming value must not set feature bits or
+ * reserved bits that aren't allowed/supported by KVM. Fields, i.e.
+ * multi-bit values, are explicitly checked below.
+ */
+ if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
return -EINVAL;
/*
* KVM does not emulate a version of VMX that constrains physical
* addresses of VMX structures (e.g. VMCS) to 32-bits.
*/
- if (data & BIT_ULL(48))
+ if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
return -EINVAL;
if (vmx_basic_vmcs_revision_id(vmx_basic) !=
@@ -1311,16 +1349,29 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
- const u64 feature_and_reserved_bits =
- /* feature */
- BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
- BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
- /* reserved */
- GENMASK_ULL(13, 9) | BIT_ULL(31);
+ const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
+ VMX_MISC_ACTIVITY_HLT |
+ VMX_MISC_ACTIVITY_SHUTDOWN |
+ VMX_MISC_ACTIVITY_WAIT_SIPI |
+ VMX_MISC_INTEL_PT |
+ VMX_MISC_RDMSR_IN_SMM |
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_VMXOFF_BLOCK_SMI |
+ VMX_MISC_ZERO_LEN_INS;
+
+ const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);
+
u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
vmcs_config.nested.misc_high);
- if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
+ BUILD_BUG_ON(feature_bits & reserved_bits);
+
+ /*
+ * The incoming value must not set feature bits or reserved bits that
+ * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are
+ * explicitly checked below.
+ */
+ if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
return -EINVAL;
if ((vmx->nested.msrs.pinbased_ctls_high &
@@ -2039,7 +2090,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
bool evmcs_gpa_changed = false;
u64 evmcs_gpa;
- if (likely(!guest_cpuid_has_evmcs(vcpu)))
+ if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
return EVMPTRLD_DISABLED;
evmcs_gpa = nested_get_evmptr(vcpu);
@@ -2220,6 +2271,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
vmcs_write64(EPT_POINTER,
construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
+
/* All VMFUNCs are currently emulated through L0 vmexits. */
if (cpu_has_vmx_vmfunc())
vmcs_write64(VM_FUNCTION_CONTROL, 0);
@@ -2265,6 +2319,17 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
+ /*
+ * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
+ * same VPID as the host. Emulate this behavior by using vpid01 for L2
+ * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter
+ * and VM-Exit are architecturally required to flush VPID=0, but *only*
+ * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the
+ * required flushes), but doing so would cause KVM to over-flush. E.g.
+ * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
+ * and then runs L2 X again, then KVM can and should retain TLB entries
+ * for VPID12=1.
+ */
if (enable_vpid) {
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
@@ -2291,10 +2356,12 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
/* Posted interrupts setting is only taken from vmcs12. */
vmx->nested.pi_pending = false;
- if (nested_cpu_has_posted_intr(vmcs12))
+ if (nested_cpu_has_posted_intr(vmcs12)) {
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- else
+ } else {
+ vmx->nested.posted_intr_nv = -1;
exec_control &= ~PIN_BASED_POSTED_INTR;
+ }
pin_controls_set(vmx, exec_control);
/*
@@ -2400,7 +2467,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
if (cpu_has_load_ia32_efer()) {
if (guest_efer & EFER_LMA)
exec_control |= VM_ENTRY_IA32E_MODE;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
exec_control |= VM_ENTRY_LOAD_IA32_EFER;
}
vm_entry_controls_set(vmx, exec_control);
@@ -2413,7 +2480,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
* bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
exec_control = __vm_exit_controls_get(vmcs01);
- if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
+ if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER;
else
exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
@@ -2444,6 +2511,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
@@ -2481,7 +2549,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
- vmx->segment_cache.bitmask = 0;
+ vmx_segment_cache_clear(vmx);
}
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -2903,7 +2971,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
case INTR_TYPE_SOFT_EXCEPTION:
case INTR_TYPE_SOFT_INTR:
case INTR_TYPE_PRIV_SW_EXCEPTION:
- if (CC(vmcs12->vm_entry_instruction_len > 15) ||
+ if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) ||
CC(vmcs12->vm_entry_instruction_len == 0 &&
CC(!nested_cpu_has_zero_length_injection(vcpu))))
return -EINVAL;
@@ -2925,7 +2993,7 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
return -EINVAL;
#ifdef CONFIG_KVM_HYPERV
- if (guest_cpuid_has_evmcs(vcpu))
+ if (guest_cpu_cap_has_evmcs(vcpu))
return nested_evmcs_check_controls(vmcs12);
#endif
@@ -2943,6 +3011,17 @@ static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
return 0;
}
+static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
+{
+ /*
+ * Check that the given linear address is canonical after a VM exit
+ * from L2, based on HOST_CR4.LA57 value that will be loaded for L1.
+ */
+ u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48;
+
+ return !__is_canonical_address(la, l1_address_bits_on_exit);
+}
+
static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
@@ -2953,8 +3032,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
return -EINVAL;
- if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
+ if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
+ CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
return -EINVAL;
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
@@ -2988,12 +3067,12 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
CC(vmcs12->host_ss_selector == 0 && !ia32e))
return -EINVAL;
- if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
+ if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) ||
+ CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12)))
return -EINVAL;
/*
@@ -3111,7 +3190,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
}
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
- (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
+ (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
return -EINVAL;
@@ -3209,7 +3288,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
* L2 was running), map it here to make sure vmcs12 changes are
* properly reflected.
*/
- if (guest_cpuid_has_evmcs(vcpu) &&
+ if (guest_cpu_cap_has_evmcs(vcpu) &&
vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
enum nested_evmptrld_status evmptrld_status =
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
@@ -3364,7 +3443,7 @@ static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
if (!nested_cpu_has_pml(vmcs12))
return 0;
- if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
+ if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) {
vmx->nested.pml_full = true;
return 1;
}
@@ -3403,14 +3482,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
return 1;
}
-static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
-{
- u8 rvi = vmx_get_rvi();
- u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
-
- return ((rvi & 0xf0) > (vppr & 0xf0));
-}
-
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12);
@@ -3430,7 +3501,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
enum vm_entry_failure_code entry_failure_code;
- bool evaluate_pending_interrupts;
union vmx_exit_reason exit_reason = {
.basic = EXIT_REASON_INVALID_STATE,
.failed_vmentry = 1,
@@ -3449,13 +3519,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
kvm_service_local_tlb_flush_requests(vcpu);
- evaluate_pending_interrupts = exec_controls_get(vmx) &
- (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
- if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
- evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
- if (!evaluate_pending_interrupts)
- evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu);
-
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
@@ -3538,9 +3601,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
* Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
* when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
* effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
- * unconditionally.
+ * unconditionally. Take care to pull data from vmcs01 as appropriate,
+ * e.g. when checking for interrupt windows, as vmcs02 is now loaded.
*/
- if (unlikely(evaluate_pending_interrupts))
+ if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING)) ||
+ kvm_apic_has_pending_init_or_sipi(vcpu) ||
+ kvm_apic_has_interrupt(vcpu))
kvm_make_request(KVM_REQ_EVENT, vcpu);
/*
@@ -3673,14 +3740,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (unlikely(status != NVMX_VMENTRY_SUCCESS))
goto vmentry_failed;
- /* Emulate processing of posted interrupts on VM-Enter. */
- if (nested_cpu_has_posted_intr(vmcs12) &&
- kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
- vmx->nested.pi_pending = true;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
- }
-
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
@@ -3713,7 +3772,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
break;
case GUEST_ACTIVITY_WAIT_SIPI:
vmx->nested.nested_run_pending = 0;
- vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
break;
default:
break;
@@ -3874,8 +3933,8 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
return 0;
- max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
- if (max_irr != 256) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0) {
vapic_page = vmx->nested.virtual_apic_map.hva;
if (!vapic_page)
goto mmio_needed;
@@ -4006,10 +4065,46 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->nested.preemption_timer_expired;
}
-static bool vmx_has_nested_events(struct kvm_vcpu *vcpu)
+static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
{
- return nested_vmx_preemption_timer_pending(vcpu) ||
- to_vmx(vcpu)->nested.mtf_pending;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic = vmx->nested.virtual_apic_map.hva;
+ int max_irr, vppr;
+
+ if (nested_vmx_preemption_timer_pending(vcpu) ||
+ vmx->nested.mtf_pending)
+ return true;
+
+ /*
+ * Virtual Interrupt Delivery doesn't require manual injection. Either
+ * the interrupt is already in GUEST_RVI and will be recognized by CPU
+ * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
+ * the interrupt from the PIR to RVI prior to entering the guest.
+ */
+ if (for_injection)
+ return false;
+
+ if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ __vmx_interrupt_blocked(vcpu))
+ return false;
+
+ if (!vapic)
+ return false;
+
+ vppr = *((u32 *)(vapic + APIC_PROCPRI));
+
+ max_irr = vmx_get_rvi();
+ if ((max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+
+ if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
+ pi_test_on(vmx->nested.pi_desc)) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+ }
+
+ return false;
}
/*
@@ -4106,13 +4201,25 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
*/
bool block_nested_exceptions = vmx->nested.nested_run_pending;
/*
- * New events (not exceptions) are only recognized at instruction
+ * Events that don't require injection, i.e. that are virtualized by
+ * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
+ * to regain control in order to deliver the event, and hardware will
+ * handle event ordering, e.g. with respect to injected exceptions.
+ *
+ * But, new events (not exceptions) are only recognized at instruction
* boundaries. If an event needs reinjection, then KVM is handling a
- * VM-Exit that occurred _during_ instruction execution; new events are
- * blocked until the instruction completes.
+ * VM-Exit that occurred _during_ instruction execution; new events,
+ * irrespective of whether or not they're injected, are blocked until
+ * the instruction completes.
+ */
+ bool block_non_injected_events = kvm_event_needs_reinjection(vcpu);
+ /*
+ * Inject events are blocked by nested VM-Enter, as KVM is responsible
+ * for managing priority between concurrent events, i.e. KVM needs to
+ * wait until after VM-Enter completes to deliver injected events.
*/
bool block_nested_events = block_nested_exceptions ||
- kvm_event_needs_reinjection(vcpu);
+ block_non_injected_events;
if (lapic_in_kernel(vcpu) &&
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
@@ -4222,11 +4329,71 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
}
if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
+ int irq;
+
+ if (!nested_exit_on_intr(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ goto no_vmexit;
+ }
+
+ if (!nested_exit_intr_ack_set(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+ return 0;
+ }
+
+ irq = kvm_cpu_get_extint(vcpu);
+ if (irq != -1) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+ return 0;
+ }
+
+ irq = kvm_apic_has_interrupt(vcpu);
+ if (WARN_ON_ONCE(irq < 0))
+ goto no_vmexit;
+
+ /*
+ * If the IRQ is L2's PI notification vector, process posted
+ * interrupts for L2 instead of injecting VM-Exit, as the
+ * detection/morphing architecturally occurs when the IRQ is
+ * delivered to the CPU. Note, only interrupts that are routed
+ * through the local APIC trigger posted interrupt processing,
+ * and enabling posted interrupts requires ACK-on-exit.
+ */
+ if (irq == vmx->nested.posted_intr_nv) {
+ /*
+ * Nested posted interrupts are delivered via RVI, i.e.
+ * aren't injected by KVM, and so can be queued even if
+ * manual event injection is disallowed.
+ */
+ if (block_non_injected_events)
+ return -EBUSY;
+
+ vmx->nested.pi_pending = true;
+ kvm_apic_clear_irr(vcpu, irq);
+ goto no_vmexit;
+ }
+
if (block_nested_events)
return -EBUSY;
- if (!nested_exit_on_intr(vcpu))
- goto no_vmexit;
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+
+ /*
+ * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
+ * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
+ * if APICv is active.
+ */
+ kvm_apic_ack_interrupt(vcpu, irq);
return 0;
}
@@ -4452,11 +4619,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
*/
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
u32 vm_exit_reason, u32 exit_intr_info,
- unsigned long exit_qualification)
+ unsigned long exit_qualification, u32 exit_insn_len)
{
/* update exit information fields: */
vmcs12->vm_exit_reason = vm_exit_reason;
- if (to_vmx(vcpu)->exit_reason.enclave_mode)
+ if (vmx_get_exit_reason(vcpu).enclave_mode)
vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
vmcs12->exit_qualification = exit_qualification;
@@ -4480,7 +4647,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vm_exit_reason, exit_intr_info);
vmcs12->vm_exit_intr_info = exit_intr_info;
- vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs12->vm_exit_instruction_len = exit_insn_len;
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
/*
@@ -4628,7 +4795,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs12->vm_exit_msr_load_count))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
- to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
+ to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu);
}
static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
@@ -4640,7 +4807,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
return vmcs_read64(GUEST_IA32_EFER);
if (cpu_has_load_ia32_efer())
- return host_efer;
+ return kvm_host.efer;
for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
@@ -4651,7 +4818,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
if (efer_msr)
return efer_msr->data;
- return host_efer;
+ return kvm_host.efer;
}
static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
@@ -4744,7 +4911,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
goto vmabort;
}
- if (kvm_set_msr(vcpu, h.index, h.value)) {
+ if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) {
pr_debug_ratelimited(
"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
__func__, j, h.index, h.value);
@@ -4764,8 +4931,9 @@ vmabort:
* and modify vmcs12 to make it see what it would expect to see there if
* L2 was its real guest. Must only be called when in L2 (is_guest_mode())
*/
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
- u32 exit_intr_info, unsigned long exit_qualification)
+void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ u32 exit_intr_info, unsigned long exit_qualification,
+ u32 exit_insn_len)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -4815,7 +4983,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
if (vm_exit_reason != -1)
prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
- exit_intr_info, exit_qualification);
+ exit_intr_info, exit_qualification,
+ exit_insn_len);
/*
* Must happen outside of sync_vmcs02_to_vmcs12() as it will
@@ -4860,7 +5029,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
* doesn't isolate different VMCSs, i.e. in this case, doesn't provide
* separate modes for L2 vs L1.
*/
- if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL))
indirect_branch_prediction_barrier();
/* Update any VMCS fields that might have changed while L2 ran */
@@ -4883,11 +5052,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_update_cpu_dirty_logging(vcpu);
}
- /* Unpin physical memory we referred to in vmcs02 */
- kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
- kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
- kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
- vmx->nested.pi_desc = NULL;
+ nested_put_vmcs12_pages(vcpu);
if (vmx->nested.reload_vmcs01_apic_access_page) {
vmx->nested.reload_vmcs01_apic_access_page = false;
@@ -4899,22 +5064,19 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
}
+ if (vmx->nested.update_vmcs01_hwapic_isr) {
+ vmx->nested.update_vmcs01_hwapic_isr = false;
+ kvm_apic_update_hwapic_isr(vcpu);
+ }
+
if ((vm_exit_reason != -1) &&
(enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)))
vmx->nested.need_vmcs12_to_shadow_sync = true;
/* in case we halted in L2 */
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
if (likely(!vmx->fail)) {
- if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
- nested_exit_intr_ack_set(vcpu)) {
- int irq = kvm_cpu_get_interrupt(vcpu);
- WARN_ON(irq < 0);
- vmcs12->vm_exit_intr_info = irq |
- INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
- }
-
if (vm_exit_reason != -1)
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
vmcs12->exit_qualification,
@@ -4925,6 +5087,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
load_vmcs12_host_state(vcpu, vmcs12);
+ /*
+ * Process events if an injectable IRQ or NMI is pending, even
+ * if the event is blocked (RFLAGS.IF is cleared on VM-Exit).
+ * If an event became pending while L2 was active, KVM needs to
+ * either inject the event or request an IRQ/NMI window. SMIs
+ * don't need to be processed as SMM is mutually exclusive with
+ * non-root mode. INIT/SIPI don't need to be checked as INIT
+ * is blocked post-VMXON, and SIPIs are ignored.
+ */
+ if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return;
}
@@ -5031,7 +5204,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
* non-canonical form. This is the only check on the memory
* destination for long mode!
*/
- exn = is_noncanonical_address(*ret, vcpu);
+ exn = is_noncanonical_address(*ret, vcpu, 0);
} else {
/*
* When not in long mode, the virtual/linear address is
@@ -5157,9 +5330,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
goto out_shadow_vmcs;
- hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS_PINNED);
- vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+ hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED);
vmx->nested.vpid02 = allocate_vpid();
@@ -5828,6 +6000,12 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ /*
+ * Always flush the effective vpid02, i.e. never flush the current VPID
+ * and never explicitly flush vpid01. INVVPID targets a VPID, not a
+ * VMCS, and so whether or not the current vmcs12 has VPID enabled is
+ * irrelevant (and there may not be a loaded vmcs12).
+ */
vpid02 = nested_get_vpid02(vcpu);
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
@@ -5836,7 +6014,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
* invalidation.
*/
if (!operand.vpid ||
- is_noncanonical_address(operand.gla, vcpu))
+ is_noncanonical_invlpg_address(operand.gla, vcpu))
return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
vpid_sync_vcpu_addr(vpid02, operand.gla);
@@ -5950,7 +6128,7 @@ fail:
* nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
* EXIT_REASON_VMFUNC as the exit reason.
*/
- nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
+ nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full,
vmx_get_intr_info(vcpu),
vmx_get_exit_qual(vcpu));
return 1;
@@ -6130,7 +6308,7 @@ static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
{
u32 encls_leaf;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
return false;
@@ -6208,6 +6386,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
else if (is_alignment_check(intr_info) &&
!vmx_guest_inject_ac(vcpu))
return true;
+ else if (is_ve_fault(intr_info))
+ return true;
return false;
case EXIT_REASON_EXTERNAL_INTERRUPT:
return true;
@@ -6393,7 +6573,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- union vmx_exit_reason exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx->vt.exit_reason;
unsigned long exit_qual;
u32 exit_intr_info;
@@ -6466,7 +6646,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
vmx = to_vmx(vcpu);
vmcs12 = get_vmcs12(vcpu);
- if (guest_can_use(vcpu, X86_FEATURE_VMX) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) &&
(vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
@@ -6607,7 +6787,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
return -EINVAL;
} else {
- if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return -EINVAL;
if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
@@ -6641,7 +6821,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
- (!guest_can_use(vcpu, X86_FEATURE_VMX) ||
+ (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) ||
!vmx->nested.enlightened_vmcs_enabled))
return -EINVAL;
@@ -6987,7 +7167,7 @@ static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
{
msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
msrs->misc_low |=
- MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
VMX_MISC_ACTIVITY_HLT |
VMX_MISC_ACTIVITY_WAIT_SIPI;
@@ -7002,12 +7182,10 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
* guest, and the VMCS structure we give it - not about the
* VMX support of the underlying hardware.
*/
- msrs->basic =
- VMCS12_REVISION |
- VMX_BASIC_TRUE_CTLS |
- ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
- (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+ msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
+ X86_MEMTYPE_WB);
+ msrs->basic |= VMX_BASIC_TRUE_CTLS;
if (cpu_has_vmx_basic_inout())
msrs->basic |= VMX_BASIC_INOUT;
}
@@ -7025,8 +7203,8 @@ static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
/* These MSRs specify bits which the guest must keep fixed off. */
- rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
- rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
+ rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
+ rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
if (vmx_umip_emulated())
msrs->cr4_fixed1 |= X86_CR4_UMIP;
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index cce4e2aa30fb..6eedcfc91070 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -26,8 +26,26 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
bool from_vmentry);
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu);
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
- u32 exit_intr_info, unsigned long exit_qualification);
+void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ u32 exit_intr_info, unsigned long exit_qualification,
+ u32 exit_insn_len);
+
+static inline void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+ u32 exit_intr_info,
+ unsigned long exit_qualification)
+{
+ u32 exit_insn_len;
+
+ if (to_vmx(vcpu)->fail || vm_exit_reason == -1 ||
+ (vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+ exit_insn_len = 0;
+ else
+ exit_insn_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+ __nested_vmx_vmexit(vcpu, vm_exit_reason, exit_intr_info,
+ exit_qualification, exit_insn_len);
+}
+
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
@@ -39,11 +57,17 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+ !refcount_read(&vcpu->kvm->users_count));
+
return to_vmx(vcpu)->nested.cached_vmcs12;
}
static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+ !refcount_read(&vcpu->kvm->users_count));
+
return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
}
@@ -109,7 +133,7 @@ static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
{
return to_vmx(vcpu)->nested.msrs.misc_low &
- MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
}
static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index be40474de6e4..bbf4509f32d0 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -13,12 +13,14 @@
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
+#include <asm/msr.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "nested.h"
#include "pmu.h"
+#include "tdx.h"
/*
* Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX
@@ -34,6 +36,24 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
+static struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return NULL;
+
+ return &to_vmx(vcpu)->lbr_desc;
+}
+
+static struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return NULL;
+
+ return &to_vmx(vcpu)->lbr_desc.records;
+}
+
+#pragma GCC poison to_vmx
+
static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
struct kvm_pmc *pmc;
@@ -110,7 +130,7 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
{
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
return 0;
return vcpu->arch.perf_capabilities;
@@ -129,6 +149,22 @@ static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
}
+static bool intel_pmu_lbr_is_compatible(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return false;
+
+ return cpuid_model_is_consistent(vcpu);
+}
+
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return false;
+
+ return !!vcpu_to_lbr_records(vcpu)->nr;
+}
+
static bool intel_pmu_is_valid_lbr_msr(struct kvm_vcpu *vcpu, u32 index)
{
struct x86_pmu_lbr *records = vcpu_to_lbr_records(vcpu);
@@ -160,7 +196,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT;
break;
case MSR_IA32_DS_AREA:
- ret = guest_cpuid_has(vcpu, X86_FEATURE_DS);
+ ret = guest_cpu_cap_has(vcpu, X86_FEATURE_DS);
break;
case MSR_PEBS_DATA_CFG:
perf_capabilities = vcpu_get_perf_capabilities(vcpu);
@@ -194,6 +230,9 @@ static inline void intel_pmu_release_guest_lbr_event(struct kvm_vcpu *vcpu)
{
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ if (!lbr_desc)
+ return;
+
if (lbr_desc->event) {
perf_event_release_kernel(lbr_desc->event);
lbr_desc->event = NULL;
@@ -235,6 +274,9 @@ int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu)
PERF_SAMPLE_BRANCH_USER,
};
+ if (WARN_ON_ONCE(!lbr_desc))
+ return 0;
+
if (unlikely(lbr_desc->event)) {
__set_bit(INTEL_PMC_IDX_FIXED_VLBR, pmu->pmc_in_use);
return 0;
@@ -279,9 +321,9 @@ static bool intel_pmu_handle_lbr_msrs_access(struct kvm_vcpu *vcpu,
local_irq_disable();
if (lbr_desc->event->state == PERF_EVENT_STATE_ACTIVE) {
if (read)
- rdmsrl(index, msr_info->data);
+ rdmsrq(index, msr_info->data);
else
- wrmsrl(index, msr_info->data);
+ wrmsrq(index, msr_info->data);
__set_bit(INTEL_PMC_IDX_FIXED_VLBR, vcpu_to_pmu(vcpu)->pmc_in_use);
local_irq_enable();
return true;
@@ -348,14 +390,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
- if (data & pmu->fixed_ctr_ctrl_mask)
+ if (data & pmu->fixed_ctr_ctrl_rsvd)
return 1;
if (pmu->fixed_ctr_ctrl != data)
reprogram_fixed_counters(pmu, data);
break;
case MSR_IA32_PEBS_ENABLE:
- if (data & pmu->pebs_enable_mask)
+ if (data & pmu->pebs_enable_rsvd)
return 1;
if (pmu->pebs_enable != data) {
@@ -365,13 +407,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
break;
case MSR_IA32_DS_AREA:
- if (is_noncanonical_address(data, vcpu))
+ if (is_noncanonical_msr_address(data, vcpu))
return 1;
pmu->ds_area = data;
break;
case MSR_PEBS_DATA_CFG:
- if (data & pmu->pebs_data_cfg_mask)
+ if (data & pmu->pebs_data_cfg_rsvd)
return 1;
pmu->pebs_data_cfg = data;
@@ -436,8 +478,8 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index)
};
u64 eventsel;
- BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_PMC_MAX_FIXED);
- BUILD_BUG_ON(index >= KVM_PMC_MAX_FIXED);
+ BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUTNERS);
+ BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUTNERS);
/*
* Yell if perf reports support for a fixed counter but perf doesn't
@@ -448,6 +490,14 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index)
return eventsel;
}
+static void intel_pmu_enable_fixed_counter_bits(struct kvm_pmu *pmu, u64 bits)
+{
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
+ pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits);
+}
+
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -456,8 +506,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
union cpuid10_eax eax;
union cpuid10_edx edx;
u64 perf_capabilities;
- u64 counter_mask;
- int i;
+ u64 counter_rsvd;
+
+ if (!lbr_desc)
+ return;
memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
@@ -501,22 +553,24 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
((u64)1 << edx.split.bit_width_fixed) - 1;
}
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
- pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
- counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
+ intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL |
+ INTEL_FIXED_0_USER |
+ INTEL_FIXED_0_ENABLE_PMI);
+
+ counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX));
- pmu->global_ctrl_mask = counter_mask;
+ pmu->global_ctrl_rsvd = counter_rsvd;
/*
* GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET)
* share reserved bit definitions. The kernel just happens to use
* OVF_CTRL for the names.
*/
- pmu->global_status_mask = pmu->global_ctrl_mask
+ pmu->global_status_rsvd = pmu->global_ctrl_rsvd
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
if (vmx_pt_mode_is_host_guest())
- pmu->global_status_mask &=
+ pmu->global_status_rsvd &=
~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
entry = kvm_find_cpuid_entry_index(vcpu, 7, 0);
@@ -533,7 +587,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
perf_capabilities = vcpu_get_perf_capabilities(vcpu);
- if (cpuid_model_is_consistent(vcpu) &&
+ if (intel_pmu_lbr_is_compatible(vcpu) &&
(perf_capabilities & PMU_CAP_LBR_FMT))
memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps));
else
@@ -544,15 +598,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
if (perf_capabilities & PERF_CAP_PEBS_FORMAT) {
if (perf_capabilities & PERF_CAP_PEBS_BASELINE) {
- pmu->pebs_enable_mask = counter_mask;
+ pmu->pebs_enable_rsvd = counter_rsvd;
pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
- pmu->fixed_ctr_ctrl_mask &=
- ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4));
- }
- pmu->pebs_data_cfg_mask = ~0xff00000full;
+ pmu->pebs_data_cfg_rsvd = ~0xff00000full;
+ intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE);
} else {
- pmu->pebs_enable_mask =
+ pmu->pebs_enable_rsvd =
~((1ull << pmu->nr_arch_gp_counters) - 1);
}
}
@@ -564,14 +615,17 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
- for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) {
+ if (!lbr_desc)
+ return;
+
+ for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
pmu->gp_counters[i].current_config = 0;
}
- for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
+ for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUTNERS; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX;
@@ -671,6 +725,9 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
+ if (WARN_ON_ONCE(!lbr_desc))
+ return;
+
if (!lbr_desc->event) {
vmx_disable_lbr_msrs_passthrough(vcpu);
if (vmcs_read64(GUEST_IA32_DEBUGCTL) & DEBUGCTLMSR_LBR)
@@ -731,6 +788,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
.deliver_pmi = intel_pmu_deliver_pmi,
.cleanup = intel_pmu_cleanup,
.EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
- .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
+ .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS,
.MIN_NR_GP_COUNTERS = 1,
};
diff --git a/arch/x86/kvm/vmx/pmu_intel.h b/arch/x86/kvm/vmx/pmu_intel.h
new file mode 100644
index 000000000000..5620d0882cdc
--- /dev/null
+++ b/arch/x86/kvm/vmx/pmu_intel.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_PMU_INTEL_H
+#define __KVM_X86_VMX_PMU_INTEL_H
+
+#include <linux/kvm_host.h>
+
+bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu);
+int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
+
+struct lbr_desc {
+ /* Basic info about guest LBR records. */
+ struct x86_pmu_lbr records;
+
+ /*
+ * Emulate LBR feature via passthrough LBR registers when the
+ * per-vcpu guest LBR event is scheduled on the current pcpu.
+ *
+ * The records may be inaccurate if the host reclaims the LBR.
+ */
+ struct perf_event *event;
+
+ /* True if LBRs are marked as not intercepted in the MSR bitmap */
+ bool msr_passthrough;
+};
+
+extern struct x86_pmu_lbr vmx_lbr_caps;
+
+#endif /* __KVM_X86_VMX_PMU_INTEL_H */
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index af662312fd07..99d1d599ff8c 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -11,6 +11,7 @@
#include "posted_intr.h"
#include "trace.h"
#include "vmx.h"
+#include "tdx.h"
/*
* Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler()
@@ -31,9 +32,11 @@ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
*/
static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
-static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING
+
+struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
- return &(to_vmx(vcpu)->pi_desc);
+ return &(to_vt(vcpu)->pi_desc);
}
static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
@@ -53,7 +56,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
struct pi_desc old, new;
unsigned long flags;
unsigned int dest;
@@ -89,9 +92,20 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
* current pCPU if the task was migrated.
*/
if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
- raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
- list_del(&vmx->pi_wakeup_list);
- raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu);
+
+ /*
+ * In addition to taking the wakeup lock for the regular/IRQ
+ * context, tell lockdep it is being taken for the "sched out"
+ * context as well. vCPU loads happens in task context, and
+ * this is taking the lock of the *previous* CPU, i.e. can race
+ * with both the scheduler and the wakeup handler.
+ */
+ raw_spin_lock(spinlock);
+ spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_);
+ list_del(&vt->pi_wakeup_list);
+ spin_release(&spinlock->dep_map, _RET_IP_);
+ raw_spin_unlock(spinlock);
}
dest = cpu_physical_id(cpu);
@@ -107,7 +121,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
* handle task migration (@cpu != vcpu->cpu).
*/
new.ndst = dest;
- new.sn = 0;
+ __pi_clear_sn(&new);
/*
* Restore the notification vector; in the blocking case, the
@@ -146,18 +160,30 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm)
static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
struct pi_desc old, new;
- unsigned long flags;
- local_irq_save(flags);
+ lockdep_assert_irqs_disabled();
- raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
- list_add_tail(&vmx->pi_wakeup_list,
+ /*
+ * Acquire the wakeup lock using the "sched out" context to workaround
+ * a lockdep false positive. When this is called, schedule() holds
+ * various per-CPU scheduler locks. When the wakeup handler runs, it
+ * holds this CPU's wakeup lock while calling try_to_wake_up(), which
+ * can eventually take the aforementioned scheduler locks, which causes
+ * lockdep to assume there is deadlock.
+ *
+ * Deadlock can't actually occur because IRQs are disabled for the
+ * entirety of the sched_out critical section, i.e. the wakeup handler
+ * can't run while the scheduler locks are held.
+ */
+ raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu),
+ PI_LOCK_SCHED_OUT);
+ list_add_tail(&vt->pi_wakeup_list,
&per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
- WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
+ WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking");
old.control = READ_ONCE(pi_desc->control);
do {
@@ -176,8 +202,6 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
*/
if (pi_test_on(&new))
__apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
-
- local_irq_restore(flags);
}
static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
@@ -190,7 +214,8 @@ static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)
* notification vector is switched to the one that calls
* back to the pi_wakeup_handler() function.
*/
- return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(vcpu->kvm);
+ return (vmx_can_use_ipiv(vcpu) && !is_td_vcpu(vcpu)) ||
+ vmx_can_use_vtd_pi(vcpu->kvm);
}
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@ -200,7 +225,9 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
if (!vmx_needs_pi_wakeup(vcpu))
return;
- if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
+ if (kvm_vcpu_is_blocking(vcpu) &&
+ ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) ||
+ (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu))))
pi_enable_wakeup_handler(vcpu);
/*
@@ -220,13 +247,13 @@ void pi_wakeup_handler(void)
int cpu = smp_processor_id();
struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu);
raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu);
- struct vcpu_vmx *vmx;
+ struct vcpu_vt *vt;
raw_spin_lock(spinlock);
- list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) {
+ list_for_each_entry(vt, wakeup_list, pi_wakeup_list) {
- if (pi_test_on(&vmx->pi_desc))
- kvm_vcpu_wake_up(&vmx->vcpu);
+ if (pi_test_on(&vt->pi_desc))
+ kvm_vcpu_wake_up(vt_to_vcpu(vt));
}
raw_spin_unlock(spinlock);
}
@@ -274,6 +301,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
+ bool enable_remapped_mode = true;
struct kvm_lapic_irq irq;
struct kvm_vcpu *vcpu;
struct vcpu_data vcpu_info;
@@ -312,21 +340,8 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
kvm_set_msi_irq(kvm, e, &irq);
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq)) {
- /*
- * Make sure the IRTE is in remapped mode if
- * we don't handle it in posted mode.
- */
- ret = irq_set_vcpu_affinity(host_irq, NULL);
- if (ret < 0) {
- printk(KERN_INFO
- "failed to back to remapped mode, irq: %u\n",
- host_irq);
- goto out;
- }
-
+ !kvm_irq_is_postable(&irq))
continue;
- }
vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
vcpu_info.vector = irq.vector;
@@ -334,11 +349,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
vcpu_info.vector, vcpu_info.pi_desc_addr, set);
- if (set)
- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- else
- ret = irq_set_vcpu_affinity(host_irq, NULL);
+ if (!set)
+ continue;
+
+ enable_remapped_mode = false;
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
if (ret < 0) {
printk(KERN_INFO "%s: failed to update PI IRTE\n",
__func__);
@@ -346,6 +362,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
}
}
+ if (enable_remapped_mode)
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+
ret = 0;
out:
srcu_read_unlock(&kvm->irq_srcu, idx);
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 26992076552e..68605ca7ef68 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -2,97 +2,10 @@
#ifndef __KVM_X86_VMX_POSTED_INTR_H
#define __KVM_X86_VMX_POSTED_INTR_H
-#define POSTED_INTR_ON 0
-#define POSTED_INTR_SN 1
+#include <linux/bitmap.h>
+#include <asm/posted_intr.h>
-#define PID_TABLE_ENTRY_VALID 1
-
-/* Posted-Interrupt Descriptor */
-struct pi_desc {
- u32 pir[8]; /* Posted interrupt requested */
- union {
- struct {
- /* bit 256 - Outstanding Notification */
- u16 on : 1,
- /* bit 257 - Suppress Notification */
- sn : 1,
- /* bit 271:258 - Reserved */
- rsvd_1 : 14;
- /* bit 279:272 - Notification Vector */
- u8 nv;
- /* bit 287:280 - Reserved */
- u8 rsvd_2;
- /* bit 319:288 - Notification Destination */
- u32 ndst;
- };
- u64 control;
- };
- u32 rsvd[6];
-} __aligned(64);
-
-static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
-{
- return test_and_set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
-{
- return test_and_clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
-{
- return test_and_clear_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
-{
- return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
-}
-
-static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
-{
- return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
-}
-
-static inline void pi_set_sn(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_set_on(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_on(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_sn(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_on(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_sn(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
+struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu);
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
@@ -103,4 +16,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void vmx_pi_start_assignment(struct kvm *kvm);
+static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
+{
+ int vec;
+
+ vec = find_last_bit((unsigned long *)pi_desc->pir, 256);
+ return vec < 256 ? vec : -1;
+}
+
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 6fef01e0536e..df1d0cf76947 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -2,14 +2,14 @@
/* Copyright(c) 2021 Intel Corporation. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <asm/msr.h>
#include <asm/sgx.h>
-#include "cpuid.h"
+#include "x86.h"
#include "kvm_cache_regs.h"
#include "nested.h"
#include "sgx.h"
#include "vmx.h"
-#include "x86.h"
bool __read_mostly enable_sgx = 1;
module_param_named(sgx, enable_sgx, bool, 0444);
@@ -38,7 +38,7 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
fault = true;
} else if (likely(is_64_bit_mode(vcpu))) {
*gva = vmx_get_untagged_addr(vcpu, *gva, 0);
- fault = is_noncanonical_address(*gva, vcpu);
+ fault = is_noncanonical_address(*gva, vcpu, 0);
} else {
*gva &= 0xffffffff;
fault = (s.unusable) ||
@@ -123,7 +123,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
* likely than a bad userspace address.
*/
if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
- guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) {
+ guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) {
memset(&ex, 0, sizeof(ex));
ex.vector = PF_VECTOR;
ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
@@ -274,7 +274,7 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
* simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
* enforce restriction of access to the PROVISIONKEY.
*/
- contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL);
if (!contents)
return -ENOMEM;
@@ -366,7 +366,7 @@ static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
return true;
if (leaf >= EAUG && leaf <= EMODT)
- return guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2);
return false;
}
@@ -382,8 +382,8 @@ int handle_encls(struct kvm_vcpu *vcpu)
{
u32 leaf = (u32)kvm_rax_read(vcpu);
- if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
- !guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+ if (!enable_sgx || !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) {
kvm_queue_exception(vcpu, UD_VECTOR);
} else if (!encls_leaf_enabled_in_guest(vcpu, leaf) ||
!sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) {
@@ -412,16 +412,16 @@ void setup_default_sgx_lepubkeyhash(void)
* MSRs exist but are read-only (locked and not writable).
*/
if (!enable_sgx || boot_cpu_has(X86_FEATURE_SGX_LC) ||
- rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
+ rdmsrq_safe(MSR_IA32_SGXLEPUBKEYHASH0, &sgx_pubkey_hash[0])) {
sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
} else {
/* MSR_IA32_SGXLEPUBKEYHASH0 is read above */
- rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
- rdmsrl(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
- rdmsrl(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
+ rdmsrq(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
+ rdmsrq(MSR_IA32_SGXLEPUBKEYHASH2, sgx_pubkey_hash[2]);
+ rdmsrq(MSR_IA32_SGXLEPUBKEYHASH3, sgx_pubkey_hash[3]);
}
}
@@ -480,15 +480,15 @@ void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (!cpu_has_vmx_encls_vmexit())
return;
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) &&
sgx_enabled_in_guest_bios(vcpu)) {
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) {
bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
if (sgx_intercept_encls_ecreate(vcpu))
bitmap |= (1 << ECREATE);
}
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2))
bitmap &= ~GENMASK_ULL(EMODT, EAUG);
/*
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
new file mode 100644
index 000000000000..b952bc673271
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -0,0 +1,3526 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/cleanup.h>
+#include <linux/cpu.h>
+#include <asm/cpufeature.h>
+#include <asm/fpu/xcr.h>
+#include <linux/misc_cgroup.h>
+#include <linux/mmu_context.h>
+#include <asm/tdx.h>
+#include "capabilities.h"
+#include "mmu.h"
+#include "x86_ops.h"
+#include "lapic.h"
+#include "tdx.h"
+#include "vmx.h"
+#include "mmu/spte.h"
+#include "common.h"
+#include "posted_intr.h"
+#include "irq.h"
+#include <trace/events/kvm.h>
+#include "trace.h"
+
+#pragma GCC poison to_vmx
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#define pr_tdx_error(__fn, __err) \
+ pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
+
+#define __pr_tdx_error_N(__fn_str, __err, __fmt, ...) \
+ pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt, __err, __VA_ARGS__)
+
+#define pr_tdx_error_1(__fn, __err, __rcx) \
+ __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
+
+#define pr_tdx_error_2(__fn, __err, __rcx, __rdx) \
+ __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
+
+#define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8) \
+ __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
+
+bool enable_tdx __ro_after_init;
+module_param_named(tdx, enable_tdx, bool, 0444);
+
+#define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
+#define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
+
+static enum cpuhp_state tdx_cpuhp_state;
+
+static const struct tdx_sys_info *tdx_sysinfo;
+
+void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
+{
+ KVM_BUG_ON(1, tdx->vcpu.kvm);
+ pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
+}
+
+void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
+ u64 val, u64 err)
+{
+ KVM_BUG_ON(1, tdx->vcpu.kvm);
+ pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
+}
+
+#define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
+
+static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
+{
+ return container_of(kvm, struct kvm_tdx, kvm);
+}
+
+static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
+{
+ return container_of(vcpu, struct vcpu_tdx, vcpu);
+}
+
+static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
+{
+ u64 val = KVM_SUPPORTED_TD_ATTRS;
+
+ if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
+ return 0;
+
+ val &= td_conf->attributes_fixed0;
+
+ return val;
+}
+
+static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
+{
+ u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
+
+ if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
+ return 0;
+
+ val &= td_conf->xfam_fixed0;
+
+ return val;
+}
+
+static int tdx_get_guest_phys_addr_bits(const u32 eax)
+{
+ return (eax & GENMASK(23, 16)) >> 16;
+}
+
+static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
+{
+ return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
+}
+
+#define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
+
+static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
+{
+ return entry->function == 7 && entry->index == 0 &&
+ (entry->ebx & TDX_FEATURE_TSX);
+}
+
+static void clear_tsx(struct kvm_cpuid_entry2 *entry)
+{
+ entry->ebx &= ~TDX_FEATURE_TSX;
+}
+
+static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
+{
+ return entry->function == 7 && entry->index == 0 &&
+ (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
+}
+
+static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
+{
+ entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
+}
+
+static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
+{
+ if (has_tsx(entry))
+ clear_tsx(entry);
+
+ if (has_waitpkg(entry))
+ clear_waitpkg(entry);
+}
+
+static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
+{
+ return has_tsx(entry) || has_waitpkg(entry);
+}
+
+#define KVM_TDX_CPUID_NO_SUBLEAF ((__u32)-1)
+
+static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
+{
+ const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+
+ entry->function = (u32)td_conf->cpuid_config_leaves[idx];
+ entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
+ entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
+ entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
+ entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
+ entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
+
+ if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
+ entry->index = 0;
+
+ /*
+ * The TDX module doesn't allow configuring the guest phys addr bits
+ * (EAX[23:16]). However, KVM uses it as an interface to the userspace
+ * to configure the GPAW. Report these bits as configurable.
+ */
+ if (entry->function == 0x80000008)
+ entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
+
+ tdx_clear_unsupported_cpuid(entry);
+}
+
+static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
+ struct kvm_tdx_capabilities *caps)
+{
+ int i;
+
+ caps->supported_attrs = tdx_get_supported_attrs(td_conf);
+ if (!caps->supported_attrs)
+ return -EIO;
+
+ caps->supported_xfam = tdx_get_supported_xfam(td_conf);
+ if (!caps->supported_xfam)
+ return -EIO;
+
+ caps->cpuid.nent = td_conf->num_cpuid_config;
+
+ for (i = 0; i < td_conf->num_cpuid_config; i++)
+ td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
+
+ return 0;
+}
+
+/*
+ * Some SEAMCALLs acquire the TDX module globally, and can fail with
+ * TDX_OPERAND_BUSY. Use a global mutex to serialize these SEAMCALLs.
+ */
+static DEFINE_MUTEX(tdx_lock);
+
+static atomic_t nr_configured_hkid;
+
+static bool tdx_operand_busy(u64 err)
+{
+ return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
+}
+
+
+/*
+ * A per-CPU list of TD vCPUs associated with a given CPU.
+ * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
+ * list.
+ * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
+ * the old CPU during the IPI callback running on the old CPU, and then added
+ * to the per-CPU list of the new CPU.
+ * - When a TD is tearing down, all vCPUs are disassociated from their current
+ * running CPUs and removed from the per-CPU list during the IPI callback
+ * running on those CPUs.
+ * - When a CPU is brought down, traverse the per-CPU list to disassociate all
+ * associated TD vCPUs and remove them from the per-CPU list.
+ */
+static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
+
+static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
+{
+ return to_tdx(vcpu)->vp_enter_args.r10;
+}
+
+static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
+{
+ return to_tdx(vcpu)->vp_enter_args.r11;
+}
+
+static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
+ long val)
+{
+ to_tdx(vcpu)->vp_enter_args.r10 = val;
+}
+
+static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
+ unsigned long val)
+{
+ to_tdx(vcpu)->vp_enter_args.r11 = val;
+}
+
+static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
+{
+ tdx_guest_keyid_free(kvm_tdx->hkid);
+ kvm_tdx->hkid = -1;
+ atomic_dec(&nr_configured_hkid);
+ misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
+ put_misc_cg(kvm_tdx->misc_cg);
+ kvm_tdx->misc_cg = NULL;
+}
+
+static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
+{
+ return kvm_tdx->hkid > 0;
+}
+
+static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
+{
+ lockdep_assert_irqs_disabled();
+
+ list_del(&to_tdx(vcpu)->cpu_list);
+
+ /*
+ * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
+ * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
+ * to its list before it's deleted from this CPU's list.
+ */
+ smp_wmb();
+
+ vcpu->cpu = -1;
+}
+
+static void tdx_clear_page(struct page *page)
+{
+ const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
+ void *dest = page_to_virt(page);
+ unsigned long i;
+
+ /*
+ * The page could have been poisoned. MOVDIR64B also clears
+ * the poison bit so the kernel can safely use the page again.
+ */
+ for (i = 0; i < PAGE_SIZE; i += 64)
+ movdir64b(dest + i, zero_page);
+ /*
+ * MOVDIR64B store uses WC buffer. Prevent following memory reads
+ * from seeing potentially poisoned cache.
+ */
+ __mb();
+}
+
+static void tdx_no_vcpus_enter_start(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
+static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
+}
+
+/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
+static int __tdx_reclaim_page(struct page *page)
+{
+ u64 err, rcx, rdx, r8;
+
+ err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
+
+ /*
+ * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
+ * before the HKID is released and control pages have also been
+ * released at this point, so there is no possibility of contention.
+ */
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int tdx_reclaim_page(struct page *page)
+{
+ int r;
+
+ r = __tdx_reclaim_page(page);
+ if (!r)
+ tdx_clear_page(page);
+ return r;
+}
+
+
+/*
+ * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
+ * private KeyID. Assume the cache associated with the TDX private KeyID has
+ * been flushed.
+ */
+static void tdx_reclaim_control_page(struct page *ctrl_page)
+{
+ /*
+ * Leak the page if the kernel failed to reclaim the page.
+ * The kernel cannot use it safely anymore.
+ */
+ if (tdx_reclaim_page(ctrl_page))
+ return;
+
+ __free_page(ctrl_page);
+}
+
+struct tdx_flush_vp_arg {
+ struct kvm_vcpu *vcpu;
+ u64 err;
+};
+
+static void tdx_flush_vp(void *_arg)
+{
+ struct tdx_flush_vp_arg *arg = _arg;
+ struct kvm_vcpu *vcpu = arg->vcpu;
+ u64 err;
+
+ arg->err = 0;
+ lockdep_assert_irqs_disabled();
+
+ /* Task migration can race with CPU offlining. */
+ if (unlikely(vcpu->cpu != raw_smp_processor_id()))
+ return;
+
+ /*
+ * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
+ * list tracking still needs to be updated so that it's correct if/when
+ * the vCPU does get initialized.
+ */
+ if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
+ /*
+ * No need to retry. TDX Resources needed for TDH.VP.FLUSH are:
+ * TDVPR as exclusive, TDR as shared, and TDCS as shared. This
+ * vp flush function is called when destructing vCPU/TD or vCPU
+ * migration. No other thread uses TDVPR in those cases.
+ */
+ err = tdh_vp_flush(&to_tdx(vcpu)->vp);
+ if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
+ /*
+ * This function is called in IPI context. Do not use
+ * printk to avoid console semaphore.
+ * The caller prints out the error message, instead.
+ */
+ if (err)
+ arg->err = err;
+ }
+ }
+
+ tdx_disassociate_vp(vcpu);
+}
+
+static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
+{
+ struct tdx_flush_vp_arg arg = {
+ .vcpu = vcpu,
+ };
+ int cpu = vcpu->cpu;
+
+ if (unlikely(cpu == -1))
+ return;
+
+ smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
+ if (KVM_BUG_ON(arg.err, vcpu->kvm))
+ pr_tdx_error(TDH_VP_FLUSH, arg.err);
+}
+
+void tdx_disable_virtualization_cpu(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
+ struct tdx_flush_vp_arg arg;
+ struct vcpu_tdx *tdx, *tmp;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
+ list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
+ arg.vcpu = &tdx->vcpu;
+ tdx_flush_vp(&arg);
+ }
+ local_irq_restore(flags);
+}
+
+#define TDX_SEAMCALL_RETRIES 10000
+
+static void smp_func_do_phymem_cache_wb(void *unused)
+{
+ u64 err = 0;
+ bool resume;
+ int i;
+
+ /*
+ * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
+ * KeyID on the package or core. The TDX module may not finish the
+ * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead. The
+ * kernel should retry it until it returns success w/o rescheduling.
+ */
+ for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
+ resume = !!err;
+ err = tdh_phymem_cache_wb(resume);
+ switch (err) {
+ case TDX_INTERRUPTED_RESUMABLE:
+ continue;
+ case TDX_NO_HKID_READY_TO_WBCACHE:
+ err = TDX_SUCCESS; /* Already done by other thread */
+ fallthrough;
+ default:
+ goto out;
+ }
+ }
+
+out:
+ if (WARN_ON_ONCE(err))
+ pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
+}
+
+void tdx_mmu_release_hkid(struct kvm *kvm)
+{
+ bool packages_allocated, targets_allocated;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages, targets;
+ struct kvm_vcpu *vcpu;
+ unsigned long j;
+ int i;
+ u64 err;
+
+ if (!is_hkid_assigned(kvm_tdx))
+ return;
+
+ packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
+ targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
+ cpus_read_lock();
+
+ kvm_for_each_vcpu(j, vcpu, kvm)
+ tdx_flush_vp_on_cpu(vcpu);
+
+ /*
+ * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
+ * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
+ * Multiple TDX guests can be destroyed simultaneously. Take the
+ * mutex to prevent it from getting error.
+ */
+ mutex_lock(&tdx_lock);
+
+ /*
+ * Releasing HKID is in vm_destroy().
+ * After the above flushing vps, there should be no more vCPU
+ * associations, as all vCPU fds have been released at this stage.
+ */
+ err = tdh_mng_vpflushdone(&kvm_tdx->td);
+ if (err == TDX_FLUSHVP_NOT_DONE)
+ goto out;
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
+ pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
+ kvm_tdx->hkid);
+ goto out;
+ }
+
+ for_each_online_cpu(i) {
+ if (packages_allocated &&
+ cpumask_test_and_set_cpu(topology_physical_package_id(i),
+ packages))
+ continue;
+ if (targets_allocated)
+ cpumask_set_cpu(i, targets);
+ }
+ if (targets_allocated)
+ on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
+ else
+ on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
+ /*
+ * In the case of error in smp_func_do_phymem_cache_wb(), the following
+ * tdh_mng_key_freeid() will fail.
+ */
+ err = tdh_mng_key_freeid(&kvm_tdx->td);
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error(TDH_MNG_KEY_FREEID, err);
+ pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
+ kvm_tdx->hkid);
+ } else {
+ tdx_hkid_free(kvm_tdx);
+ }
+
+out:
+ mutex_unlock(&tdx_lock);
+ cpus_read_unlock();
+ free_cpumask_var(targets);
+ free_cpumask_var(packages);
+}
+
+static void tdx_reclaim_td_control_pages(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err;
+ int i;
+
+ /*
+ * tdx_mmu_release_hkid() failed to reclaim HKID. Something went wrong
+ * heavily with TDX module. Give up freeing TD pages. As the function
+ * already warned, don't warn it again.
+ */
+ if (is_hkid_assigned(kvm_tdx))
+ return;
+
+ if (kvm_tdx->td.tdcs_pages) {
+ for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ if (!kvm_tdx->td.tdcs_pages[i])
+ continue;
+
+ tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
+ }
+ kfree(kvm_tdx->td.tdcs_pages);
+ kvm_tdx->td.tdcs_pages = NULL;
+ }
+
+ if (!kvm_tdx->td.tdr_page)
+ return;
+
+ if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
+ return;
+
+ /*
+ * Use a SEAMCALL to ask the TDX module to flush the cache based on the
+ * KeyID. TDX module may access TDR while operating on TD (Especially
+ * when it is reclaiming TDCS).
+ */
+ err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+ return;
+ }
+ tdx_clear_page(kvm_tdx->td.tdr_page);
+
+ __free_page(kvm_tdx->td.tdr_page);
+ kvm_tdx->td.tdr_page = NULL;
+}
+
+void tdx_vm_destroy(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ tdx_reclaim_td_control_pages(kvm);
+
+ kvm_tdx->state = TD_STATE_UNINITIALIZED;
+}
+
+static int tdx_do_tdh_mng_key_config(void *param)
+{
+ struct kvm_tdx *kvm_tdx = param;
+ u64 err;
+
+ /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
+ err = tdh_mng_key_config(&kvm_tdx->td);
+
+ if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
+ pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int tdx_vm_init(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ kvm->arch.has_protected_state = true;
+ kvm->arch.has_private_mem = true;
+ kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+
+ /*
+ * Because guest TD is protected, VMM can't parse the instruction in TD.
+ * Instead, guest uses MMIO hypercall. For unmodified device driver,
+ * #VE needs to be injected for MMIO and #VE handler in TD converts MMIO
+ * instruction into MMIO hypercall.
+ *
+ * SPTE value for MMIO needs to be setup so that #VE is injected into
+ * TD instead of triggering EPT MISCONFIG.
+ * - RWX=0 so that EPT violation is triggered.
+ * - suppress #VE bit is cleared to inject #VE.
+ */
+ kvm_mmu_set_mmio_spte_value(kvm, 0);
+
+ /*
+ * TDX has its own limit of maximum vCPUs it can support for all
+ * TDX guests in addition to KVM_MAX_VCPUS. TDX module reports
+ * such limit via the MAX_VCPU_PER_TD global metadata. In
+ * practice, it reflects the number of logical CPUs that ALL
+ * platforms that the TDX module supports can possibly have.
+ *
+ * Limit TDX guest's maximum vCPUs to the number of logical CPUs
+ * the platform has. Simply forwarding the MAX_VCPU_PER_TD to
+ * userspace would result in an unpredictable ABI.
+ */
+ kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
+
+ kvm_tdx->state = TD_STATE_UNINITIALIZED;
+
+ return 0;
+}
+
+int tdx_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ if (kvm_tdx->state != TD_STATE_INITIALIZED)
+ return -EIO;
+
+ /*
+ * TDX module mandates APICv, which requires an in-kernel local APIC.
+ * Disallow an in-kernel I/O APIC, because level-triggered interrupts
+ * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
+ */
+ if (!irqchip_split(vcpu->kvm))
+ return -EINVAL;
+
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
+ vcpu->arch.apic->guest_apic_protected = true;
+ INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
+
+ vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
+
+ vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
+ vcpu->arch.cr0_guest_owned_bits = -1ul;
+ vcpu->arch.cr4_guest_owned_bits = -1ul;
+
+ /* KVM can't change TSC offset/multiplier as TDX module manages them. */
+ vcpu->arch.guest_tsc_protected = true;
+ vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
+ vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
+ vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
+ vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
+
+ vcpu->arch.guest_state_protected =
+ !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
+
+ if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
+ vcpu->arch.xfd_no_write_intercept = true;
+
+ tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
+ __pi_set_sn(&tdx->vt.pi_desc);
+
+ tdx->state = VCPU_TD_STATE_UNINITIALIZED;
+
+ return 0;
+}
+
+void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ vmx_vcpu_pi_load(vcpu, cpu);
+ if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
+ return;
+
+ tdx_flush_vp_on_cpu(vcpu);
+
+ KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
+ local_irq_disable();
+ /*
+ * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
+ * vcpu->cpu is read before tdx->cpu_list.
+ */
+ smp_rmb();
+
+ list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
+ local_irq_enable();
+}
+
+bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+ /*
+ * KVM can't get the interrupt status of TDX guest and it assumes
+ * interrupt is always allowed unless TDX guest calls TDVMCALL with HLT,
+ * which passes the interrupt blocked flag.
+ */
+ return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
+ !to_tdx(vcpu)->vp_enter_args.r12;
+}
+
+bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
+{
+ u64 vcpu_state_details;
+
+ if (pi_has_pending_interrupt(vcpu))
+ return true;
+
+ /*
+ * Only check RVI pending for HALTED case with IRQ enabled.
+ * For non-HLT cases, KVM doesn't care about STI/SS shadows. And if the
+ * interrupt was pending before TD exit, then it _must_ be blocked,
+ * otherwise the interrupt would have been serviced at the instruction
+ * boundary.
+ */
+ if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
+ to_tdx(vcpu)->vp_enter_args.r12)
+ return false;
+
+ vcpu_state_details =
+ td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
+
+ return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
+}
+
+/*
+ * Compared to vmx_prepare_switch_to_guest(), there is not much to do
+ * as SEAMCALL/SEAMRET calls take care of most of save and restore.
+ */
+void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ if (vt->guest_state_loaded)
+ return;
+
+ if (likely(is_64bit_mm(current->mm)))
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
+ else
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+
+ vt->host_debugctlmsr = get_debugctlmsr();
+
+ vt->guest_state_loaded = true;
+}
+
+struct tdx_uret_msr {
+ u32 msr;
+ unsigned int slot;
+ u64 defval;
+};
+
+static struct tdx_uret_msr tdx_uret_msrs[] = {
+ {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
+ {.msr = MSR_STAR,},
+ {.msr = MSR_LSTAR,},
+ {.msr = MSR_TSC_AUX,},
+};
+
+static void tdx_user_return_msr_update_cache(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
+ kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
+ tdx_uret_msrs[i].defval);
+}
+
+static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ if (!vt->guest_state_loaded)
+ return;
+
+ ++vcpu->stat.host_state_reload;
+ wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
+
+ if (tdx->guest_entered) {
+ tdx_user_return_msr_update_cache();
+ tdx->guest_entered = false;
+ }
+
+ vt->guest_state_loaded = false;
+}
+
+void tdx_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ vmx_vcpu_pi_put(vcpu);
+ tdx_prepare_switch_to_host(vcpu);
+}
+
+void tdx_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ int i;
+
+ /*
+ * It is not possible to reclaim pages while hkid is assigned. It might
+ * be assigned if:
+ * 1. the TD VM is being destroyed but freeing hkid failed, in which
+ * case the pages are leaked
+ * 2. TD VCPU creation failed and this on the error path, in which case
+ * there is nothing to do anyway
+ */
+ if (is_hkid_assigned(kvm_tdx))
+ return;
+
+ if (tdx->vp.tdcx_pages) {
+ for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ if (tdx->vp.tdcx_pages[i])
+ tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
+ }
+ kfree(tdx->vp.tdcx_pages);
+ tdx->vp.tdcx_pages = NULL;
+ }
+ if (tdx->vp.tdvpr_page) {
+ tdx_reclaim_control_page(tdx->vp.tdvpr_page);
+ tdx->vp.tdvpr_page = 0;
+ }
+
+ tdx->state = VCPU_TD_STATE_UNINITIALIZED;
+}
+
+int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
+ to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
+ return -EINVAL;
+
+ return 1;
+}
+
+static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
+{
+ switch (tdvmcall_leaf(vcpu)) {
+ case EXIT_REASON_CPUID:
+ case EXIT_REASON_HLT:
+ case EXIT_REASON_IO_INSTRUCTION:
+ case EXIT_REASON_MSR_READ:
+ case EXIT_REASON_MSR_WRITE:
+ return tdvmcall_leaf(vcpu);
+ case EXIT_REASON_EPT_VIOLATION:
+ return EXIT_REASON_EPT_MISCONFIG;
+ default:
+ break;
+ }
+
+ return EXIT_REASON_TDCALL;
+}
+
+static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u32 exit_reason;
+
+ switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
+ case TDX_SUCCESS:
+ case TDX_NON_RECOVERABLE_VCPU:
+ case TDX_NON_RECOVERABLE_TD:
+ case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
+ case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
+ break;
+ default:
+ return -1u;
+ }
+
+ exit_reason = tdx->vp_enter_ret;
+
+ switch (exit_reason) {
+ case EXIT_REASON_TDCALL:
+ if (tdvmcall_exit_type(vcpu))
+ return EXIT_REASON_VMCALL;
+
+ return tdcall_to_vmx_exit_reason(vcpu);
+ case EXIT_REASON_EPT_MISCONFIG:
+ /*
+ * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
+ * non-instrumentable code with interrupts disabled.
+ */
+ return -1u;
+ default:
+ break;
+ }
+
+ return exit_reason;
+}
+
+static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ guest_state_enter_irqoff();
+
+ tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
+
+ vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
+
+ vt->exit_qualification = tdx->vp_enter_args.rcx;
+ tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
+ tdx->exit_gpa = tdx->vp_enter_args.r8;
+ vt->exit_intr_info = tdx->vp_enter_args.r9;
+
+ vmx_handle_nmi(vcpu);
+
+ guest_state_exit_irqoff();
+}
+
+static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_exit_reason(vcpu).failed_vmentry &&
+ vmx_get_exit_reason(vcpu).full != -1u;
+}
+
+static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+ u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
+
+ /*
+ * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
+ * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
+ *
+ * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
+ * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires target
+ * vCPUs leaving fastpath so that interrupt can be enabled to ensure the
+ * IPIs can be delivered. Return EXIT_FASTPATH_EXIT_HANDLED instead of
+ * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
+ * requester may be blocked endlessly.
+ */
+ if (unlikely(tdx_operand_busy(vp_enter_ret)))
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ return EXIT_FASTPATH_NONE;
+}
+
+#define TDX_REGS_AVAIL_SET (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
+ BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
+ BIT_ULL(VCPU_REGS_RAX) | \
+ BIT_ULL(VCPU_REGS_RBX) | \
+ BIT_ULL(VCPU_REGS_RCX) | \
+ BIT_ULL(VCPU_REGS_RDX) | \
+ BIT_ULL(VCPU_REGS_RBP) | \
+ BIT_ULL(VCPU_REGS_RSI) | \
+ BIT_ULL(VCPU_REGS_RDI) | \
+ BIT_ULL(VCPU_REGS_R8) | \
+ BIT_ULL(VCPU_REGS_R9) | \
+ BIT_ULL(VCPU_REGS_R10) | \
+ BIT_ULL(VCPU_REGS_R11) | \
+ BIT_ULL(VCPU_REGS_R12) | \
+ BIT_ULL(VCPU_REGS_R13) | \
+ BIT_ULL(VCPU_REGS_R14) | \
+ BIT_ULL(VCPU_REGS_R15))
+
+static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+
+ /*
+ * All TDX hosts support PKRU; but even if they didn't,
+ * vcpu->arch.host_pkru would be 0 and the wrpkru would be
+ * skipped.
+ */
+ if (vcpu->arch.host_pkru != 0)
+ wrpkru(vcpu->arch.host_pkru);
+
+ if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
+
+ /*
+ * Likewise, even if a TDX hosts didn't support XSS both arms of
+ * the comparison would be 0 and the wrmsrl would be skipped.
+ */
+ if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
+ wrmsrl(MSR_IA32_XSS, kvm_host.xss);
+}
+
+#define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
+ DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
+ DEBUGCTLMSR_FREEZE_IN_SMM)
+
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ /*
+ * force_immediate_exit requires vCPU entering for events injection with
+ * an immediately exit followed. But The TDX module doesn't guarantee
+ * entry, it's already possible for KVM to _think_ it completely entry
+ * to the guest without actually having done so.
+ * Since KVM never needs to force an immediate exit for TDX, and can't
+ * do direct injection, just warn on force_immediate_exit.
+ */
+ WARN_ON_ONCE(force_immediate_exit);
+
+ /*
+ * Wait until retry of SEPT-zap-related SEAMCALL completes before
+ * allowing vCPU entry to avoid contention with tdh_vp_enter() and
+ * TDCALLs.
+ */
+ if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ trace_kvm_entry(vcpu, force_immediate_exit);
+
+ if (pi_test_on(&vt->pi_desc)) {
+ apic->send_IPI_self(POSTED_INTR_VECTOR);
+
+ if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
+ APIC_VECTOR_MASK, &vt->pi_desc))
+ kvm_wait_lapic_expire(vcpu);
+ }
+
+ tdx_vcpu_enter_exit(vcpu);
+
+ if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED)
+ update_debugctlmsr(vt->host_debugctlmsr);
+
+ tdx_load_host_xsave_state(vcpu);
+ tdx->guest_entered = true;
+
+ vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
+
+ if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
+ return EXIT_FASTPATH_NONE;
+
+ if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
+ return EXIT_FASTPATH_NONE;
+
+ if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
+ kvm_machine_check();
+
+ trace_kvm_exit(vcpu, KVM_ISA_VMX);
+
+ if (unlikely(tdx_failed_vmentry(vcpu)))
+ return EXIT_FASTPATH_NONE;
+
+ return tdx_exit_handlers_fastpath(vcpu);
+}
+
+void tdx_inject_nmi(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.nmi_injections;
+ td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
+ /*
+ * From KVM's perspective, NMI injection is completed right after
+ * writing to PEND_NMI. KVM doesn't care whether an NMI is injected by
+ * the TDX module or not.
+ */
+ vcpu->arch.nmi_injected = false;
+ /*
+ * TDX doesn't support KVM to request NMI window exit. If there is
+ * still a pending vNMI, KVM is not able to inject it along with the
+ * one pending in TDX module in a back-to-back way. Since the previous
+ * vNMI is still pending in TDX module, i.e. it has not been delivered
+ * to TDX guest yet, it's OK to collapse the pending vNMI into the
+ * previous one. The guest is expected to handle all the NMI sources
+ * when handling the first vNMI.
+ */
+ vcpu->arch.nmi_pending = 0;
+}
+
+static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
+{
+ u32 intr_info = vmx_get_intr_info(vcpu);
+
+ /*
+ * Machine checks are handled by handle_exception_irqoff(), or by
+ * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
+ * VM-Entry. NMIs are handled by tdx_vcpu_enter_exit().
+ */
+ if (is_nmi(intr_info) || is_machine_check(intr_info))
+ return 1;
+
+ vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
+ vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
+ vcpu->run->ex.error_code = 0;
+
+ return 0;
+}
+
+static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
+{
+ tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
+ return 1;
+}
+
+static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
+{
+ kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
+ kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
+ kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
+ kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
+ kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
+
+ return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
+}
+
+/*
+ * Split into chunks and check interrupt pending between chunks. This allows
+ * for timely injection of interrupts to prevent issues with guest lockup
+ * detection.
+ */
+#define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
+static void __tdx_map_gpa(struct vcpu_tdx *tdx);
+
+static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ if (vcpu->run->hypercall.ret) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ tdx->vp_enter_args.r11 = tdx->map_gpa_next;
+ return 1;
+ }
+
+ tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
+ if (tdx->map_gpa_next >= tdx->map_gpa_end)
+ return 1;
+
+ /*
+ * Stop processing the remaining part if there is a pending interrupt,
+ * which could be qualified to deliver. Skip checking pending RVI for
+ * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
+ */
+ if (kvm_vcpu_has_events(vcpu)) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
+ tdx->vp_enter_args.r11 = tdx->map_gpa_next;
+ return 1;
+ }
+
+ __tdx_map_gpa(tdx);
+ return 0;
+}
+
+static void __tdx_map_gpa(struct vcpu_tdx *tdx)
+{
+ u64 gpa = tdx->map_gpa_next;
+ u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
+
+ if (size > TDX_MAP_GPA_MAX_LEN)
+ size = TDX_MAP_GPA_MAX_LEN;
+
+ tdx->vcpu.run->exit_reason = KVM_EXIT_HYPERCALL;
+ tdx->vcpu.run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ /*
+ * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+ * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+ * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
+ * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU.
+ */
+ tdx->vcpu.run->hypercall.ret = 0;
+ tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
+ tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
+ tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
+ KVM_MAP_GPA_RANGE_ENCRYPTED :
+ KVM_MAP_GPA_RANGE_DECRYPTED;
+ tdx->vcpu.run->hypercall.flags = KVM_EXIT_HYPERCALL_LONG_MODE;
+
+ tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
+}
+
+static int tdx_map_gpa(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 gpa = tdx->vp_enter_args.r12;
+ u64 size = tdx->vp_enter_args.r13;
+ u64 ret;
+
+ /*
+ * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
+ * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
+ * bit set. If not, the error code is not defined in GHCI for TDX, use
+ * TDVMCALL_STATUS_INVALID_OPERAND for this case.
+ */
+ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
+ ret = TDVMCALL_STATUS_INVALID_OPERAND;
+ goto error;
+ }
+
+ if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
+ !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
+ (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
+ vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
+ ret = TDVMCALL_STATUS_INVALID_OPERAND;
+ goto error;
+ }
+
+ if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
+ ret = TDVMCALL_STATUS_ALIGN_ERROR;
+ goto error;
+ }
+
+ tdx->map_gpa_end = gpa + size;
+ tdx->map_gpa_next = gpa;
+
+ __tdx_map_gpa(tdx);
+ return 0;
+
+error:
+ tdvmcall_set_return_code(vcpu, ret);
+ tdx->vp_enter_args.r11 = gpa;
+ return 1;
+}
+
+static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 *regs = vcpu->run->system_event.data;
+ u64 *module_regs = &tdx->vp_enter_args.r8;
+ int index = VCPU_REGS_RAX;
+
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
+ vcpu->run->system_event.ndata = 16;
+
+ /* Dump 16 general-purpose registers to userspace in ascending order. */
+ regs[index++] = tdx->vp_enter_ret;
+ regs[index++] = tdx->vp_enter_args.rcx;
+ regs[index++] = tdx->vp_enter_args.rdx;
+ regs[index++] = tdx->vp_enter_args.rbx;
+ regs[index++] = 0;
+ regs[index++] = 0;
+ regs[index++] = tdx->vp_enter_args.rsi;
+ regs[index] = tdx->vp_enter_args.rdi;
+ for (index = 0; index < 8; index++)
+ regs[VCPU_REGS_R8 + index] = module_regs[index];
+
+ return 0;
+}
+
+static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 eax, ebx, ecx, edx;
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ /* EAX and ECX for cpuid is stored in R12 and R13. */
+ eax = tdx->vp_enter_args.r12;
+ ecx = tdx->vp_enter_args.r13;
+
+ kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
+
+ tdx->vp_enter_args.r12 = eax;
+ tdx->vp_enter_args.r13 = ebx;
+ tdx->vp_enter_args.r14 = ecx;
+ tdx->vp_enter_args.r15 = edx;
+
+ return 1;
+}
+
+static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.pio.count = 0;
+ return 1;
+}
+
+static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ unsigned long val = 0;
+ int ret;
+
+ ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
+ vcpu->arch.pio.port, &val, 1);
+
+ WARN_ON_ONCE(!ret);
+
+ tdvmcall_set_return_val(vcpu, val);
+
+ return 1;
+}
+
+static int tdx_emulate_io(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ unsigned long val = 0;
+ unsigned int port;
+ u64 size, write;
+ int ret;
+
+ ++vcpu->stat.io_exits;
+
+ size = tdx->vp_enter_args.r12;
+ write = tdx->vp_enter_args.r13;
+ port = tdx->vp_enter_args.r14;
+
+ if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+
+ if (write) {
+ val = tdx->vp_enter_args.r15;
+ ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
+ } else {
+ ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
+ }
+
+ if (!ret)
+ vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
+ tdx_complete_pio_in;
+ else if (!write)
+ tdvmcall_set_return_val(vcpu, val);
+
+ return ret;
+}
+
+static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
+{
+ unsigned long val = 0;
+ gpa_t gpa;
+ int size;
+
+ gpa = vcpu->mmio_fragments[0].gpa;
+ size = vcpu->mmio_fragments[0].len;
+
+ memcpy(&val, vcpu->run->mmio.data, size);
+ tdvmcall_set_return_val(vcpu, val);
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
+ return 1;
+}
+
+static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
+ unsigned long val)
+{
+ if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+ trace_kvm_fast_mmio(gpa);
+ return 0;
+ }
+
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
+ if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
+{
+ unsigned long val;
+
+ if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
+ return -EOPNOTSUPP;
+
+ tdvmcall_set_return_val(vcpu, val);
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
+ return 0;
+}
+
+static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ int size, write, r;
+ unsigned long val;
+ gpa_t gpa;
+
+ size = tdx->vp_enter_args.r12;
+ write = tdx->vp_enter_args.r13;
+ gpa = tdx->vp_enter_args.r14;
+ val = write ? tdx->vp_enter_args.r15 : 0;
+
+ if (size != 1 && size != 2 && size != 4 && size != 8)
+ goto error;
+ if (write != 0 && write != 1)
+ goto error;
+
+ /*
+ * TDG.VP.VMCALL<MMIO> allows only shared GPA, it makes no sense to
+ * do MMIO emulation for private GPA.
+ */
+ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
+ vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
+ goto error;
+
+ gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
+
+ if (write)
+ r = tdx_mmio_write(vcpu, gpa, size, val);
+ else
+ r = tdx_mmio_read(vcpu, gpa, size);
+ if (!r)
+ /* Kernel completed device emulation. */
+ return 1;
+
+ /* Request the device emulation to userspace device model. */
+ vcpu->mmio_is_write = write;
+ if (!write)
+ vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
+
+ vcpu->run->mmio.phys_addr = gpa;
+ vcpu->run->mmio.len = size;
+ vcpu->run->mmio.is_write = write;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+
+ if (write) {
+ memcpy(vcpu->run->mmio.data, &val, size);
+ } else {
+ vcpu->mmio_fragments[0].gpa = gpa;
+ vcpu->mmio_fragments[0].len = size;
+ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
+ }
+ return 0;
+
+error:
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+}
+
+static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ if (tdx->vp_enter_args.r12)
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ else {
+ tdx->vp_enter_args.r11 = 0;
+ tdx->vp_enter_args.r13 = 0;
+ tdx->vp_enter_args.r14 = 0;
+ }
+ return 1;
+}
+
+static int handle_tdvmcall(struct kvm_vcpu *vcpu)
+{
+ switch (tdvmcall_leaf(vcpu)) {
+ case TDVMCALL_MAP_GPA:
+ return tdx_map_gpa(vcpu);
+ case TDVMCALL_REPORT_FATAL_ERROR:
+ return tdx_report_fatal_error(vcpu);
+ case TDVMCALL_GET_TD_VM_CALL_INFO:
+ return tdx_get_td_vm_call_info(vcpu);
+ default:
+ break;
+ }
+
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+}
+
+void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
+{
+ u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
+ TDX_SHARED_BIT_PWL_4;
+
+ if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
+ return;
+
+ td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
+}
+
+static void tdx_unpin(struct kvm *kvm, struct page *page)
+{
+ put_page(page);
+}
+
+static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, struct page *page)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 entry, level_state;
+ u64 err;
+
+ err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
+ if (unlikely(tdx_operand_busy(err))) {
+ tdx_unpin(kvm, page);
+ return -EBUSY;
+ }
+
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
+ tdx_unpin(kvm, page);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
+ * callback tdx_gmem_post_populate() then maps pages into private memory.
+ * through the a seamcall TDH.MEM.PAGE.ADD(). The SEAMCALL also requires the
+ * private EPT structures for the page to have been built before, which is
+ * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
+ * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
+ * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
+ * are no half-initialized shared EPT pages.
+ */
+static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
+ return -EINVAL;
+
+ /* nr_premapped will be decreased when tdh_mem_page_add() is called. */
+ atomic64_inc(&kvm_tdx->nr_premapped);
+ return 0;
+}
+
+int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct page *page = pfn_to_page(pfn);
+
+ /* TODO: handle large pages. */
+ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+ return -EINVAL;
+
+ /*
+ * Because guest_memfd doesn't support page migration with
+ * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
+ * migration. Until guest_memfd supports page migration, prevent page
+ * migration.
+ * TODO: Once guest_memfd introduces callback on page migration,
+ * implement it and remove get_page/put_page().
+ */
+ get_page(page);
+
+ /*
+ * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
+ * barrier in tdx_td_finalize().
+ */
+ smp_rmb();
+ if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
+ return tdx_mem_page_aug(kvm, gfn, level, page);
+
+ return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
+}
+
+static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, struct page *page)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u64 err, entry, level_state;
+
+ /* TODO: handle large pages. */
+ if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+ return -EINVAL;
+
+ if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
+ return -EINVAL;
+
+ /*
+ * When zapping private page, write lock is held. So no race condition
+ * with other vcpu sept operation.
+ * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
+ */
+ err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
+ &level_state);
+
+ if (unlikely(tdx_operand_busy(err))) {
+ /*
+ * The second retry is expected to succeed after kicking off all
+ * other vCPUs and prevent them from invoking TDH.VP.ENTER.
+ */
+ tdx_no_vcpus_enter_start(kvm);
+ err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
+ &level_state);
+ tdx_no_vcpus_enter_stop(kvm);
+ }
+
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
+ return -EIO;
+ }
+
+ err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+ return -EIO;
+ }
+ tdx_clear_page(page);
+ tdx_unpin(kvm, page);
+ return 0;
+}
+
+int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ gpa_t gpa = gfn_to_gpa(gfn);
+ struct page *page = virt_to_page(private_spt);
+ u64 err, entry, level_state;
+
+ err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
+ &level_state);
+ if (unlikely(tdx_operand_busy(err)))
+ return -EBUSY;
+
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/*
+ * Check if the error returned from a SEPT zap SEAMCALL is due to that a page is
+ * mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() being called
+ * successfully.
+ *
+ * Since tdh_mem_sept_add() must have been invoked successfully before a
+ * non-leaf entry present in the mirrored page table, the SEPT ZAP related
+ * SEAMCALLs should not encounter err TDX_EPT_WALK_FAILED. They should instead
+ * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
+ * SEPT.
+ *
+ * Further check if the returned entry from SEPT walking is with RWX permissions
+ * to filter out anything unexpected.
+ *
+ * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
+ * level_state returned from a SEAMCALL error is the same as that passed into
+ * the SEAMCALL.
+ */
+static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
+ u64 entry, int level)
+{
+ if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return false;
+
+ if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
+ return false;
+
+ if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
+ return false;
+
+ return true;
+}
+
+static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, struct page *page)
+{
+ int tdx_level = pg_level_to_tdx_sept_level(level);
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
+ u64 err, entry, level_state;
+
+ /* For now large page isn't supported yet. */
+ WARN_ON_ONCE(level != PG_LEVEL_4K);
+
+ err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
+
+ if (unlikely(tdx_operand_busy(err))) {
+ /* After no vCPUs enter, the second retry is expected to succeed */
+ tdx_no_vcpus_enter_start(kvm);
+ err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
+ tdx_no_vcpus_enter_stop(kvm);
+ }
+ if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
+ !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
+ atomic64_dec(&kvm_tdx->nr_premapped);
+ tdx_unpin(kvm, page);
+ return 0;
+ }
+
+ if (KVM_BUG_ON(err, kvm)) {
+ pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
+ return -EIO;
+ }
+ return 1;
+}
+
+/*
+ * Ensure shared and private EPTs to be flushed on all vCPUs.
+ * tdh_mem_track() is the only caller that increases TD epoch. An increase in
+ * the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
+ * running in guest mode with the value "N - 1".
+ *
+ * A successful execution of tdh_mem_track() ensures that vCPUs can only run in
+ * guest mode with TD epoch value "N" if no TD exit occurs after the TD epoch
+ * being increased to "N + 1".
+ *
+ * Kicking off all vCPUs after that further results in no vCPUs can run in guest
+ * mode with TD epoch value "N", which unblocks the next tdh_mem_track() (e.g.
+ * to increase TD epoch to "N + 2").
+ *
+ * TDX module will flush EPT on the next TD enter and make vCPUs to run in
+ * guest mode with TD epoch value "N + 1".
+ *
+ * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
+ * waiting empty IPI handler ack_kick().
+ *
+ * No action is required to the vCPUs being kicked off since the kicking off
+ * occurs certainly after TD epoch increment and before the next
+ * tdh_mem_track().
+ */
+static void tdx_track(struct kvm *kvm)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ u64 err;
+
+ /* If TD isn't finalized, it's before any vcpu running. */
+ if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
+ return;
+
+ lockdep_assert_held_write(&kvm->mmu_lock);
+
+ err = tdh_mem_track(&kvm_tdx->td);
+ if (unlikely(tdx_operand_busy(err))) {
+ /* After no vCPUs enter, the second retry is expected to succeed */
+ tdx_no_vcpus_enter_start(kvm);
+ err = tdh_mem_track(&kvm_tdx->td);
+ tdx_no_vcpus_enter_stop(kvm);
+ }
+
+ if (KVM_BUG_ON(err, kvm))
+ pr_tdx_error(TDH_MEM_TRACK, err);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+}
+
+int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ /*
+ * free_external_spt() is only called after hkid is freed when TD is
+ * tearing down.
+ * KVM doesn't (yet) zap page table pages in mirror page table while
+ * TD is active, though guest pages mapped in mirror page table could be
+ * zapped during TD is active, e.g. for shared <-> private conversion
+ * and slot move/deletion.
+ */
+ if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
+ return -EINVAL;
+
+ /*
+ * The HKID assigned to this TD was already freed and cache was
+ * already flushed. We don't have to flush again.
+ */
+ return tdx_reclaim_page(virt_to_page(private_spt));
+}
+
+int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn)
+{
+ struct page *page = pfn_to_page(pfn);
+ int ret;
+
+ /*
+ * HKID is released after all private pages have been removed, and set
+ * before any might be populated. Warn if zapping is attempted when
+ * there can't be anything populated in the private EPT.
+ */
+ if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
+ return -EINVAL;
+
+ ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
+ if (ret <= 0)
+ return ret;
+
+ /*
+ * TDX requires TLB tracking before dropping private page. Do
+ * it here, although it is also done later.
+ */
+ tdx_track(kvm);
+
+ return tdx_sept_drop_private_spte(kvm, gfn, level, page);
+}
+
+void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ /* TDX supports only posted interrupt. No lapic emulation. */
+ __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
+
+ trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
+}
+
+static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
+{
+ u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
+ u64 eq = vmx_get_exit_qual(vcpu);
+
+ if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
+ return false;
+
+ return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
+}
+
+static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qual;
+ gpa_t gpa = to_tdx(vcpu)->exit_gpa;
+ bool local_retry = false;
+ int ret;
+
+ if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
+ if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
+ pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
+ gpa, vcpu->vcpu_id);
+ kvm_vm_dead(vcpu->kvm);
+ return -EIO;
+ }
+ /*
+ * Always treat SEPT violations as write faults. Ignore the
+ * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
+ * TD private pages are always RWX in the SEPT tables,
+ * i.e. they're always mapped writable. Just as importantly,
+ * treating SEPT violations as write faults is necessary to
+ * avoid COW allocations, which will cause TDAUGPAGE failures
+ * due to aliasing a single HPA to multiple GPAs.
+ */
+ exit_qual = EPT_VIOLATION_ACC_WRITE;
+
+ /* Only private GPA triggers zero-step mitigation */
+ local_retry = true;
+ } else {
+ exit_qual = vmx_get_exit_qual(vcpu);
+ /*
+ * EPT violation due to instruction fetch should never be
+ * triggered from shared memory in TDX guest. If such EPT
+ * violation occurs, treat it as broken hardware.
+ */
+ if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
+ return -EIO;
+ }
+
+ trace_kvm_page_fault(vcpu, gpa, exit_qual);
+
+ /*
+ * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
+ * mapping in TDX.
+ *
+ * KVM may return RET_PF_RETRY for private GPA due to
+ * - contentions when atomically updating SPTEs of the mirror page table
+ * - in-progress GFN invalidation or memslot removal.
+ * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
+ * caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
+ * or certain TDCALLs.
+ *
+ * If TDH.VP.ENTER is invoked more times than the threshold set by the
+ * TDX module before KVM resolves the private GPA mapping, the TDX
+ * module will activate zero-step mitigation during TDH.VP.ENTER. This
+ * process acquires an SEPT tree lock in the TDX module, leading to
+ * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
+ * operations on other vCPUs.
+ *
+ * Breaking out of local retries for kvm_vcpu_has_events() is for
+ * interrupt injection. kvm_vcpu_has_events() should not see pending
+ * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
+ * blocked by TDs, false positives are inevitable i.e., KVM may re-enter
+ * the guest even if the IRQ/NMI can't be delivered.
+ *
+ * Note: even without breaking out of local retries, zero-step
+ * mitigation may still occur due to
+ * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
+ * - a single RIP causing EPT violations for more GFNs than the
+ * threshold count.
+ * This is safe, as triggering zero-step mitigation only introduces
+ * contentions to page installation SEAMCALLs on other vCPUs, which will
+ * handle retries locally in their EPT violation handlers.
+ */
+ while (1) {
+ ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
+
+ if (ret != RET_PF_RETRY || !local_retry)
+ break;
+
+ if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
+ break;
+
+ if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
+ ret = -EIO;
+ break;
+ }
+
+ cond_resched();
+ }
+ return ret;
+}
+
+int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+{
+ if (err) {
+ tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+ return 1;
+ }
+
+ if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
+ tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
+
+ return 1;
+}
+
+
+int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ u64 vp_enter_ret = tdx->vp_enter_ret;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
+
+ if (fastpath != EXIT_FASTPATH_NONE)
+ return 1;
+
+ if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
+ KVM_BUG_ON(1, vcpu->kvm);
+ return -EIO;
+ }
+
+ /*
+ * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
+ * TDX_SEAMCALL_VMFAILINVALID.
+ */
+ if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
+ KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
+ goto unhandled_exit;
+ }
+
+ if (unlikely(tdx_failed_vmentry(vcpu))) {
+ /*
+ * If the guest state is protected, that means off-TD debug is
+ * not enabled, TDX_NON_RECOVERABLE must be set.
+ */
+ WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
+ !(vp_enter_ret & TDX_NON_RECOVERABLE));
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
+ return 0;
+ }
+
+ if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
+ exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
+ kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
+ goto unhandled_exit;
+ }
+
+ WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
+ (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
+
+ switch (exit_reason.basic) {
+ case EXIT_REASON_TRIPLE_FAULT:
+ vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+ vcpu->mmio_needed = 0;
+ return 0;
+ case EXIT_REASON_EXCEPTION_NMI:
+ return tdx_handle_exception_nmi(vcpu);
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ ++vcpu->stat.irq_exits;
+ return 1;
+ case EXIT_REASON_CPUID:
+ return tdx_emulate_cpuid(vcpu);
+ case EXIT_REASON_HLT:
+ return kvm_emulate_halt_noskip(vcpu);
+ case EXIT_REASON_TDCALL:
+ return handle_tdvmcall(vcpu);
+ case EXIT_REASON_VMCALL:
+ return tdx_emulate_vmcall(vcpu);
+ case EXIT_REASON_IO_INSTRUCTION:
+ return tdx_emulate_io(vcpu);
+ case EXIT_REASON_MSR_READ:
+ kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
+ return kvm_emulate_rdmsr(vcpu);
+ case EXIT_REASON_MSR_WRITE:
+ kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
+ kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
+ kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
+ return kvm_emulate_wrmsr(vcpu);
+ case EXIT_REASON_EPT_MISCONFIG:
+ return tdx_emulate_mmio(vcpu);
+ case EXIT_REASON_EPT_VIOLATION:
+ return tdx_handle_ept_violation(vcpu);
+ case EXIT_REASON_OTHER_SMI:
+ /*
+ * Unlike VMX, SMI in SEAM non-root mode (i.e. when
+ * TD guest vCPU is running) will cause VM exit to TDX module,
+ * then SEAMRET to KVM. Once it exits to KVM, SMI is delivered
+ * and handled by kernel handler right away.
+ *
+ * The Other SMI exit can also be caused by the SEAM non-root
+ * machine check delivered via Machine Check System Management
+ * Interrupt (MSMI), but it has already been handled by the
+ * kernel machine check handler, i.e., the memory page has been
+ * marked as poisoned and it won't be freed to the free list
+ * when the TDX guest is terminated (the TDX module marks the
+ * guest as dead and prevent it from further running when
+ * machine check happens in SEAM non-root).
+ *
+ * - A MSMI will not reach here, it's handled as non_recoverable
+ * case above.
+ * - If it's not an MSMI, no need to do anything here.
+ */
+ return 1;
+ default:
+ break;
+ }
+
+unhandled_exit:
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = vp_enter_ret;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ return 0;
+}
+
+void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+ *reason = tdx->vt.exit_reason.full;
+ if (*reason != -1u) {
+ *info1 = vmx_get_exit_qual(vcpu);
+ *info2 = tdx->ext_exit_qualification;
+ *intr_info = vmx_get_intr_info(vcpu);
+ } else {
+ *info1 = 0;
+ *info2 = 0;
+ *intr_info = 0;
+ }
+
+ *error_code = 0;
+}
+
+bool tdx_has_emulated_msr(u32 index)
+{
+ switch (index) {
+ case MSR_IA32_UCODE_REV:
+ case MSR_IA32_ARCH_CAPABILITIES:
+ case MSR_IA32_POWER_CTL:
+ case MSR_IA32_CR_PAT:
+ case MSR_MTRRcap:
+ case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ case MSR_IA32_TSC_DEADLINE:
+ case MSR_IA32_MISC_ENABLE:
+ case MSR_PLATFORM_INFO:
+ case MSR_MISC_FEATURES_ENABLES:
+ case MSR_IA32_APICBASE:
+ case MSR_EFER:
+ case MSR_IA32_FEAT_CTL:
+ case MSR_IA32_MCG_CAP:
+ case MSR_IA32_MCG_STATUS:
+ case MSR_IA32_MCG_CTL:
+ case MSR_IA32_MCG_EXT_CTL:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+ case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+ /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
+ case MSR_KVM_POLL_CONTROL:
+ return true;
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+ /*
+ * x2APIC registers that are virtualized by the CPU can't be
+ * emulated, KVM doesn't have access to the virtual APIC page.
+ */
+ switch (index) {
+ case X2APIC_MSR(APIC_TASKPRI):
+ case X2APIC_MSR(APIC_PROCPRI):
+ case X2APIC_MSR(APIC_EOI):
+ case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
+ case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
+ case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
+ return false;
+ default:
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+static bool tdx_is_read_only_msr(u32 index)
+{
+ return index == MSR_IA32_APICBASE || index == MSR_EFER ||
+ index == MSR_IA32_FEAT_CTL;
+}
+
+int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ switch (msr->index) {
+ case MSR_IA32_FEAT_CTL:
+ /*
+ * MCE and MCA are advertised via cpuid. Guest kernel could
+ * check if LMCE is enabled or not.
+ */
+ msr->data = FEAT_CTL_LOCKED;
+ if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+ msr->data |= FEAT_CTL_LMCE_ENABLED;
+ return 0;
+ case MSR_IA32_MCG_EXT_CTL:
+ if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
+ return 1;
+ msr->data = vcpu->arch.mcg_ext_ctl;
+ return 0;
+ default:
+ if (!tdx_has_emulated_msr(msr->index))
+ return 1;
+
+ return kvm_get_msr_common(vcpu, msr);
+ }
+}
+
+int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+{
+ switch (msr->index) {
+ case MSR_IA32_MCG_EXT_CTL:
+ if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
+ (msr->data & ~MCG_EXT_CTL_LMCE_EN))
+ return 1;
+ vcpu->arch.mcg_ext_ctl = msr->data;
+ return 0;
+ default:
+ if (tdx_is_read_only_msr(msr->index))
+ return 1;
+
+ if (!tdx_has_emulated_msr(msr->index))
+ return 1;
+
+ return kvm_set_msr_common(vcpu, msr);
+ }
+}
+
+static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
+{
+ const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+ struct kvm_tdx_capabilities __user *user_caps;
+ struct kvm_tdx_capabilities *caps = NULL;
+ int ret = 0;
+
+ /* flags is reserved for future use */
+ if (cmd->flags)
+ return -EINVAL;
+
+ caps = kmalloc(sizeof(*caps) +
+ sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
+ GFP_KERNEL);
+ if (!caps)
+ return -ENOMEM;
+
+ user_caps = u64_to_user_ptr(cmd->data);
+ if (copy_from_user(caps, user_caps, sizeof(*caps))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (caps->cpuid.nent < td_conf->num_cpuid_config) {
+ ret = -E2BIG;
+ goto out;
+ }
+
+ ret = init_kvm_tdx_caps(td_conf, caps);
+ if (ret)
+ goto out;
+
+ if (copy_to_user(user_caps, caps, sizeof(*caps))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
+ caps->cpuid.nent *
+ sizeof(caps->cpuid.entries[0])))
+ ret = -EFAULT;
+
+out:
+ /* kfree() accepts NULL. */
+ kfree(caps);
+ return ret;
+}
+
+/*
+ * KVM reports guest physical address in CPUID.0x800000008.EAX[23:16], which is
+ * similar to TDX's GPAW. Use this field as the interface for userspace to
+ * configure the GPAW and EPT level for TDs.
+ *
+ * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
+ * 5, Value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
+ * supported. Value 52 is only supported when the platform supports 5 level
+ * EPT.
+ */
+static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
+ struct td_params *td_params)
+{
+ const struct kvm_cpuid_entry2 *entry;
+ int guest_pa;
+
+ entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
+ if (!entry)
+ return -EINVAL;
+
+ guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
+
+ if (guest_pa != 48 && guest_pa != 52)
+ return -EINVAL;
+
+ if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
+ return -EINVAL;
+
+ td_params->eptp_controls = VMX_EPTP_MT_WB;
+ if (guest_pa == 52) {
+ td_params->eptp_controls |= VMX_EPTP_PWL_5;
+ td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
+ } else {
+ td_params->eptp_controls |= VMX_EPTP_PWL_4;
+ }
+
+ return 0;
+}
+
+static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
+ struct td_params *td_params)
+{
+ const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+ const struct kvm_cpuid_entry2 *entry;
+ struct tdx_cpuid_value *value;
+ int i, copy_cnt = 0;
+
+ /*
+ * td_params.cpuid_values: The number and the order of cpuid_value must
+ * be same to the one of struct tdsysinfo.{num_cpuid_config, cpuid_configs}
+ * It's assumed that td_params was zeroed.
+ */
+ for (i = 0; i < td_conf->num_cpuid_config; i++) {
+ struct kvm_cpuid_entry2 tmp;
+
+ td_init_cpuid_entry2(&tmp, i);
+
+ entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
+ tmp.function, tmp.index);
+ if (!entry)
+ continue;
+
+ if (tdx_unsupported_cpuid(entry))
+ return -EINVAL;
+
+ copy_cnt++;
+
+ value = &td_params->cpuid_values[i];
+ value->eax = entry->eax;
+ value->ebx = entry->ebx;
+ value->ecx = entry->ecx;
+ value->edx = entry->edx;
+
+ /*
+ * TDX module does not accept nonzero bits 16..23 for the
+ * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
+ */
+ if (tmp.function == 0x80000008)
+ value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
+ }
+
+ /*
+ * Rely on the TDX module to reject invalid configuration, but it can't
+ * check of leafs that don't have a proper slot in td_params->cpuid_values
+ * to stick then. So fail if there were entries that didn't get copied to
+ * td_params.
+ */
+ if (copy_cnt != cpuid->nent)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
+ struct kvm_tdx_init_vm *init_vm)
+{
+ const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+ struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
+ int ret;
+
+ if (kvm->created_vcpus)
+ return -EBUSY;
+
+ if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
+ return -EINVAL;
+
+ if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
+ return -EINVAL;
+
+ td_params->max_vcpus = kvm->max_vcpus;
+ td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
+ td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
+
+ td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
+ td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
+
+ ret = setup_tdparams_eptp_controls(cpuid, td_params);
+ if (ret)
+ return ret;
+
+ ret = setup_tdparams_cpuids(cpuid, td_params);
+ if (ret)
+ return ret;
+
+#define MEMCPY_SAME_SIZE(dst, src) \
+ do { \
+ BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
+ memcpy((dst), (src), sizeof(dst)); \
+ } while (0)
+
+ MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
+ MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
+ MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
+
+ return 0;
+}
+
+static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
+ u64 *seamcall_err)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ cpumask_var_t packages;
+ struct page **tdcs_pages = NULL;
+ struct page *tdr_page;
+ int ret, i;
+ u64 err, rcx;
+
+ *seamcall_err = 0;
+ ret = tdx_guest_keyid_alloc();
+ if (ret < 0)
+ return ret;
+ kvm_tdx->hkid = ret;
+ kvm_tdx->misc_cg = get_current_misc_cg();
+ ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
+ if (ret)
+ goto free_hkid;
+
+ ret = -ENOMEM;
+
+ atomic_inc(&nr_configured_hkid);
+
+ tdr_page = alloc_page(GFP_KERNEL);
+ if (!tdr_page)
+ goto free_hkid;
+
+ kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
+ /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
+ kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
+ tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!tdcs_pages)
+ goto free_tdr;
+
+ for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ tdcs_pages[i] = alloc_page(GFP_KERNEL);
+ if (!tdcs_pages[i])
+ goto free_tdcs;
+ }
+
+ if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
+ goto free_tdcs;
+
+ cpus_read_lock();
+
+ /*
+ * Need at least one CPU of the package to be online in order to
+ * program all packages for host key id. Check it.
+ */
+ for_each_present_cpu(i)
+ cpumask_set_cpu(topology_physical_package_id(i), packages);
+ for_each_online_cpu(i)
+ cpumask_clear_cpu(topology_physical_package_id(i), packages);
+ if (!cpumask_empty(packages)) {
+ ret = -EIO;
+ /*
+ * Because it's hard for human operator to figure out the
+ * reason, warn it.
+ */
+#define MSG_ALLPKG "All packages need to have online CPU to create TD. Online CPU and retry.\n"
+ pr_warn_ratelimited(MSG_ALLPKG);
+ goto free_packages;
+ }
+
+ /*
+ * TDH.MNG.CREATE tries to grab the global TDX module and fails
+ * with TDX_OPERAND_BUSY when it fails to grab. Take the global
+ * lock to prevent it from failure.
+ */
+ mutex_lock(&tdx_lock);
+ kvm_tdx->td.tdr_page = tdr_page;
+ err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
+ mutex_unlock(&tdx_lock);
+
+ if (err == TDX_RND_NO_ENTROPY) {
+ ret = -EAGAIN;
+ goto free_packages;
+ }
+
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_CREATE, err);
+ ret = -EIO;
+ goto free_packages;
+ }
+
+ for_each_online_cpu(i) {
+ int pkg = topology_physical_package_id(i);
+
+ if (cpumask_test_and_set_cpu(pkg, packages))
+ continue;
+
+ /*
+ * Program the memory controller in the package with an
+ * encryption key associated to a TDX private host key id
+ * assigned to this TDR. Concurrent operations on same memory
+ * controller results in TDX_OPERAND_BUSY. No locking needed
+ * beyond the cpus_read_lock() above as it serializes against
+ * hotplug and the first online CPU of the package is always
+ * used. We never have two CPUs in the same socket trying to
+ * program the key.
+ */
+ ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+ kvm_tdx, true);
+ if (ret)
+ break;
+ }
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+ if (ret) {
+ i = 0;
+ goto teardown;
+ }
+
+ kvm_tdx->td.tdcs_pages = tdcs_pages;
+ for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
+ if (err == TDX_RND_NO_ENTROPY) {
+ /* Here it's hard to allow userspace to retry. */
+ ret = -EAGAIN;
+ goto teardown;
+ }
+ if (WARN_ON_ONCE(err)) {
+ pr_tdx_error(TDH_MNG_ADDCX, err);
+ ret = -EIO;
+ goto teardown;
+ }
+ }
+
+ err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
+ if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
+ /*
+ * Because a user gives operands, don't warn.
+ * Return a hint to the user because it's sometimes hard for the
+ * user to figure out which operand is invalid. SEAMCALL status
+ * code includes which operand caused invalid operand error.
+ */
+ *seamcall_err = err;
+ ret = -EINVAL;
+ goto teardown;
+ } else if (WARN_ON_ONCE(err)) {
+ pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
+ ret = -EIO;
+ goto teardown;
+ }
+
+ return 0;
+
+ /*
+ * The sequence for freeing resources from a partially initialized TD
+ * varies based on where in the initialization flow failure occurred.
+ * Simply use the full teardown and destroy, which naturally play nice
+ * with partial initialization.
+ */
+teardown:
+ /* Only free pages not yet added, so start at 'i' */
+ for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ if (tdcs_pages[i]) {
+ __free_page(tdcs_pages[i]);
+ tdcs_pages[i] = NULL;
+ }
+ }
+ if (!kvm_tdx->td.tdcs_pages)
+ kfree(tdcs_pages);
+
+ tdx_mmu_release_hkid(kvm);
+ tdx_reclaim_td_control_pages(kvm);
+
+ return ret;
+
+free_packages:
+ cpus_read_unlock();
+ free_cpumask_var(packages);
+
+free_tdcs:
+ for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+ if (tdcs_pages[i])
+ __free_page(tdcs_pages[i]);
+ }
+ kfree(tdcs_pages);
+ kvm_tdx->td.tdcs_pages = NULL;
+
+free_tdr:
+ if (tdr_page)
+ __free_page(tdr_page);
+ kvm_tdx->td.tdr_page = 0;
+
+free_hkid:
+ tdx_hkid_free(kvm_tdx);
+
+ return ret;
+}
+
+static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
+ u64 *data)
+{
+ u64 err;
+
+ err = tdh_mng_rd(&tdx->td, field_id, data);
+
+ return err;
+}
+
+#define TDX_MD_UNREADABLE_LEAF_MASK GENMASK(30, 7)
+#define TDX_MD_UNREADABLE_SUBLEAF_MASK GENMASK(31, 7)
+
+static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
+ bool sub_leaf_set, int *entry_index,
+ struct kvm_cpuid_entry2 *out)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
+ u64 ebx_eax, edx_ecx;
+ u64 err = 0;
+
+ if (sub_leaf > 0b1111111)
+ return -EINVAL;
+
+ if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
+ return -EINVAL;
+
+ if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
+ sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
+ return -EINVAL;
+
+ /*
+ * bit 23:17, REVSERVED: reserved, must be 0;
+ * bit 16, LEAF_31: leaf number bit 31;
+ * bit 15:9, LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
+ * implicitly 0;
+ * bit 8, SUBLEAF_NA: sub-leaf not applicable flag;
+ * bit 7:1, SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
+ * the SUBLEAF_6_0 is all-1.
+ * sub-leaf bits 31:7 are implicitly 0;
+ * bit 0, ELEMENT_I: Element index within field;
+ */
+ field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
+ field_id |= (leaf & 0x7f) << 9;
+ if (sub_leaf_set)
+ field_id |= (sub_leaf & 0x7f) << 1;
+ else
+ field_id |= 0x1fe;
+
+ err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
+ if (err) //TODO check for specific errors
+ goto err_out;
+
+ out->eax = (u32) ebx_eax;
+ out->ebx = (u32) (ebx_eax >> 32);
+
+ field_id++;
+ err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
+ /*
+ * It's weird that reading edx_ecx fails while reading ebx_eax
+ * succeeded.
+ */
+ if (WARN_ON_ONCE(err))
+ goto err_out;
+
+ out->ecx = (u32) edx_ecx;
+ out->edx = (u32) (edx_ecx >> 32);
+
+ out->function = leaf;
+ out->index = sub_leaf;
+ out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
+
+ /*
+ * Work around missing support on old TDX modules, fetch
+ * guest maxpa from gfn_direct_bits.
+ */
+ if (leaf == 0x80000008) {
+ gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
+ unsigned int g_maxpa = __ffs(gpa_bits) + 1;
+
+ out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
+ }
+
+ (*entry_index)++;
+
+ return 0;
+
+err_out:
+ out->eax = 0;
+ out->ebx = 0;
+ out->ecx = 0;
+ out->edx = 0;
+
+ return -EIO;
+}
+
+static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct kvm_tdx_init_vm *init_vm;
+ struct td_params *td_params = NULL;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
+ BUILD_BUG_ON(sizeof(struct td_params) != 1024);
+
+ if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
+ return -EINVAL;
+
+ if (cmd->flags)
+ return -EINVAL;
+
+ init_vm = kmalloc(sizeof(*init_vm) +
+ sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
+ GFP_KERNEL);
+ if (!init_vm)
+ return -ENOMEM;
+
+ if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
+ ret = -E2BIG;
+ goto out;
+ }
+
+ if (copy_from_user(init_vm->cpuid.entries,
+ u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
+ flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (init_vm->cpuid.padding) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
+ if (!td_params) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = setup_tdparams(kvm, td_params, init_vm);
+ if (ret)
+ goto out;
+
+ ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
+ if (ret)
+ goto out;
+
+ kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
+ kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
+ kvm_tdx->attributes = td_params->attributes;
+ kvm_tdx->xfam = td_params->xfam;
+
+ if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
+ kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
+ else
+ kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
+
+ kvm_tdx->state = TD_STATE_INITIALIZED;
+out:
+ /* kfree() accepts NULL. */
+ kfree(init_vm);
+ kfree(td_params);
+
+ return ret;
+}
+
+void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ /*
+ * flush_tlb_current() is invoked when the first time for the vcpu to
+ * run or when root of shared EPT is invalidated.
+ * KVM only needs to flush shared EPT because the TDX module handles TLB
+ * invalidation for private EPT in tdh_vp_enter();
+ *
+ * A single context invalidation for shared EPT can be performed here.
+ * However, this single context invalidation requires the private EPTP
+ * rather than the shared EPTP to flush shared EPT, as shared EPT uses
+ * private EPTP as its ASID for TLB invalidation.
+ *
+ * To avoid reading back private EPTP, perform a global invalidation for
+ * shared EPT instead to keep this function simple.
+ */
+ ept_sync_global();
+}
+
+void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ /*
+ * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
+ * ensure that private EPT will be flushed on the next TD enter. No need
+ * to call tdx_track() here again even when this callback is a result of
+ * zapping private EPT.
+ *
+ * Due to the lack of the context to determine which EPT has been
+ * affected by zapping, invoke invept() directly here for both shared
+ * EPT and private EPT for simplicity, though it's not necessary for
+ * private EPT.
+ */
+ ept_sync_global();
+}
+
+static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+
+ guard(mutex)(&kvm->slots_lock);
+
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+ /*
+ * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
+ * TDH.MEM.PAGE.ADD().
+ */
+ if (atomic64_read(&kvm_tdx->nr_premapped))
+ return -EINVAL;
+
+ cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
+ if (tdx_operand_busy(cmd->hw_error))
+ return -EBUSY;
+ if (KVM_BUG_ON(cmd->hw_error, kvm)) {
+ pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
+ return -EIO;
+ }
+
+ kvm_tdx->state = TD_STATE_RUNNABLE;
+ /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
+ smp_wmb();
+ kvm->arch.pre_fault_allowed = true;
+ return 0;
+}
+
+int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_tdx_cmd tdx_cmd;
+ int r;
+
+ if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
+ return -EFAULT;
+
+ /*
+ * Userspace should never set hw_error. It is used to fill
+ * hardware-defined error by the kernel.
+ */
+ if (tdx_cmd.hw_error)
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ switch (tdx_cmd.id) {
+ case KVM_TDX_CAPABILITIES:
+ r = tdx_get_capabilities(&tdx_cmd);
+ break;
+ case KVM_TDX_INIT_VM:
+ r = tdx_td_init(kvm, &tdx_cmd);
+ break;
+ case KVM_TDX_FINALIZE_VM:
+ r = tdx_td_finalize(kvm, &tdx_cmd);
+ break;
+ default:
+ r = -EINVAL;
+ goto out;
+ }
+
+ if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
+ r = -EFAULT;
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
+static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct page *page;
+ int ret, i;
+ u64 err;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ tdx->vp.tdvpr_page = page;
+
+ tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
+ GFP_KERNEL);
+ if (!tdx->vp.tdcx_pages) {
+ ret = -ENOMEM;
+ goto free_tdvpr;
+ }
+
+ for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ ret = -ENOMEM;
+ goto free_tdcx;
+ }
+ tdx->vp.tdcx_pages[i] = page;
+ }
+
+ err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
+ if (KVM_BUG_ON(err, vcpu->kvm)) {
+ ret = -EIO;
+ pr_tdx_error(TDH_VP_CREATE, err);
+ goto free_tdcx;
+ }
+
+ for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
+ if (KVM_BUG_ON(err, vcpu->kvm)) {
+ pr_tdx_error(TDH_VP_ADDCX, err);
+ /*
+ * Pages already added are reclaimed by the vcpu_free
+ * method, but the rest are freed here.
+ */
+ for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ __free_page(tdx->vp.tdcx_pages[i]);
+ tdx->vp.tdcx_pages[i] = NULL;
+ }
+ return -EIO;
+ }
+ }
+
+ err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
+ if (KVM_BUG_ON(err, vcpu->kvm)) {
+ pr_tdx_error(TDH_VP_INIT, err);
+ return -EIO;
+ }
+
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+
+ return 0;
+
+free_tdcx:
+ for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+ if (tdx->vp.tdcx_pages[i])
+ __free_page(tdx->vp.tdcx_pages[i]);
+ tdx->vp.tdcx_pages[i] = NULL;
+ }
+ kfree(tdx->vp.tdcx_pages);
+ tdx->vp.tdcx_pages = NULL;
+
+free_tdvpr:
+ if (tdx->vp.tdvpr_page)
+ __free_page(tdx->vp.tdvpr_page);
+ tdx->vp.tdvpr_page = 0;
+
+ return ret;
+}
+
+/* Sometimes reads multipple subleafs. Return how many enties were written. */
+static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
+ struct kvm_cpuid_entry2 *output_e)
+{
+ int sub_leaf = 0;
+ int ret;
+
+ /* First try without a subleaf */
+ ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
+
+ /* If success, or invalid leaf, just give up */
+ if (ret != -EIO)
+ return ret;
+
+ /*
+ * If the try without a subleaf failed, try reading subleafs until
+ * failure. The TDX module only supports 6 bits of subleaf index.
+ */
+ while (1) {
+ /* Keep reading subleafs until there is a failure. */
+ if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
+ return !sub_leaf;
+
+ sub_leaf++;
+ output_e++;
+ }
+
+ return 0;
+}
+
+static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+ struct kvm_cpuid2 __user *output, *td_cpuid;
+ int r = 0, i = 0, leaf;
+ u32 level;
+
+ output = u64_to_user_ptr(cmd->data);
+ td_cpuid = kzalloc(sizeof(*td_cpuid) +
+ sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
+ GFP_KERNEL);
+ if (!td_cpuid)
+ return -ENOMEM;
+
+ if (copy_from_user(td_cpuid, output, sizeof(*output))) {
+ r = -EFAULT;
+ goto out;
+ }
+
+ /* Read max CPUID for normal range */
+ if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
+ r = -EIO;
+ goto out;
+ }
+ level = td_cpuid->entries[0].eax;
+
+ for (leaf = 1; leaf <= level; leaf++)
+ tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
+
+ /* Read max CPUID for extended range */
+ if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
+ r = -EIO;
+ goto out;
+ }
+ level = td_cpuid->entries[i - 1].eax;
+
+ for (leaf = 0x80000001; leaf <= level; leaf++)
+ tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
+
+ if (td_cpuid->nent < i)
+ r = -E2BIG;
+ td_cpuid->nent = i;
+
+ if (copy_to_user(output, td_cpuid, sizeof(*output))) {
+ r = -EFAULT;
+ goto out;
+ }
+
+ if (r == -E2BIG)
+ goto out;
+
+ if (copy_to_user(output->entries, td_cpuid->entries,
+ td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+ r = -EFAULT;
+
+out:
+ kfree(td_cpuid);
+
+ return r;
+}
+
+static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+ u64 apic_base;
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ int ret;
+
+ if (cmd->flags)
+ return -EINVAL;
+
+ if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
+ return -EINVAL;
+
+ /*
+ * TDX requires X2APIC, userspace is responsible for configuring guest
+ * CPUID accordingly.
+ */
+ apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
+ (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
+ if (kvm_apic_set_base(vcpu, apic_base, true))
+ return -EINVAL;
+
+ ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
+ if (ret)
+ return ret;
+
+ td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
+ td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
+
+ tdx->state = VCPU_TD_STATE_INITIALIZED;
+
+ return 0;
+}
+
+void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ /*
+ * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
+ * INIT events.
+ *
+ * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
+ * userspace needs to define the vCPU model before KVM can initialize
+ * vCPU state, e.g. to enable x2APIC.
+ */
+ WARN_ON_ONCE(init_event);
+}
+
+struct tdx_gmem_post_populate_arg {
+ struct kvm_vcpu *vcpu;
+ __u32 flags;
+};
+
+static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+ void __user *src, int order, void *_arg)
+{
+ u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct tdx_gmem_post_populate_arg *arg = _arg;
+ struct kvm_vcpu *vcpu = arg->vcpu;
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u8 level = PG_LEVEL_4K;
+ struct page *src_page;
+ int ret, i;
+ u64 err, entry, level_state;
+
+ /*
+ * Get the source page if it has been faulted in. Return failure if the
+ * source page has been swapped out or unmapped in primary memory.
+ */
+ ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
+ if (ret < 0)
+ return ret;
+ if (ret != 1)
+ return -ENOMEM;
+
+ ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * The private mem cannot be zapped after kvm_tdp_map_page()
+ * because all paths are covered by slots_lock and the
+ * filemap invalidate lock. Check that they are indeed enough.
+ */
+ if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
+ scoped_guard(read_lock, &kvm->mmu_lock) {
+ if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
+ ret = -EIO;
+ goto out;
+ }
+ }
+ }
+
+ ret = 0;
+ err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
+ src_page, &entry, &level_state);
+ if (err) {
+ ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
+ goto out;
+ }
+
+ if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
+ atomic64_dec(&kvm_tdx->nr_premapped);
+
+ if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
+ for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
+ err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
+ &level_state);
+ if (err) {
+ ret = -EIO;
+ break;
+ }
+ }
+ }
+
+out:
+ put_page(src_page);
+ return ret;
+}
+
+static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct kvm_tdx_init_mem_region region;
+ struct tdx_gmem_post_populate_arg arg;
+ long gmem_ret;
+ int ret;
+
+ if (tdx->state != VCPU_TD_STATE_INITIALIZED)
+ return -EINVAL;
+
+ guard(mutex)(&kvm->slots_lock);
+
+ /* Once TD is finalized, the initial guest memory is fixed. */
+ if (kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+
+ if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
+ return -EINVAL;
+
+ if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
+ return -EFAULT;
+
+ if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
+ !region.nr_pages ||
+ region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
+ !vt_is_tdx_private_gpa(kvm, region.gpa) ||
+ !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
+ return -EINVAL;
+
+ kvm_mmu_reload(vcpu);
+ ret = 0;
+ while (region.nr_pages) {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ arg = (struct tdx_gmem_post_populate_arg) {
+ .vcpu = vcpu,
+ .flags = cmd->flags,
+ };
+ gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
+ u64_to_user_ptr(region.source_addr),
+ 1, tdx_gmem_post_populate, &arg);
+ if (gmem_ret < 0) {
+ ret = gmem_ret;
+ break;
+ }
+
+ if (gmem_ret != 1) {
+ ret = -EIO;
+ break;
+ }
+
+ region.source_addr += PAGE_SIZE;
+ region.gpa += PAGE_SIZE;
+ region.nr_pages--;
+
+ cond_resched();
+ }
+
+ if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
+ ret = -EFAULT;
+ return ret;
+}
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct kvm_tdx_cmd cmd;
+ int ret;
+
+ if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+ return -EINVAL;
+
+ if (copy_from_user(&cmd, argp, sizeof(cmd)))
+ return -EFAULT;
+
+ if (cmd.hw_error)
+ return -EINVAL;
+
+ switch (cmd.id) {
+ case KVM_TDX_INIT_VCPU:
+ ret = tdx_vcpu_init(vcpu, &cmd);
+ break;
+ case KVM_TDX_INIT_MEM_REGION:
+ ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
+ break;
+ case KVM_TDX_GET_CPUID:
+ ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+{
+ return PG_LEVEL_4K;
+}
+
+static int tdx_online_cpu(unsigned int cpu)
+{
+ unsigned long flags;
+ int r;
+
+ /* Sanity check CPU is already in post-VMXON */
+ WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
+
+ local_irq_save(flags);
+ r = tdx_cpu_enable();
+ local_irq_restore(flags);
+
+ return r;
+}
+
+static int tdx_offline_cpu(unsigned int cpu)
+{
+ int i;
+
+ /* No TD is running. Allow any cpu to be offline. */
+ if (!atomic_read(&nr_configured_hkid))
+ return 0;
+
+ /*
+ * In order to reclaim TDX HKID, (i.e. when deleting guest TD), need to
+ * call TDH.PHYMEM.PAGE.WBINVD on all packages to program all memory
+ * controller with pconfig. If we have active TDX HKID, refuse to
+ * offline the last online cpu.
+ */
+ for_each_online_cpu(i) {
+ /*
+ * Found another online cpu on the same package.
+ * Allow to offline.
+ */
+ if (i != cpu && topology_physical_package_id(i) ==
+ topology_physical_package_id(cpu))
+ return 0;
+ }
+
+ /*
+ * This is the last cpu of this package. Don't offline it.
+ *
+ * Because it's hard for human operator to understand the
+ * reason, warn it.
+ */
+#define MSG_ALLPKG_ONLINE \
+ "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
+ pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
+ return -EBUSY;
+}
+
+static void __do_tdx_cleanup(void)
+{
+ /*
+ * Once TDX module is initialized, it cannot be disabled and
+ * re-initialized again w/o runtime update (which isn't
+ * supported by kernel). Only need to remove the cpuhp here.
+ * The TDX host core code tracks TDX status and can handle
+ * 'multiple enabling' scenario.
+ */
+ WARN_ON_ONCE(!tdx_cpuhp_state);
+ cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
+ tdx_cpuhp_state = 0;
+}
+
+static void __tdx_cleanup(void)
+{
+ cpus_read_lock();
+ __do_tdx_cleanup();
+ cpus_read_unlock();
+}
+
+static int __init __do_tdx_bringup(void)
+{
+ int r;
+
+ /*
+ * TDX-specific cpuhp callback to call tdx_cpu_enable() on all
+ * online CPUs before calling tdx_enable(), and on any new
+ * going-online CPU to make sure it is ready for TDX guest.
+ */
+ r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
+ "kvm/cpu/tdx:online",
+ tdx_online_cpu, tdx_offline_cpu);
+ if (r < 0)
+ return r;
+
+ tdx_cpuhp_state = r;
+
+ r = tdx_enable();
+ if (r)
+ __do_tdx_cleanup();
+
+ return r;
+}
+
+static int __init __tdx_bringup(void)
+{
+ const struct tdx_sys_info_td_conf *td_conf;
+ int r, i;
+
+ for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
+ /*
+ * Check if MSRs (tdx_uret_msrs) can be saved/restored
+ * before returning to user space.
+ *
+ * this_cpu_ptr(user_return_msrs)->registered isn't checked
+ * because the registration is done at vcpu runtime by
+ * tdx_user_return_msr_update_cache().
+ */
+ tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
+ if (tdx_uret_msrs[i].slot == -1) {
+ /* If any MSR isn't supported, it is a KVM bug */
+ pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
+ tdx_uret_msrs[i].msr);
+ return -EIO;
+ }
+ }
+
+ /*
+ * Enabling TDX requires enabling hardware virtualization first,
+ * as making SEAMCALLs requires CPU being in post-VMXON state.
+ */
+ r = kvm_enable_virtualization();
+ if (r)
+ return r;
+
+ cpus_read_lock();
+ r = __do_tdx_bringup();
+ cpus_read_unlock();
+
+ if (r)
+ goto tdx_bringup_err;
+
+ /* Get TDX global information for later use */
+ tdx_sysinfo = tdx_get_sysinfo();
+ if (WARN_ON_ONCE(!tdx_sysinfo)) {
+ r = -EINVAL;
+ goto get_sysinfo_err;
+ }
+
+ /* Check TDX module and KVM capabilities */
+ if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
+ !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
+ goto get_sysinfo_err;
+
+ if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
+ goto get_sysinfo_err;
+
+ /*
+ * TDX has its own limit of maximum vCPUs it can support for all
+ * TDX guests in addition to KVM_MAX_VCPUS. Userspace needs to
+ * query TDX guest's maximum vCPUs by checking KVM_CAP_MAX_VCPU
+ * extension on per-VM basis.
+ *
+ * TDX module reports such limit via the MAX_VCPU_PER_TD global
+ * metadata. Different modules may report different values.
+ * Some old module may also not support this metadata (in which
+ * case this limit is U16_MAX).
+ *
+ * In practice, the reported value reflects the maximum logical
+ * CPUs that ALL the platforms that the module supports can
+ * possibly have.
+ *
+ * Simply forwarding the MAX_VCPU_PER_TD to userspace could
+ * result in an unpredictable ABI. KVM instead always advertise
+ * the number of logical CPUs the platform has as the maximum
+ * vCPUs for TDX guests.
+ *
+ * Make sure MAX_VCPU_PER_TD reported by TDX module is not
+ * smaller than the number of logical CPUs, otherwise KVM will
+ * report an unsupported value to userspace.
+ *
+ * Note, a platform with TDX enabled in the BIOS cannot support
+ * physical CPU hotplug, and TDX requires the BIOS has marked
+ * all logical CPUs in MADT table as enabled. Just use
+ * num_present_cpus() for the number of logical CPUs.
+ */
+ td_conf = &tdx_sysinfo->td_conf;
+ if (td_conf->max_vcpus_per_td < num_present_cpus()) {
+ pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
+ td_conf->max_vcpus_per_td, num_present_cpus());
+ r = -EINVAL;
+ goto get_sysinfo_err;
+ }
+
+ if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) {
+ r = -EINVAL;
+ goto get_sysinfo_err;
+ }
+
+ /*
+ * Leave hardware virtualization enabled after TDX is enabled
+ * successfully. TDX CPU hotplug depends on this.
+ */
+ return 0;
+
+get_sysinfo_err:
+ __tdx_cleanup();
+tdx_bringup_err:
+ kvm_disable_virtualization();
+ return r;
+}
+
+void tdx_cleanup(void)
+{
+ if (enable_tdx) {
+ misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
+ __tdx_cleanup();
+ kvm_disable_virtualization();
+ }
+}
+
+int __init tdx_bringup(void)
+{
+ int r, i;
+
+ /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
+
+ if (!enable_tdx)
+ return 0;
+
+ if (!enable_ept) {
+ pr_err("EPT is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
+ pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!enable_apicv) {
+ pr_err("APICv is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
+ pr_err("tdx: OSXSAVE is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
+ pr_err("tdx: MOVDIR64B is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
+ pr_err("Self-snoop is required for TDX\n");
+ goto success_disable_tdx;
+ }
+
+ if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
+ pr_err("tdx: no TDX private KeyIDs available\n");
+ goto success_disable_tdx;
+ }
+
+ if (!enable_virt_at_load) {
+ pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
+ goto success_disable_tdx;
+ }
+
+ /*
+ * Ideally KVM should probe whether TDX module has been loaded
+ * first and then try to bring it up. But TDX needs to use SEAMCALL
+ * to probe whether the module is loaded (there is no CPUID or MSR
+ * for that), and making SEAMCALL requires enabling virtualization
+ * first, just like the rest steps of bringing up TDX module.
+ *
+ * So, for simplicity do everything in __tdx_bringup(); the first
+ * SEAMCALL will return -ENODEV when the module is not loaded. The
+ * only complication is having to make sure that initialization
+ * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
+ * cases.
+ */
+ r = __tdx_bringup();
+ if (r) {
+ /*
+ * Disable TDX only but don't fail to load module if
+ * the TDX module could not be loaded. No need to print
+ * message saying "module is not loaded" because it was
+ * printed when the first SEAMCALL failed.
+ */
+ if (r == -ENODEV)
+ goto success_disable_tdx;
+
+ enable_tdx = 0;
+ }
+
+ return r;
+
+success_disable_tdx:
+ enable_tdx = 0;
+ return 0;
+}
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
new file mode 100644
index 000000000000..51f98443e8a2
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -0,0 +1,204 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_TDX_H
+#define __KVM_X86_VMX_TDX_H
+
+#include "tdx_arch.h"
+#include "tdx_errno.h"
+
+#ifdef CONFIG_KVM_INTEL_TDX
+#include "common.h"
+
+int tdx_bringup(void);
+void tdx_cleanup(void);
+
+extern bool enable_tdx;
+
+/* TDX module hardware states. These follow the TDX module OP_STATEs. */
+enum kvm_tdx_state {
+ TD_STATE_UNINITIALIZED = 0,
+ TD_STATE_INITIALIZED,
+ TD_STATE_RUNNABLE,
+};
+
+struct kvm_tdx {
+ struct kvm kvm;
+
+ struct misc_cg *misc_cg;
+ int hkid;
+ enum kvm_tdx_state state;
+
+ u64 attributes;
+ u64 xfam;
+
+ u64 tsc_offset;
+ u64 tsc_multiplier;
+
+ struct tdx_td td;
+
+ /* For KVM_TDX_INIT_MEM_REGION. */
+ atomic64_t nr_premapped;
+
+ /*
+ * Prevent vCPUs from TD entry to ensure SEPT zap related SEAMCALLs do
+ * not contend with tdh_vp_enter() and TDCALLs.
+ * Set/unset is protected with kvm->mmu_lock.
+ */
+ bool wait_for_sept_zap;
+};
+
+/* TDX module vCPU states */
+enum vcpu_tdx_state {
+ VCPU_TD_STATE_UNINITIALIZED = 0,
+ VCPU_TD_STATE_INITIALIZED,
+};
+
+struct vcpu_tdx {
+ struct kvm_vcpu vcpu;
+ struct vcpu_vt vt;
+ u64 ext_exit_qualification;
+ gpa_t exit_gpa;
+ struct tdx_module_args vp_enter_args;
+
+ struct tdx_vp vp;
+
+ struct list_head cpu_list;
+
+ u64 vp_enter_ret;
+
+ enum vcpu_tdx_state state;
+ bool guest_entered;
+
+ u64 map_gpa_next;
+ u64 map_gpa_end;
+};
+
+void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err);
+void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
+ u64 val, u64 err);
+
+static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
+{
+ u64 err, data;
+
+ err = tdh_mng_rd(&kvm_tdx->td, TDCS_EXEC(field), &data);
+ if (unlikely(err)) {
+ pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err);
+ return 0;
+ }
+ return data;
+}
+
+static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
+{
+#define VMCS_ENC_ACCESS_TYPE_MASK 0x1UL
+#define VMCS_ENC_ACCESS_TYPE_FULL 0x0UL
+#define VMCS_ENC_ACCESS_TYPE_HIGH 0x1UL
+#define VMCS_ENC_ACCESS_TYPE(field) ((field) & VMCS_ENC_ACCESS_TYPE_MASK)
+
+ /* TDX is 64bit only. HIGH field isn't supported. */
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) &&
+ VMCS_ENC_ACCESS_TYPE(field) == VMCS_ENC_ACCESS_TYPE_HIGH,
+ "Read/Write to TD VMCS *_HIGH fields not supported");
+
+ BUILD_BUG_ON(bits != 16 && bits != 32 && bits != 64);
+
+#define VMCS_ENC_WIDTH_MASK GENMASK(14, 13)
+#define VMCS_ENC_WIDTH_16BIT (0UL << 13)
+#define VMCS_ENC_WIDTH_64BIT (1UL << 13)
+#define VMCS_ENC_WIDTH_32BIT (2UL << 13)
+#define VMCS_ENC_WIDTH_NATURAL (3UL << 13)
+#define VMCS_ENC_WIDTH(field) ((field) & VMCS_ENC_WIDTH_MASK)
+
+ /* TDX is 64bit only. i.e. natural width = 64bit. */
+ BUILD_BUG_ON_MSG(bits != 64 && __builtin_constant_p(field) &&
+ (VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_64BIT ||
+ VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_NATURAL),
+ "Invalid TD VMCS access for 64-bit field");
+ BUILD_BUG_ON_MSG(bits != 32 && __builtin_constant_p(field) &&
+ VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_32BIT,
+ "Invalid TD VMCS access for 32-bit field");
+ BUILD_BUG_ON_MSG(bits != 16 && __builtin_constant_p(field) &&
+ VMCS_ENC_WIDTH(field) == VMCS_ENC_WIDTH_16BIT,
+ "Invalid TD VMCS access for 16-bit field");
+}
+
+static __always_inline void tdvps_management_check(u64 field, u8 bits) {}
+static __always_inline void tdvps_state_non_arch_check(u64 field, u8 bits) {}
+
+#define TDX_BUILD_TDVPS_ACCESSORS(bits, uclass, lclass) \
+static __always_inline u##bits td_##lclass##_read##bits(struct vcpu_tdx *tdx, \
+ u32 field) \
+{ \
+ u64 err, data; \
+ \
+ tdvps_##lclass##_check(field, bits); \
+ err = tdh_vp_rd(&tdx->vp, TDVPS_##uclass(field), &data); \
+ if (unlikely(err)) { \
+ tdh_vp_rd_failed(tdx, #uclass, field, err); \
+ return 0; \
+ } \
+ return (u##bits)data; \
+} \
+static __always_inline void td_##lclass##_write##bits(struct vcpu_tdx *tdx, \
+ u32 field, u##bits val) \
+{ \
+ u64 err; \
+ \
+ tdvps_##lclass##_check(field, bits); \
+ err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), val, \
+ GENMASK_ULL(bits - 1, 0)); \
+ if (unlikely(err)) \
+ tdh_vp_wr_failed(tdx, #uclass, " = ", field, (u64)val, err); \
+} \
+static __always_inline void td_##lclass##_setbit##bits(struct vcpu_tdx *tdx, \
+ u32 field, u64 bit) \
+{ \
+ u64 err; \
+ \
+ tdvps_##lclass##_check(field, bits); \
+ err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), bit, bit); \
+ if (unlikely(err)) \
+ tdh_vp_wr_failed(tdx, #uclass, " |= ", field, bit, err); \
+} \
+static __always_inline void td_##lclass##_clearbit##bits(struct vcpu_tdx *tdx, \
+ u32 field, u64 bit) \
+{ \
+ u64 err; \
+ \
+ tdvps_##lclass##_check(field, bits); \
+ err = tdh_vp_wr(&tdx->vp, TDVPS_##uclass(field), 0, bit); \
+ if (unlikely(err)) \
+ tdh_vp_wr_failed(tdx, #uclass, " &= ~", field, bit, err);\
+}
+
+
+bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu);
+int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err);
+
+TDX_BUILD_TDVPS_ACCESSORS(16, VMCS, vmcs);
+TDX_BUILD_TDVPS_ACCESSORS(32, VMCS, vmcs);
+TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
+
+TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
+TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch);
+
+#else
+static inline int tdx_bringup(void) { return 0; }
+static inline void tdx_cleanup(void) {}
+
+#define enable_tdx 0
+
+struct kvm_tdx {
+ struct kvm kvm;
+};
+
+struct vcpu_tdx {
+ struct kvm_vcpu vcpu;
+};
+
+static inline bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu) { return false; }
+static inline int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { return 0; }
+
+#endif
+
+#endif
diff --git a/arch/x86/kvm/vmx/tdx_arch.h b/arch/x86/kvm/vmx/tdx_arch.h
new file mode 100644
index 000000000000..a30e880849e3
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx_arch.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* architectural constants/data definitions for TDX SEAMCALLs */
+
+#ifndef __KVM_X86_TDX_ARCH_H
+#define __KVM_X86_TDX_ARCH_H
+
+#include <linux/types.h>
+
+/* TDX control structure (TDR/TDCS/TDVPS) field access codes */
+#define TDX_NON_ARCH BIT_ULL(63)
+#define TDX_CLASS_SHIFT 56
+#define TDX_FIELD_MASK GENMASK_ULL(31, 0)
+
+#define __BUILD_TDX_FIELD(non_arch, class, field) \
+ (((non_arch) ? TDX_NON_ARCH : 0) | \
+ ((u64)(class) << TDX_CLASS_SHIFT) | \
+ ((u64)(field) & TDX_FIELD_MASK))
+
+#define BUILD_TDX_FIELD(class, field) \
+ __BUILD_TDX_FIELD(false, (class), (field))
+
+#define BUILD_TDX_FIELD_NON_ARCH(class, field) \
+ __BUILD_TDX_FIELD(true, (class), (field))
+
+
+/* Class code for TD */
+#define TD_CLASS_EXECUTION_CONTROLS 17ULL
+
+/* Class code for TDVPS */
+#define TDVPS_CLASS_VMCS 0ULL
+#define TDVPS_CLASS_GUEST_GPR 16ULL
+#define TDVPS_CLASS_OTHER_GUEST 17ULL
+#define TDVPS_CLASS_MANAGEMENT 32ULL
+
+enum tdx_tdcs_execution_control {
+ TD_TDCS_EXEC_TSC_OFFSET = 10,
+ TD_TDCS_EXEC_TSC_MULTIPLIER = 11,
+};
+
+enum tdx_vcpu_guest_other_state {
+ TD_VCPU_STATE_DETAILS_NON_ARCH = 0x100,
+};
+
+#define TDX_VCPU_STATE_DETAILS_INTR_PENDING BIT_ULL(0)
+
+static inline bool tdx_vcpu_state_details_intr_pending(u64 vcpu_state_details)
+{
+ return !!(vcpu_state_details & TDX_VCPU_STATE_DETAILS_INTR_PENDING);
+}
+
+/* @field is any of enum tdx_tdcs_execution_control */
+#define TDCS_EXEC(field) BUILD_TDX_FIELD(TD_CLASS_EXECUTION_CONTROLS, (field))
+
+/* @field is the VMCS field encoding */
+#define TDVPS_VMCS(field) BUILD_TDX_FIELD(TDVPS_CLASS_VMCS, (field))
+
+/* @field is any of enum tdx_guest_other_state */
+#define TDVPS_STATE(field) BUILD_TDX_FIELD(TDVPS_CLASS_OTHER_GUEST, (field))
+#define TDVPS_STATE_NON_ARCH(field) BUILD_TDX_FIELD_NON_ARCH(TDVPS_CLASS_OTHER_GUEST, (field))
+
+/* Management class fields */
+enum tdx_vcpu_guest_management {
+ TD_VCPU_PEND_NMI = 11,
+};
+
+/* @field is any of enum tdx_vcpu_guest_management */
+#define TDVPS_MANAGEMENT(field) BUILD_TDX_FIELD(TDVPS_CLASS_MANAGEMENT, (field))
+
+#define TDX_EXTENDMR_CHUNKSIZE 256
+
+struct tdx_cpuid_value {
+ u32 eax;
+ u32 ebx;
+ u32 ecx;
+ u32 edx;
+} __packed;
+
+#define TDX_TD_ATTR_DEBUG BIT_ULL(0)
+#define TDX_TD_ATTR_SEPT_VE_DISABLE BIT_ULL(28)
+#define TDX_TD_ATTR_PKS BIT_ULL(30)
+#define TDX_TD_ATTR_KL BIT_ULL(31)
+#define TDX_TD_ATTR_PERFMON BIT_ULL(63)
+
+#define TDX_EXT_EXIT_QUAL_TYPE_MASK GENMASK(3, 0)
+#define TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION 6
+/*
+ * TD_PARAMS is provided as an input to TDH_MNG_INIT, the size of which is 1024B.
+ */
+struct td_params {
+ u64 attributes;
+ u64 xfam;
+ u16 max_vcpus;
+ u8 reserved0[6];
+
+ u64 eptp_controls;
+ u64 config_flags;
+ u16 tsc_frequency;
+ u8 reserved1[38];
+
+ u64 mrconfigid[6];
+ u64 mrowner[6];
+ u64 mrownerconfig[6];
+ u64 reserved2[4];
+
+ union {
+ DECLARE_FLEX_ARRAY(struct tdx_cpuid_value, cpuid_values);
+ u8 reserved3[768];
+ };
+} __packed __aligned(1024);
+
+/*
+ * Guest uses MAX_PA for GPAW when set.
+ * 0: GPA.SHARED bit is GPA[47]
+ * 1: GPA.SHARED bit is GPA[51]
+ */
+#define TDX_CONFIG_FLAGS_MAX_GPAW BIT_ULL(0)
+
+/*
+ * TDH.VP.ENTER, TDG.VP.VMCALL preserves RBP
+ * 0: RBP can be used for TDG.VP.VMCALL input. RBP is clobbered.
+ * 1: RBP can't be used for TDG.VP.VMCALL input. RBP is preserved.
+ */
+#define TDX_CONFIG_FLAGS_NO_RBP_MOD BIT_ULL(2)
+
+
+/*
+ * TDX requires the frequency to be defined in units of 25MHz, which is the
+ * frequency of the core crystal clock on TDX-capable platforms, i.e. the TDX
+ * module can only program frequencies that are multiples of 25MHz. The
+ * frequency must be between 100mhz and 10ghz (inclusive).
+ */
+#define TDX_TSC_KHZ_TO_25MHZ(tsc_in_khz) ((tsc_in_khz) / (25 * 1000))
+#define TDX_TSC_25MHZ_TO_KHZ(tsc_in_25mhz) ((tsc_in_25mhz) * (25 * 1000))
+#define TDX_MIN_TSC_FREQUENCY_KHZ (100 * 1000)
+#define TDX_MAX_TSC_FREQUENCY_KHZ (10 * 1000 * 1000)
+
+/* Additional Secure EPT entry information */
+#define TDX_SEPT_LEVEL_MASK GENMASK_ULL(2, 0)
+#define TDX_SEPT_STATE_MASK GENMASK_ULL(15, 8)
+#define TDX_SEPT_STATE_SHIFT 8
+
+enum tdx_sept_entry_state {
+ TDX_SEPT_FREE = 0,
+ TDX_SEPT_BLOCKED = 1,
+ TDX_SEPT_PENDING = 2,
+ TDX_SEPT_PENDING_BLOCKED = 3,
+ TDX_SEPT_PRESENT = 4,
+};
+
+static inline u8 tdx_get_sept_level(u64 sept_entry_info)
+{
+ return sept_entry_info & TDX_SEPT_LEVEL_MASK;
+}
+
+static inline u8 tdx_get_sept_state(u64 sept_entry_info)
+{
+ return (sept_entry_info & TDX_SEPT_STATE_MASK) >> TDX_SEPT_STATE_SHIFT;
+}
+
+#define MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM BIT_ULL(20)
+
+/*
+ * TD scope metadata field ID.
+ */
+#define TD_MD_FIELD_ID_CPUID_VALUES 0x9410000300000000ULL
+
+#endif /* __KVM_X86_TDX_ARCH_H */
diff --git a/arch/x86/kvm/vmx/tdx_errno.h b/arch/x86/kvm/vmx/tdx_errno.h
new file mode 100644
index 000000000000..6ff4672c4181
--- /dev/null
+++ b/arch/x86/kvm/vmx/tdx_errno.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* architectural status code for SEAMCALL */
+
+#ifndef __KVM_X86_TDX_ERRNO_H
+#define __KVM_X86_TDX_ERRNO_H
+
+#define TDX_SEAMCALL_STATUS_MASK 0xFFFFFFFF00000000ULL
+
+/*
+ * TDX SEAMCALL Status Codes (returned in RAX)
+ */
+#define TDX_NON_RECOVERABLE_VCPU 0x4000000100000000ULL
+#define TDX_NON_RECOVERABLE_TD 0x4000000200000000ULL
+#define TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE 0x6000000500000000ULL
+#define TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE 0x6000000700000000ULL
+#define TDX_INTERRUPTED_RESUMABLE 0x8000000300000000ULL
+#define TDX_OPERAND_INVALID 0xC000010000000000ULL
+#define TDX_OPERAND_BUSY 0x8000020000000000ULL
+#define TDX_PREVIOUS_TLB_EPOCH_BUSY 0x8000020100000000ULL
+#define TDX_PAGE_METADATA_INCORRECT 0xC000030000000000ULL
+#define TDX_VCPU_NOT_ASSOCIATED 0x8000070200000000ULL
+#define TDX_KEY_GENERATION_FAILED 0x8000080000000000ULL
+#define TDX_KEY_STATE_INCORRECT 0xC000081100000000ULL
+#define TDX_KEY_CONFIGURED 0x0000081500000000ULL
+#define TDX_NO_HKID_READY_TO_WBCACHE 0x0000082100000000ULL
+#define TDX_FLUSHVP_NOT_DONE 0x8000082400000000ULL
+#define TDX_EPT_WALK_FAILED 0xC0000B0000000000ULL
+#define TDX_EPT_ENTRY_STATE_INCORRECT 0xC0000B0D00000000ULL
+#define TDX_METADATA_FIELD_NOT_READABLE 0xC0000C0200000000ULL
+
+/*
+ * TDX module operand ID, appears in 31:0 part of error code as
+ * detail information
+ */
+#define TDX_OPERAND_ID_RCX 0x01
+#define TDX_OPERAND_ID_TDR 0x80
+#define TDX_OPERAND_ID_SEPT 0x92
+#define TDX_OPERAND_ID_TD_EPOCH 0xa9
+
+#endif /* __KVM_X86_TDX_ERRNO_H */
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 7c1996b433e2..b25625314658 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -140,6 +140,11 @@ static inline bool is_nm_fault(u32 intr_info)
return is_exception_n(intr_info, NM_VECTOR);
}
+static inline bool is_ve_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, VE_VECTOR);
+}
+
/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 01936013428b..56fd150a6f24 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -188,12 +188,13 @@ struct __packed vmcs12 {
};
/*
- * VMCS12_REVISION is an arbitrary id that should be changed if the content or
- * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
- * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ * VMCS12_REVISION is KVM's arbitrary ID for the layout of struct vmcs12. KVM
+ * enumerates this value to L1 via MSR_IA32_VMX_BASIC, and checks the revision
+ * ID during nested VMPTRLD to verify that L1 is loading a VMCS that adhere's
+ * to KVM's virtual CPU definition.
*
- * IMPORTANT: Changing this value will break save/restore compatibility with
- * older kvm releases.
+ * DO NOT change this value, as it will break save/restore compatibility with
+ * older KVM releases.
*/
#define VMCS12_REVISION 0x11e57ed0
@@ -206,7 +207,8 @@ struct __packed vmcs12 {
#define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE
/*
- * For save/restore compatibility, the vmcs12 field offsets must not change.
+ * For save/restore compatibility, the vmcs12 field offsets must not change,
+ * although appending fields and/or filling gaps is obviously allowed.
*/
#define CHECK_OFFSET(field, loc) \
ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 22411f4aff53..b12414108cbf 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -46,6 +46,7 @@
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
+#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/vmx.h>
@@ -53,6 +54,7 @@
#include <trace/events/ipi.h>
#include "capabilities.h"
+#include "common.h"
#include "cpuid.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
@@ -68,10 +70,13 @@
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
+#include "x86_ops.h"
#include "smm.h"
#include "vmx_onhyperv.h"
+#include "posted_intr.h"
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
#ifdef MODULE
@@ -214,9 +219,11 @@ module_param(ple_window_shrink, uint, 0444);
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
-/* Default is SYSTEM mode, 1 for host-guest mode */
+/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
+#ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO);
+#endif
struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
@@ -257,7 +264,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
- if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
return 0;
}
@@ -268,6 +275,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
case L1TF_MITIGATION_OFF:
l1tf = VMENTER_L1D_FLUSH_NEVER;
break;
+ case L1TF_MITIGATION_AUTO:
case L1TF_MITIGATION_FLUSH_NOWARN:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
@@ -375,9 +383,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
if (!vmx->disable_fb_clear)
return;
- msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
+ msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL);
msr |= FB_CLEAR_DIS;
- native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr);
/* Cache the MSR value to avoid reading it later */
vmx->msr_ia32_mcu_opt_ctrl = msr;
}
@@ -388,7 +396,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
return;
vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
- native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+ native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
}
static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
@@ -402,7 +410,7 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
* and VM-Exit.
*/
vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
- (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
!boot_cpu_has_bug(X86_BUG_MDS) &&
!boot_cpu_has_bug(X86_BUG_TAA);
@@ -478,10 +486,9 @@ noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
ext, vpid, gva);
}
-noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
+noinline void invept_error(unsigned long ext, u64 eptp)
{
- vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
- ext, eptp, gpa);
+ vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
}
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
@@ -522,16 +529,10 @@ static const struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
-static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
-{
- vmx->segment_cache.bitmask = 0;
-}
static unsigned long host_idt_base;
#if IS_ENABLED(CONFIG_HYPERV)
-static struct kvm_x86_ops vmx_x86_ops __initdata;
-
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
@@ -581,9 +582,8 @@ static __init void hv_init_evmcs(void)
}
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_l2_tlb_flush
+ vt_x86_ops.enable_l2_tlb_flush
= hv_enable_l2_tlb_flush;
-
} else {
enlightened_vmcs = false;
}
@@ -755,7 +755,7 @@ fault:
return -EIO;
}
-static void vmx_emergency_disable(void)
+void vmx_emergency_disable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
@@ -874,6 +874,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
/*
+ * #VE isn't used for VMX. To test against unexpected changes
+ * related to #VE for VMX, intercept unexpected #VE and warn on it.
+ */
+ if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ eb |= 1u << VE_VECTOR;
+ /*
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
@@ -1060,7 +1066,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
* provide that period, so a CPU could write host's record into
* guest's memory.
*/
- wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+ wrmsrq(MSR_IA32_PEBS_ENABLE, 0);
}
i = vmx_find_loadstore_msr_slot(&m->guest, msr);
@@ -1118,12 +1124,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
* atomically, since it's faster than switching it manually.
*/
if (cpu_has_load_ia32_efer() ||
- (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+ (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
add_atomic_switch_msr(vmx, MSR_EFER,
- guest_efer, host_efer, false);
+ guest_efer, kvm_host.efer, false);
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
@@ -1136,7 +1142,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
clear_atomic_switch_msr(vmx, MSR_EFER);
guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
+ guest_efer |= kvm_host.efer & ignore_bits;
vmx->guest_uret_msrs[i].data = guest_efer;
vmx->guest_uret_msrs[i].mask = ~ignore_bits;
@@ -1189,13 +1195,13 @@ static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
- wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
- wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
- wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
- wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+ wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
+ wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+ wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+ wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
for (i = 0; i < addr_range; i++) {
- wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
- wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+ wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+ wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
@@ -1203,13 +1209,13 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
- rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
- rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
- rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
- rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
+ rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status);
+ rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
+ rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
+ rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
for (i = 0; i < addr_range; i++) {
- rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
- rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
+ rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
+ rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
@@ -1222,9 +1228,9 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
* GUEST_IA32_RTIT_CTL is already set in the VMCS.
* Save host state before VM entry.
*/
- rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
- wrmsrl(MSR_IA32_RTIT_CTL, 0);
+ wrmsrq(MSR_IA32_RTIT_CTL, 0);
pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
}
@@ -1245,7 +1251,7 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
* i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
*/
if (vmx->pt_desc.host.ctl)
- wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
@@ -1278,6 +1284,7 @@ void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
int cpu = raw_smp_processor_id();
@@ -1306,7 +1313,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
if (vmx->nested.need_vmcs12_to_shadow_sync)
nested_sync_vmcs12_to_shadow(vcpu);
- if (vmx->guest_state_loaded)
+ if (vt->guest_state_loaded)
return;
host_state = &vmx->loaded_vmcs->host_state;
@@ -1327,15 +1334,15 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
fs_sel = current->thread.fsindex;
gs_sel = current->thread.gsindex;
fs_base = current->thread.fsbase;
- vmx->msr_host_kernel_gs_base = current->thread.gsbase;
+ vt->msr_host_kernel_gs_base = current->thread.gsbase;
} else {
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
fs_base = read_msr(MSR_FS_BASE);
- vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+ vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
}
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
@@ -1344,14 +1351,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
#endif
vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
- vmx->guest_state_loaded = true;
+ vt->guest_state_loaded = true;
}
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
struct vmcs_host_state *host_state;
- if (!vmx->guest_state_loaded)
+ if (!vmx->vt.guest_state_loaded)
return;
host_state = &vmx->loaded_vmcs->host_state;
@@ -1359,7 +1366,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
++vmx->vcpu.stat.host_state_reload;
#ifdef CONFIG_X86_64
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
kvm_load_ldt(host_state->ldt_sel);
@@ -1379,10 +1386,10 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
#endif
invalidate_tss_limit();
#ifdef CONFIG_X86_64
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base);
#endif
load_fixmap_gdt(raw_smp_processor_id());
- vmx->guest_state_loaded = false;
+ vmx->vt.guest_state_loaded = false;
vmx->guest_uret_msrs_loaded = false;
}
@@ -1390,8 +1397,8 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
preempt_disable();
- if (vmx->guest_state_loaded)
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+ if (vmx->vt.guest_state_loaded)
+ rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
preempt_enable();
return vmx->msr_guest_kernel_gs_base;
}
@@ -1399,13 +1406,45 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
preempt_disable();
- if (vmx->guest_state_loaded)
- wrmsrl(MSR_KERNEL_GS_BASE, data);
+ if (vmx->vt.guest_state_loaded)
+ wrmsrq(MSR_KERNEL_GS_BASE, data);
preempt_enable();
vmx->msr_guest_kernel_gs_base = data;
}
#endif
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy)
{
@@ -1442,7 +1481,8 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
* performs IBPB on nested VM-Exit (a single nested transition
* may switch the active VMCS multiple times).
*/
- if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
+ if (static_branch_likely(&switch_vcpu_ibpb) &&
+ (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)))
indirect_branch_prediction_barrier();
}
@@ -1477,18 +1517,17 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
*/
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
vmx_vcpu_pi_load(vcpu, cpu);
-
- vmx->host_debugctlmsr = get_debugctlmsr();
}
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);
@@ -1544,10 +1583,10 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
- vmx->emulation_required = vmx_emulation_required(vcpu);
+ vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}
-static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}
@@ -1598,7 +1637,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
* result in a #GP unless the same write also clears TraceEn.
*/
if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
- ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
+ (data & RTIT_CTL_TRACEEN) &&
+ data != vmx->pt_desc.guest.ctl)
return 1;
/*
@@ -1653,8 +1693,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
/*
* Emulation of instructions in SGX enclaves is impossible as RIP does
@@ -1663,16 +1703,22 @@ static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
* so that guest userspace can't DoS the guest simply by triggering
* emulation (enclaves are CPL3 only).
*/
- if (to_vmx(vcpu)->exit_reason.enclave_mode) {
+ if (vmx_get_exit_reason(vcpu).enclave_mode) {
kvm_queue_exception(vcpu, UD_VECTOR);
return X86EMUL_PROPAGATE_FAULT;
}
+
+ /* Check that emulation is possible during event vectoring */
+ if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ !kvm_can_emulate_event_vectoring(emul_type))
+ return X86EMUL_UNHANDLEABLE_VECTORING;
+
return X86EMUL_CONTINUE;
}
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
unsigned long rip, orig_rip;
u32 instr_len;
@@ -1738,7 +1784,7 @@ rip_updated:
* Recognizes a pending MTF VM-exit and records the nested state for later
* delivery.
*/
-static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1769,7 +1815,7 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
}
}
-static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
vmx_update_emulated_instruction(vcpu);
return skip_emulated_instruction(vcpu);
@@ -1788,7 +1834,7 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
-static void vmx_inject_exception(struct kvm_vcpu *vcpu)
+void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
@@ -1819,7 +1865,7 @@ static void vmx_inject_exception(struct kvm_vcpu *vcpu)
return;
}
- WARN_ON_ONCE(vmx->emulation_required);
+ WARN_ON_ONCE(vmx->vt.emulation_required);
if (kvm_exception_is_soft(ex->vector)) {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
@@ -1870,8 +1916,8 @@ static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
- guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));
/*
* hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
@@ -1909,12 +1955,12 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
return kvm_caps.default_tsc_scaling_ratio;
}
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}
-static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
}
@@ -1957,15 +2003,15 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
return !(msr->data & ~valid_bits);
}
-static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+int vmx_get_feature_msr(u32 msr, u64 *data)
{
- switch (msr->index) {
+ switch (msr) {
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested)
return 1;
- return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
default:
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
}
@@ -1974,7 +2020,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
@@ -2024,7 +2070,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
@@ -2040,13 +2086,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
return 1;
msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
break;
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
- if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return 1;
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data))
@@ -2059,7 +2105,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* sanity checking and refuse to boot. Filter all unsupported
* features out.
*/
- if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
+ if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
nested_evmcs_filter_control_msr(vcpu, msr_info->index,
&msr_info->data);
#endif
@@ -2129,7 +2175,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
u64 data)
{
#ifdef CONFIG_X86_64
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
return (u32)data;
#endif
return (unsigned long)data;
@@ -2140,7 +2186,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
u64 debugctl = 0;
if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
- (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
@@ -2155,7 +2201,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
@@ -2244,9 +2290,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
return 1;
- if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
+ if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD))
return 1;
@@ -2346,7 +2392,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* behavior, but it's close enough.
*/
if (!msr_info->host_initiated &&
- (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+ (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
!(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
return 1;
@@ -2356,7 +2402,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
- if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_RTIT_CTL:
@@ -2411,7 +2457,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
if (index >= 2 * vmx->pt_desc.num_address_ranges)
return 1;
- if (is_noncanonical_address(data, vcpu))
+ if (is_noncanonical_msr_address(data, vcpu))
return 1;
if (index % 2)
vmx->pt_desc.guest.addr_b[index / 2] = data;
@@ -2419,8 +2465,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx->pt_desc.guest.addr_a[index / 2] = data;
break;
case MSR_IA32_PERF_CAPABILITIES:
- if (data && !vcpu_to_pmu(vcpu)->version)
- return 1;
if (data & PMU_CAP_LBR_FMT) {
if ((data & PMU_CAP_LBR_FMT) !=
(kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
@@ -2432,9 +2476,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((data & PERF_CAP_PEBS_MASK) !=
(kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
return 1;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
return 1;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
return 1;
if (!cpuid_model_is_consistent(vcpu))
return 1;
@@ -2458,7 +2502,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return ret;
}
-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
unsigned long guest_owned_bits;
@@ -2512,30 +2556,6 @@ static bool cpu_has_sgx(void)
return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
}
-/*
- * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
- * can't be used due to errata where VM Exit may incorrectly clear
- * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the
- * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
- */
-static bool cpu_has_perf_global_ctrl_bug(void)
-{
- if (boot_cpu_data.x86 == 0x6) {
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_NEHALEM_EP: /* AAK155 */
- case INTEL_FAM6_NEHALEM: /* AAP115 */
- case INTEL_FAM6_WESTMERE: /* AAT100 */
- case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */
- case INTEL_FAM6_NEHALEM_EX: /* BA97 */
- return true;
- default:
- break;
- }
- }
-
- return false;
-}
-
static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
@@ -2558,23 +2578,50 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
{
u64 allowed;
- rdmsrl(msr, allowed);
+ rdmsrq(msr, allowed);
return ctl_opt & allowed;
}
+#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \
+({ \
+ int i, r = 0; \
+ \
+ BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \
+ BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \
+ \
+ for (i = 0; i < ARRAY_SIZE(pairs); i++) { \
+ typeof(entry_controls) n_ctrl = pairs[i].entry_control; \
+ typeof(exit_controls) x_ctrl = pairs[i].exit_control; \
+ \
+ if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \
+ continue; \
+ \
+ pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \
+ "entry = %llx (%llx), exit = %llx (%llx)\n", \
+ (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \
+ (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \
+ \
+ if (error_on_inconsistent_vmcs_config) \
+ r = -EIO; \
+ \
+ entry_controls &= ~n_ctrl; \
+ exit_controls &= ~x_ctrl; \
+ } \
+ r; \
+})
+
static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
struct vmx_capability *vmx_cap)
{
- u32 vmx_msr_low, vmx_msr_high;
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
u64 _cpu_based_3rd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
+ u64 basic_msr;
u64 misc_msr;
- int i;
/*
* LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
@@ -2606,6 +2653,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_cpu_based_2nd_exec_control))
return -EIO;
}
+ if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
@@ -2630,6 +2680,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmx_cap->ept = 0;
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
vmx_cap->vpid) {
@@ -2674,46 +2725,54 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_vmentry_control))
return -EIO;
- for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
- u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
- u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
-
- if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
- continue;
-
- pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
- _vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
-
- if (error_on_inconsistent_vmcs_config)
- return -EIO;
+ if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs,
+ _vmentry_control, _vmexit_control))
+ return -EIO;
- _vmentry_control &= ~n_ctrl;
- _vmexit_control &= ~x_ctrl;
+ /*
+ * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
+ * can't be used due to an errata where VM Exit may incorrectly clear
+ * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
+ * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
+ */
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_NEHALEM_EP: /* AAK155 */
+ case INTEL_NEHALEM: /* AAP115 */
+ case INTEL_WESTMERE: /* AAT100 */
+ case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
+ case INTEL_NEHALEM_EX: /* BA97 */
+ _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ "does not work properly. Using workaround\n");
+ break;
+ default:
+ break;
}
- rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+ rdmsrq(MSR_IA32_VMX_BASIC, basic_msr);
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
- if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
return -EIO;
#ifdef CONFIG_X86_64
- /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
- if (vmx_msr_high & (1u<<16))
+ /*
+ * KVM expects to be able to shove all legal physical addresses into
+ * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
+ * 0 for processors that support Intel 64 architecture".
+ */
+ if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
return -EIO;
#endif
/* Require Write-Back (WB) memory type for VMCS accesses. */
- if (((vmx_msr_high >> 18) & 15) != 6)
+ if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
return -EIO;
- rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
-
- vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
-
- vmcs_conf->revision_id = vmx_msr_low;
+ rdmsrq(MSR_IA32_VMX_MISC, misc_msr);
+ vmcs_conf->basic = basic_msr;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
@@ -2759,7 +2818,7 @@ static bool kvm_is_vmx_supported(void)
return supported;
}
-static int vmx_check_processor_compat(void)
+int vmx_check_processor_compat(void)
{
int cpu = raw_smp_processor_id();
struct vmcs_config vmcs_conf;
@@ -2795,13 +2854,13 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer)
fault:
WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
- rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
cr4_clear_bits(X86_CR4_VMXE);
return -EFAULT;
}
-static int vmx_hardware_enable(void)
+int vmx_enable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2825,9 +2884,6 @@ static int vmx_hardware_enable(void)
return r;
}
- if (enable_ept)
- ept_sync_global();
-
return 0;
}
@@ -2841,7 +2897,7 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-static void vmx_hardware_disable(void)
+void vmx_disable_virtualization_cpu(void)
{
vmclear_local_loaded_vmcss();
@@ -2863,13 +2919,13 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
if (!pages)
return NULL;
vmcs = page_address(pages);
- memset(vmcs, 0, vmcs_config.size);
+ memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
/* KVM supports Enlightened VMCS v1 only */
if (kvm_is_using_evmcs())
vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
else
- vmcs->hdr.revision_id = vmcs_config.revision_id;
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
if (shadow)
vmcs->hdr.shadow_vmcs = 1;
@@ -2962,7 +3018,7 @@ static __init int alloc_kvm_area(void)
* physical CPU.
*/
if (kvm_is_using_evmcs())
- vmcs->hdr.revision_id = vmcs_config.revision_id;
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
per_cpu(vmxarea, cpu) = vmcs;
}
@@ -3155,7 +3211,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
#endif
-static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3180,12 +3236,12 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu))
+ if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
return nested_get_vpid02(vcpu);
return to_vmx(vcpu)->vpid;
}
-static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
u64 root_hpa = mmu->root.hpa;
@@ -3201,7 +3257,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
* vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
@@ -3210,7 +3266,7 @@ static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
-static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
* vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
@@ -3255,7 +3311,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
CPU_BASED_CR3_STORE_EXITING)
-static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
if (is_guest_mode(vcpu))
return nested_guest_cr0_valid(vcpu, cr0);
@@ -3352,7 +3408,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
/* depends on vcpu->arch.cr0 to be set to a new value */
- vmx->emulation_required = vmx_emulation_required(vcpu);
+ vmx->vt.emulation_required = vmx_emulation_required(vcpu);
}
static int vmx_get_max_ept_level(void)
@@ -3376,8 +3432,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
return eptp;
}
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
- int root_level)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
@@ -3406,8 +3461,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
vmcs_writel(GUEST_CR3, guest_cr3);
}
-
-static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
/*
* We operate under the default treatment of SMM, so VMX cannot be
@@ -3484,7 +3538,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcs_writel(GUEST_CR4, hw_cr4);
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
}
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
@@ -3523,7 +3577,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
var->g = (ar >> 15) & 1;
}
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_segment s;
@@ -3534,16 +3588,29 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
}
-int vmx_get_cpl(struct kvm_vcpu *vcpu)
+static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ar;
if (unlikely(vmx->rmode.vm86_active))
return 0;
- else {
- int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
- return VMX_AR_DPL(ar);
- }
+
+ if (no_cache)
+ ar = vmcs_read32(GUEST_SS_AR_BYTES);
+ else
+ ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
+ return VMX_AR_DPL(ar);
+}
+
+int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ return __vmx_get_cpl(vcpu, false);
+}
+
+int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu)
+{
+ return __vmx_get_cpl(vcpu, true);
}
static u32 vmx_segment_access_rights(struct kvm_segment *var)
@@ -3600,14 +3667,14 @@ void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
}
-static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
__vmx_set_segment(vcpu, var, seg);
- to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
+ to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu);
}
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
@@ -3615,25 +3682,25 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
*l = (ar >> 13) & 1;
}
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
dt->address = vmcs_readl(GUEST_IDTR_BASE);
}
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
vmcs_writel(GUEST_IDTR_BASE, dt->address);
}
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
dt->address = vmcs_readl(GUEST_GDTR_BASE);
}
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
vmcs_writel(GUEST_GDTR_BASE, dt->address);
@@ -4101,27 +4168,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- void *vapic_page;
- u32 vppr;
- int rvi;
-
- if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
- !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
- WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
- return false;
-
- rvi = vmx_get_rvi();
-
- vapic_page = vmx->nested.virtual_apic_map.hva;
- vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
-
- return ((rvi & 0xf0) > (vppr & 0xf0));
-}
-
-static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 i;
@@ -4152,55 +4199,18 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
pt_update_intercept_for_msr(vcpu);
}
-static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
- int pi_vec)
-{
-#ifdef CONFIG_SMP
- if (vcpu->mode == IN_GUEST_MODE) {
- /*
- * The vector of the virtual has already been set in the PIR.
- * Send a notification event to deliver the virtual interrupt
- * unless the vCPU is the currently running vCPU, i.e. the
- * event is being sent from a fastpath VM-Exit handler, in
- * which case the PIR will be synced to the vIRR before
- * re-entering the guest.
- *
- * When the target is not the running vCPU, the following
- * possibilities emerge:
- *
- * Case 1: vCPU stays in non-root mode. Sending a notification
- * event posts the interrupt to the vCPU.
- *
- * Case 2: vCPU exits to root mode and is still runnable. The
- * PIR will be synced to the vIRR before re-entering the guest.
- * Sending a notification event is ok as the host IRQ handler
- * will ignore the spurious event.
- *
- * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
- * has already synced PIR to vIRR and never blocks the vCPU if
- * the vIRR is not empty. Therefore, a blocked vCPU here does
- * not wait for any requested interrupts in PIR, and sending a
- * notification event also results in a benign, spurious event.
- */
-
- if (vcpu != kvm_get_running_vcpu())
- __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
- return;
- }
-#endif
- /*
- * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
- * otherwise do nothing as KVM will grab the highest priority pending
- * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
- */
- kvm_vcpu_wake_up(vcpu);
-}
-
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
int vector)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+ * and freed, and must not be accessed outside of vcpu->mutex. The
+ * vCPU's cached PI NV is valid if and only if posted interrupts
+ * enabled in its vmcs12, i.e. checking the vector also checks that
+ * L1 has enabled posted interrupts for L2.
+ */
if (is_guest_mode(vcpu) &&
vector == vmx->nested.posted_intr_nv) {
/*
@@ -4237,7 +4247,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
*/
static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
int r;
r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
@@ -4248,25 +4258,12 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (!vcpu->arch.apic->apicv_active)
return -1;
- if (pi_test_and_set_pir(vector, &vmx->pi_desc))
- return 0;
-
- /* If a previous notification has sent the IPI, nothing to do. */
- if (pi_test_and_set_on(&vmx->pi_desc))
- return 0;
-
- /*
- * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
- * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
- * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
- * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
- */
- kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
+ __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector);
return 0;
}
-static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
- int trig_mode, int vector)
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
{
struct kvm_vcpu *vcpu = apic->vcpu;
@@ -4341,7 +4338,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
- rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
+ rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl);
vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
@@ -4350,7 +4347,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
}
if (cpu_has_load_ia32_efer())
- vmcs_write64(HOST_IA32_EFER, host_efer);
+ vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
}
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -4399,9 +4396,6 @@ static u32 vmx_vmentry_ctrl(void)
VM_ENTRY_LOAD_IA32_EFER |
VM_ENTRY_IA32E_MODE);
- if (cpu_has_perf_global_ctrl_bug())
- vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
-
return vmentry_ctrl;
}
@@ -4419,16 +4413,12 @@ static u32 vmx_vmexit_ctrl(void)
if (vmx_pt_mode_is_system())
vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
VM_EXIT_CLEAR_IA32_RTIT_CTL);
-
- if (cpu_has_perf_global_ctrl_bug())
- vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
-
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
return vmexit_ctrl &
~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
}
-static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4538,7 +4528,8 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
* Update the nested MSR settings so that a nested VMM can/can't set
* controls for features that are/aren't exposed to the guest.
*/
- if (nested) {
+ if (nested &&
+ kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
/*
* All features that can be added or removed to VMX MSRs must
* be supported in the first place for nested virtualization.
@@ -4564,10 +4555,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
bool __enabled; \
\
if (cpu_has_vmx_##name()) { \
- if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \
- __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \
- else \
- __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \
+ __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \
vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
__enabled, exiting); \
} \
@@ -4594,6 +4582,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
enable_unrestricted_guest = 0;
}
if (!enable_unrestricted_guest)
@@ -4642,8 +4631,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
*/
if (cpu_has_vmx_rdtscp()) {
bool rdpid_or_rdtscp_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
vmx_adjust_secondary_exec_control(vmx, &exec_control,
SECONDARY_EXEC_ENABLE_RDTSCP,
@@ -4692,7 +4681,7 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
return 0;
}
-static int vmx_vcpu_precreate(struct kvm *kvm)
+int vmx_vcpu_precreate(struct kvm *kvm)
{
return vmx_alloc_ipiv_pid_table(kvm);
}
@@ -4717,8 +4706,12 @@ static void init_vmcs(struct vcpu_vmx *vmx)
exec_controls_set(vmx, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls())
+ if (cpu_has_secondary_exec_ctrls()) {
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS,
+ __pa(vmx->ve_info));
+ }
if (cpu_has_tertiary_exec_ctrls())
tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
@@ -4732,7 +4725,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmcs_write16(GUEST_INTR_STATUS, 0);
vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
- vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
+ vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc)));
}
if (vmx_can_use_ipiv(&vmx->vcpu)) {
@@ -4789,7 +4782,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (enable_pml) {
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
vmx_write_encls_bitmap(&vmx->vcpu, NULL);
@@ -4823,7 +4816,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
init_vmcs(vmx);
- if (nested)
+ if (nested &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
vcpu_setup_sgx_lepubkeyhash(vcpu);
@@ -4836,18 +4830,19 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
#endif
- vcpu->arch.microcode_version = 0x100000000ULL;
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
+ vcpu->arch.microcode_version = 0x100000000ULL;
vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
/*
* Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
* or POSTED_INTR_WAKEUP_VECTOR.
*/
- vmx->pi_desc.nv = POSTED_INTR_VECTOR;
- vmx->pi_desc.sn = 1;
+ vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
+ __pi_set_sn(&vmx->vt.pi_desc);
}
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4862,9 +4857,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx->hv_deadline_tsc = -1;
kvm_set_cr8(vcpu, 0);
- vmx_segment_cache_clear(vmx);
- kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
-
seg_setup(VCPU_SREG_CS);
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
@@ -4891,6 +4883,9 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_writel(GUEST_IDTR_BASE, 0);
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+ vmx_segment_cache_clear(vmx);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
+
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
@@ -4906,12 +4901,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_update_fb_clear_dis(vcpu, vmx);
}
-static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
-static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
{
if (!enable_vnmi ||
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
@@ -4922,7 +4917,7 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
uint32_t intr;
@@ -4950,7 +4945,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
vmx_clear_hlt(vcpu);
}
-static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5028,7 +5023,7 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
GUEST_INTR_STATE_NMI));
}
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
@@ -5040,17 +5035,22 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !vmx_nmi_blocked(vcpu);
}
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
return false;
- return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+ return __vmx_interrupt_blocked(vcpu);
}
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
@@ -5065,7 +5065,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !vmx_interrupt_blocked(vcpu);
}
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
void __user *ret;
@@ -5085,7 +5085,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
return init_rmode_tss(kvm, ret);
}
-static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
return 0;
@@ -5173,6 +5173,12 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
(kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
}
+static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.guest_fpu.fpstate->xfd &&
+ !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS);
+}
+
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5199,13 +5205,24 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
* point.
*/
if (is_nm_fault(intr_info)) {
- kvm_queue_exception(vcpu, NM_VECTOR);
+ kvm_queue_exception_p(vcpu, NM_VECTOR,
+ is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0);
return 1;
}
if (is_invalid_opcode(intr_info))
return handle_ud(vcpu);
+ if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+
+ WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
+ "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
+ dump_vmcs(vcpu);
+ kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
+ return 1;
+ }
+
error_code = 0;
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
@@ -5371,8 +5388,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
return kvm_fast_pio(vcpu, size, port, in);
}
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
/*
* Patch in the VMCALL instruction:
@@ -5578,7 +5594,7 @@ out:
return kvm_complete_insn_gp(vcpu, err);
}
-static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
@@ -5597,7 +5613,13 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
-static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ lockdep_assert_irqs_disabled();
+ set_debugreg(vcpu->arch.dr6, 6);
+}
+
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
}
@@ -5734,11 +5756,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification;
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
gpa_t gpa;
- u64 error_code;
-
- exit_qualification = vmx_get_exit_qual(vcpu);
/*
* EPT violation happened while executing iret from NMI,
@@ -5754,24 +5773,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(vcpu, gpa, exit_qualification);
- /* Is it a read fault? */
- error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
- ? PFERR_USER_MASK : 0;
- /* Is it a write fault? */
- error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
- ? PFERR_WRITE_MASK : 0;
- /* Is it a fetch fault? */
- error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
- ? PFERR_FETCH_MASK : 0;
- /* ept page table entry is present? */
- error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
- ? PFERR_PRESENT_MASK : 0;
-
- error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
- PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
-
- vcpu->arch.exit_qualification = exit_qualification;
-
/*
* Check that the GPA doesn't exceed physical memory limits, as that is
* a guest page fault. We have to emulate the instruction here, because
@@ -5783,7 +5784,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
return kvm_emulate_instruction(vcpu, 0);
- return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+ return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification);
}
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
@@ -5819,11 +5820,35 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
return 1;
}
-static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
+/*
+ * Returns true if emulation is required (due to the vCPU having invalid state
+ * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the
+ * current vCPU state.
+ */
+static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- return vmx->emulation_required && !vmx->rmode.vm86_active &&
+ if (!vmx->vt.emulation_required)
+ return false;
+
+ /*
+ * It is architecturally impossible for emulation to be required when a
+ * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
+ * guest state is invalid and unrestricted guest is disabled, i.e. KVM
+ * should synthesize VM-Fail instead emulation L2 code. This path is
+ * only reachable if userspace modifies L2 guest state after KVM has
+ * performed the nested VM-Enter consistency checks.
+ */
+ if (vmx->nested.nested_run_pending)
+ return true;
+
+ /*
+ * KVM only supports emulating exceptions if the vCPU is in Real Mode.
+ * If emulation is required, KVM can't perform a successful VM-Enter to
+ * inject the exception.
+ */
+ return !vmx->rmode.vm86_active &&
(kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
}
@@ -5836,7 +5861,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
intr_window_requested = exec_controls_get(vmx) &
CPU_BASED_INTR_WINDOW_EXITING;
- while (vmx->emulation_required && count-- != 0) {
+ while (vmx->vt.emulation_required && count-- != 0) {
if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
return handle_interrupt_window(&vmx->vcpu);
@@ -5846,7 +5871,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (!kvm_emulate_instruction(vcpu, 0))
return 0;
- if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ if (vmx_unhandleable_emulation_required(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -5868,9 +5893,9 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1;
}
-static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
- if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ if (vmx_unhandleable_emulation_required(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -5878,38 +5903,6 @@ static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
return 1;
}
-static void grow_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned int old = vmx->ple_window;
-
- vmx->ple_window = __grow_ple_window(old, ple_window,
- ple_window_grow,
- ple_window_max);
-
- if (vmx->ple_window != old) {
- vmx->ple_window_dirty = true;
- trace_kvm_ple_window_update(vcpu->vcpu_id,
- vmx->ple_window, old);
- }
-}
-
-static void shrink_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned int old = vmx->ple_window;
-
- vmx->ple_window = __shrink_ple_window(old, ple_window,
- ple_window_shrink,
- ple_window);
-
- if (vmx->ple_window != old) {
- vmx->ple_window_dirty = true;
- trace_kvm_ple_window_update(vcpu->vcpu_id,
- vmx->ple_window, old);
- }
-}
-
/*
* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -5945,7 +5938,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
} operand;
int gpr_index;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -6035,7 +6028,7 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu)
/*
* When nested=0, all VMX instruction VM Exits filter here. The handlers
- * are overwritten by nested_vmx_setup() when nested=1.
+ * are overwritten by nested_vmx_hardware_setup() when nested=1.
*/
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
@@ -6063,7 +6056,7 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
* VM-Exits. Unconditionally set the flag here and leave the handling to
* vmx_handle_exit().
*/
- to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
+ to_vt(vcpu)->exit_reason.bus_lock_detected = true;
return 1;
}
@@ -6156,15 +6149,14 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
- u64 *info1, u64 *info2,
- u32 *intr_info, u32 *error_code)
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- *reason = vmx->exit_reason.full;
+ *reason = vmx->vt.exit_reason.full;
*info1 = vmx_get_exit_qual(vcpu);
- if (!(vmx->exit_reason.failed_vmentry)) {
+ if (!(vmx->vt.exit_reason.failed_vmentry)) {
*info2 = vmx->idt_vectoring_info;
*intr_info = vmx_get_intr_info(vcpu);
if (is_exception_with_error_code(*intr_info))
@@ -6178,6 +6170,15 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
}
}
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
+{
+ *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+ else
+ *error_code = 0;
+}
+
static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
{
if (vmx->pml_pg) {
@@ -6189,32 +6190,40 @@ static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u16 pml_idx, pml_tail_index;
u64 *pml_buf;
- u16 pml_idx;
+ int i;
pml_idx = vmcs_read16(GUEST_PML_INDEX);
/* Do nothing if PML buffer is empty */
- if (pml_idx == (PML_ENTITY_NUM - 1))
+ if (pml_idx == PML_HEAD_INDEX)
return;
+ /*
+ * PML index always points to the next available PML buffer entity
+ * unless PML log has just overflowed.
+ */
+ pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;
- /* PML index always points to next available PML buffer entity */
- if (pml_idx >= PML_ENTITY_NUM)
- pml_idx = 0;
- else
- pml_idx++;
-
+ /*
+ * PML log is written backwards: the CPU first writes the entry 511
+ * then the entry 510, and so on.
+ *
+ * Read the entries in the same order they were written, to ensure that
+ * the dirty ring is filled in the same order the CPU wrote them.
+ */
pml_buf = page_address(vmx->pml_pg);
- for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+
+ for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
u64 gpa;
- gpa = pml_buf[pml_idx];
+ gpa = pml_buf[i];
WARN_ON(gpa & (PAGE_SIZE - 1));
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
}
/* reset PML index */
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
static void vmx_dump_sel(char *name, uint32_t sel)
@@ -6416,6 +6425,24 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
pr_err("Virtual processor ID = 0x%04x\n",
vmcs_read16(VIRTUAL_PROCESSOR_ID));
+ if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+ u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
+
+ /*
+ * If KVM is dumping the VMCS, then something has gone wrong
+ * already. Derefencing an address from the VMCS, which could
+ * very well be corrupted, is a terrible idea. The virtual
+ * address is known so use it.
+ */
+ pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
+ ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
+ pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
+ ve_info->exit_reason, ve_info->delivery,
+ ve_info->exit_qualification,
+ ve_info->guest_linear_address,
+ ve_info->guest_physical_address, ve_info->eptp_index);
+ }
}
/*
@@ -6425,7 +6452,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- union vmx_exit_reason exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
u32 vectoring_info = vmx->idt_vectoring_info;
u16 exit_handler_index;
@@ -6481,7 +6508,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* the least awful solution for the userspace case without
* risking false positives.
*/
- if (vmx->emulation_required) {
+ if (vmx->vt.emulation_required) {
nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
return 1;
}
@@ -6491,7 +6518,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
}
/* If guest state is invalid, start emulating. L2 is handled above. */
- if (vmx->emulation_required)
+ if (vmx->vt.emulation_required)
return handle_invalid_guest_state(vcpu);
if (exit_reason.failed_vmentry) {
@@ -6512,33 +6539,15 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return 0;
}
- /*
- * Note:
- * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
- * delivery event since it indicates guest is accessing MMIO.
- * The vm-exit can be triggered again after return to guest that
- * will cause infinite loop.
- */
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
(exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
exit_reason.basic != EXIT_REASON_PML_FULL &&
exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
- exit_reason.basic != EXIT_REASON_NOTIFY)) {
- int ndata = 3;
-
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
- vcpu->run->internal.data[0] = vectoring_info;
- vcpu->run->internal.data[1] = exit_reason.full;
- vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu);
- if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
- vcpu->run->internal.data[ndata++] =
- vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- }
- vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
- vcpu->run->internal.ndata = ndata;
+ exit_reason.basic != EXIT_REASON_NOTIFY &&
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
+ kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
return 0;
}
@@ -6601,7 +6610,7 @@ unexpected_vmexit:
return 0;
}
-static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
int ret = __vmx_handle_exit(vcpu, exit_fastpath);
@@ -6609,7 +6618,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* Exit to user space when bus lock detected to inform that there is
* a bus lock in guest.
*/
- if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
+ if (vmx_get_exit_reason(vcpu).bus_lock_detected) {
if (ret > 0)
vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
@@ -6641,9 +6650,10 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
bool flush_l1d;
/*
- * Clear the per-vcpu flush bit, it gets set again
- * either from vcpu_run() or from one of the unsafe
- * VMEXIT handlers.
+ * Clear the per-vcpu flush bit, it gets set again if the vCPU
+ * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+ * exits to userspace, or if KVM reaches one of the unsafe
+ * VMEXIT handlers, e.g. if KVM calls into the emulator.
*/
flush_l1d = vcpu->arch.l1tf_flush_l1d;
vcpu->arch.l1tf_flush_l1d = false;
@@ -6662,7 +6672,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
vcpu->stat.l1d_flush++;
if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
return;
}
@@ -6689,7 +6699,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
: "eax", "ebx", "ecx", "edx");
}
-static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
int tpr_threshold;
@@ -6759,14 +6769,16 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
vmx_update_msr_bitmap_x2apic(vcpu);
}
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *slot;
+ struct page *refcounted_page;
unsigned long mmu_seq;
kvm_pfn_t pfn;
+ bool writable;
/* Defer reload until vmcs01 is the current VMCS. */
if (is_guest_mode(vcpu)) {
@@ -6802,37 +6814,58 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
* controls the APIC-access page memslot, and only deletes the memslot
* if APICv is permanently inhibited, i.e. the memslot won't reappear.
*/
- pfn = gfn_to_pfn_memslot(slot, gfn);
+ pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
if (is_error_noslot_pfn(pfn))
return;
read_lock(&vcpu->kvm->mmu_lock);
- if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
+ if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- read_unlock(&vcpu->kvm->mmu_lock);
- goto out;
- }
+ else
+ vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
- vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
- read_unlock(&vcpu->kvm->mmu_lock);
+ /*
+ * Do not pin the APIC access page in memory so that it can be freely
+ * migrated, the MMU notifier will call us again if it is migrated or
+ * swapped out. KVM backs the memslot with anonymous memory, the pfn
+ * should always point at a refcounted page (if the pfn is valid).
+ */
+ if (!WARN_ON_ONCE(!refcounted_page))
+ kvm_release_page_clean(refcounted_page);
/*
* No need for a manual TLB flush at this point, KVM has already done a
* flush if there were SPTEs pointing at the previous page.
*/
-out:
- /*
- * Do not pin apic access page in memory, the MMU notifier
- * will call us again if it is migrated or swapped out.
- */
- kvm_release_pfn_clean(pfn);
+ read_unlock(&vcpu->kvm->mmu_lock);
}
-static void vmx_hwapic_isr_update(int max_isr)
+void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
u16 status;
u8 old;
+ /*
+ * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
+ * is only relevant for if and only if Virtual Interrupt Delivery is
+ * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
+ * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested
+ * VM-Exit, otherwise L1 with run with a stale SVI.
+ */
+ if (is_guest_mode(vcpu)) {
+ /*
+ * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
+ * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
+ * Note, userspace can stuff state while L2 is active; assert
+ * that VID is disabled if and only if the vCPU is in KVM_RUN
+ * to avoid false positives if userspace is setting APIC state.
+ */
+ WARN_ON_ONCE(vcpu->wants_to_run &&
+ nested_cpu_has_vid(get_vmcs12(vcpu)));
+ to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
+ return;
+ }
+
if (max_isr == -1)
max_isr = 0;
@@ -6862,38 +6895,24 @@ static void vmx_set_rvi(int vector)
}
}
-static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
-{
- /*
- * When running L2, updating RVI is only relevant when
- * vmcs12 virtual-interrupt-delivery enabled.
- * However, it can be enabled only when L1 also
- * intercepts external-interrupts and in that case
- * we should not update vmcs02 RVI but instead intercept
- * interrupt. Therefore, do nothing when running L2.
- */
- if (!is_guest_mode(vcpu))
- vmx_set_rvi(max_irr);
-}
-
-static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vcpu_vt *vt = to_vt(vcpu);
int max_irr;
bool got_posted_interrupt;
if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
return -EIO;
- if (pi_test_on(&vmx->pi_desc)) {
- pi_clear_on(&vmx->pi_desc);
+ if (pi_test_on(&vt->pi_desc)) {
+ pi_clear_on(&vt->pi_desc);
/*
* IOMMU can write to PID.ON, so the barrier matters even on UP.
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
got_posted_interrupt =
- kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
+ kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
got_posted_interrupt = false;
@@ -6922,7 +6941,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
return max_irr;
}
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
return;
@@ -6933,14 +6952,6 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
-static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- pi_clear_on(&vmx->pi_desc);
- memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
-}
-
void vmx_do_interrupt_irqoff(unsigned long entry);
void vmx_do_nmi_irqoff(void);
@@ -6951,37 +6962,34 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
* MSR value is not clobbered by the host activity before the guest
* has chance to consume it.
*
- * Do not blindly read xfd_err here, since this exception might
- * be caused by L1 interception on a platform which doesn't
- * support xfd at all.
+ * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM
+ * interception may have been caused by L1 interception. Per the SDM,
+ * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1.
*
- * Do it conditionally upon guest_fpu::xfd. xfd_err matters
- * only when xfd contains a non-zero value.
- *
- * Queuing exception is done in vmx_handle_exit. See comment there.
+ * Note, XFD_ERR is updated _before_ the #NM interception check, i.e.
+ * unlike CR2 and DR6, the value is not a payload that is attached to
+ * the #NM exception.
*/
- if (vcpu->arch.guest_fpu.fpstate->xfd)
- rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+ if (is_xfd_nm_fault(vcpu))
+ rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
-static void handle_exception_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
{
- u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
-
/* if exit due to PF check for async PF */
if (is_page_fault(intr_info))
- vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
/* if exit due to NM, handle before interrupts are enabled */
else if (is_nm_fault(intr_info))
- handle_nm_fault_irqoff(&vmx->vcpu);
+ handle_nm_fault_irqoff(vcpu);
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
}
-static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
+ u32 intr_info)
{
- u32 intr_info = vmx_get_intr_info(vcpu);
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
@@ -6998,24 +7006,22 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
vcpu->arch.at_instruction_boundary = true;
}
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (vmx->emulation_required)
+ if (to_vt(vcpu)->emulation_required)
return;
- if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
- handle_external_interrupt_irqoff(vcpu);
- else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
- handle_exception_irqoff(vmx);
+ if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT)
+ handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
+ else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI)
+ handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
}
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
*/
-static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
@@ -7113,13 +7119,17 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
case INTR_TYPE_SOFT_EXCEPTION:
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
fallthrough;
- case INTR_TYPE_HARD_EXCEPTION:
- if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
- u32 err = vmcs_read32(error_code_field);
- kvm_requeue_exception_e(vcpu, vector, err);
- } else
- kvm_requeue_exception(vcpu, vector);
+ case INTR_TYPE_HARD_EXCEPTION: {
+ u32 error_code = 0;
+
+ if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
+ error_code = vmcs_read32(error_code_field);
+
+ kvm_requeue_exception(vcpu, vector,
+ idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
+ error_code);
break;
+ }
case INTR_TYPE_SOFT_INTR:
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
fallthrough;
@@ -7138,7 +7148,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
IDT_VECTORING_ERROR_CODE);
}
-static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
__vmx_complete_interrupts(vcpu,
vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
@@ -7214,7 +7224,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
return;
if (flags & VMX_RUN_SAVE_SPEC_CTRL)
- vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
+ vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL);
/*
* If the guest/host SPEC_CTRL values differ, restore the host value.
@@ -7225,7 +7235,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
*/
if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
vmx->spec_ctrl != hostval)
- native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
+ native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval);
barrier_nospec();
}
@@ -7238,19 +7248,35 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
* the fastpath even, all other exits must use the slow path.
*/
if (is_guest_mode(vcpu) &&
- to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+ vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER)
return EXIT_FASTPATH_NONE;
- switch (to_vmx(vcpu)->exit_reason.basic) {
+ switch (vmx_get_exit_reason(vcpu).basic) {
case EXIT_REASON_MSR_WRITE:
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+ case EXIT_REASON_HLT:
+ return handle_fastpath_hlt(vcpu);
default:
return EXIT_FASTPATH_NONE;
}
}
+noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu)
+{
+ if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI ||
+ !is_nmi(vmx_get_intr_info(vcpu)))
+ return;
+
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
+ else
+ vmx_do_nmi_irqoff();
+ kvm_after_interrupt(vcpu);
+}
+
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
unsigned int flags)
{
@@ -7263,10 +7289,14 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
* mitigation for MDS is done late in VMentry and is still
* executed in spite of L1D Flush. This is because an extra VERW
* should not matter much after the big hammer L1D Flush.
+ *
+ * cpu_buf_vm_clear is used when system is not vulnerable to MDS/TAA,
+ * and is affected by MMIO Stale Data. In such cases mitigation in only
+ * needed against an MMIO capable guest.
*/
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
- else if (static_branch_unlikely(&mmio_stale_data_clear) &&
+ else if (static_branch_unlikely(&cpu_buf_vm_clear) &&
kvm_arch_has_assigned_device(vcpu->kvm))
mds_clear_cpu_buffers();
@@ -7286,29 +7316,21 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vmx_enable_fb_clear(vmx);
if (unlikely(vmx->fail)) {
- vmx->exit_reason.full = 0xdead;
+ vmx->vt.exit_reason.full = 0xdead;
goto out;
}
- vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
- if (likely(!vmx->exit_reason.failed_vmentry))
+ vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry))
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
- is_nmi(vmx_get_intr_info(vcpu))) {
- kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
- if (cpu_feature_enabled(X86_FEATURE_FRED))
- fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
- else
- vmx_do_nmi_irqoff();
- kvm_after_interrupt(vcpu);
- }
+ vmx_handle_nmi(vcpu);
out:
guest_state_exit_irqoff();
}
-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7323,15 +7345,15 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
* start emulation until we arrive back to a valid state. Synthesize a
* consistency check VM-Exit due to invalid guest state and bail.
*/
- if (unlikely(vmx->emulation_required)) {
+ if (unlikely(vmx->vt.emulation_required)) {
vmx->fail = 0;
- vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
- vmx->exit_reason.failed_vmentry = 1;
+ vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE;
+ vmx->vt.exit_reason.failed_vmentry = 1;
kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
- vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
+ vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT;
kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
- vmx->exit_intr_info = 0;
+ vmx->vt.exit_intr_info = 0;
return EXIT_FASTPATH_NONE;
}
@@ -7373,10 +7395,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- set_debugreg(vcpu->arch.dr6, 6);
-
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -7412,8 +7430,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
}
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
- if (vmx->host_debugctlmsr)
- update_debugctlmsr(vmx->host_debugctlmsr);
+ if (vcpu->arch.host_debugctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
#ifndef CONFIG_X86_64
/*
@@ -7438,7 +7456,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
* checking.
*/
if (vmx->nested.nested_run_pending &&
- !vmx->exit_reason.failed_vmentry)
+ !vmx_get_exit_reason(vcpu).failed_vmentry)
++vcpu->stat.nested_run;
vmx->nested.nested_run_pending = 0;
@@ -7447,12 +7465,12 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
if (unlikely(vmx->fail))
return EXIT_FASTPATH_NONE;
- if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
+ if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
trace_kvm_exit(vcpu, KVM_ISA_VMX);
- if (unlikely(vmx->exit_reason.failed_vmentry))
+ if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry))
return EXIT_FASTPATH_NONE;
vmx->loaded_vmcs->launched = 1;
@@ -7463,7 +7481,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
-static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
+void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7472,9 +7490,10 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
free_vpid(vmx->vpid);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
+ free_page((unsigned long)vmx->ve_info);
}
-static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
+int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
@@ -7483,7 +7502,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu);
- INIT_LIST_HEAD(&vmx->pi_wakeup_list);
+ INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list);
err = -ENOMEM;
@@ -7565,9 +7584,23 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
goto free_vmcs;
}
+ err = -ENOMEM;
+ if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct page *page;
+
+ BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
+
+ /* ve_info must be page aligned. */
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!page)
+ goto free_vmcs;
+
+ vmx->ve_info = page_to_virt(page);
+ }
+
if (vmx_can_use_ipiv(vcpu))
WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
- __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
+ __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID);
return 0;
@@ -7583,7 +7616,7 @@ free_vpid:
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
-static int vmx_vm_init(struct kvm *kvm)
+int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
kvm->arch.pause_in_guest = true;
@@ -7594,6 +7627,7 @@ static int vmx_vm_init(struct kvm *kvm)
case L1TF_MITIGATION_FLUSH_NOWARN:
/* 'I explicitly don't care' is set */
break;
+ case L1TF_MITIGATION_AUTO:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
case L1TF_MITIGATION_FULL:
@@ -7611,44 +7645,37 @@ static int vmx_vm_init(struct kvm *kvm)
break;
}
}
+
+ if (enable_pml)
+ kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES;
return 0;
}
-static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+static inline bool vmx_ignore_guest_pat(struct kvm *kvm)
{
- /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
- * memory aliases with conflicting memory types and sometimes MCEs.
- * We have to be careful as to what are honored and when.
- *
- * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
- * UC. The effective memory type is UC or WC depending on guest PAT.
- * This was historically the source of MCEs and we want to be
- * conservative.
- *
- * When there is no need to deal with noncoherent DMA (e.g., no VT-d
- * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
- * EPT memory type is set to WB. The effective memory type is forced
- * WB.
- *
- * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The
- * EPT memory type is used to emulate guest CD/MTRR.
+ /*
+ * Non-coherent DMA devices need the guest to flush CPU properly.
+ * In that case it is not possible to map all guest RAM as WB, so
+ * always trust guest PAT.
*/
+ return !kvm_arch_has_noncoherent_dma(kvm) &&
+ kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT);
+}
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+{
+ /*
+ * Force UC for host MMIO regions, as allowing the guest to access MMIO
+ * with cacheable accesses will result in Machine Checks.
+ */
if (is_mmio)
return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ /* Force WB if ignoring guest PAT */
+ if (vmx_ignore_guest_pat(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
- if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
- if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
- else
- return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
- VMX_EPT_IPAT_BIT;
- }
-
- return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
}
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
@@ -7786,7 +7813,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
-static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7795,12 +7822,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
* to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
* set if and only if XSAVE is supported.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
-
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM);
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
+ guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);
vmx_setup_uret_msrs(vmx);
@@ -7808,7 +7831,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmcs_set_secondary_exec_control(vmx,
vmx_secondary_exec_control(vmx));
- if (guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
@@ -7817,25 +7840,25 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
- if (guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
nested_vmx_cr_fixed1_bits_update(vcpu);
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
- guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
+ guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
update_intel_pt_cfg(vcpu);
if (boot_cpu_has(X86_FEATURE_RTM)) {
struct vmx_uret_msr *msr;
msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (msr) {
- bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
+ bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
}
}
if (kvm_cpu_cap_has(X86_FEATURE_XFD))
vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
- !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
if (boot_cpu_has(X86_FEATURE_IBPB))
vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
@@ -7843,17 +7866,17 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
- !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
else
vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_SGX_LC_ENABLED;
else
@@ -7873,7 +7896,7 @@ static __init u64 vmx_get_perf_capabilities(void)
return 0;
if (boot_cpu_has(X86_FEATURE_PDCM))
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
+ rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
x86_perf_get_lbr(&vmx_lbr_caps);
@@ -7947,6 +7970,7 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
kvm_cpu_cap_clear(X86_FEATURE_SGX1);
kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
}
if (vmx_umip_emulated())
@@ -7967,66 +7991,86 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}
-static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info)
+static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ unsigned long *exit_qualification)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
unsigned short port;
- bool intercept;
int size;
+ bool imm;
+
+ /*
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+ */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
if (info->intercept == x86_intercept_in ||
info->intercept == x86_intercept_ins) {
port = info->src_val;
size = info->dst_bytes;
+ imm = info->src_type == OP_IMM;
} else {
port = info->dst_val;
size = info->src_bytes;
+ imm = info->dst_type == OP_IMM;
}
- /*
- * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
- * VM-exits depend on the 'unconditional IO exiting' VM-execution
- * control.
- *
- * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
- */
- if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
- intercept = nested_cpu_has(vmcs12,
- CPU_BASED_UNCOND_IO_EXITING);
- else
- intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
- /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
- return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+ *exit_qualification = ((unsigned long)port << 16) | (size - 1);
+
+ if (info->intercept == x86_intercept_ins ||
+ info->intercept == x86_intercept_outs)
+ *exit_qualification |= BIT(4);
+
+ if (info->rep_prefix)
+ *exit_qualification |= BIT(5);
+
+ if (imm)
+ *exit_qualification |= BIT(6);
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
}
-static int vmx_check_intercept(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info,
- enum x86_intercept_stage stage,
- struct x86_exception *exception)
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned long exit_qualification = 0;
+ u32 vm_exit_reason;
+ u64 exit_insn_len;
switch (info->intercept) {
- /*
- * RDPID causes #UD if disabled through secondary execution controls.
- * Because it is marked as EmulateOnUD, we need to intercept it here.
- * Note, RDPID is hidden behind ENABLE_RDTSCP.
- */
case x86_intercept_rdpid:
+ /*
+ * RDPID causes #UD if not enabled through secondary execution
+ * controls (ENABLE_RDTSCP). Note, the implicit MSR access to
+ * TSC_AUX is NOT subject to interception, i.e. checking only
+ * the dedicated execution control is architecturally correct.
+ */
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
exception->vector = UD_VECTOR;
exception->error_code_valid = false;
return X86EMUL_PROPAGATE_FAULT;
}
- break;
+ return X86EMUL_CONTINUE;
case x86_intercept_in:
case x86_intercept_ins:
case x86_intercept_out:
case x86_intercept_outs:
- return vmx_check_intercept_io(vcpu, info);
+ if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_IO_INSTRUCTION;
+ break;
case x86_intercept_lgdt:
case x86_intercept_lidt:
@@ -8039,7 +8083,24 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
return X86EMUL_CONTINUE;
- /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ if (info->intercept == x86_intercept_lldt ||
+ info->intercept == x86_intercept_ltr ||
+ info->intercept == x86_intercept_sldt ||
+ info->intercept == x86_intercept_str)
+ vm_exit_reason = EXIT_REASON_LDTR_TR;
+ else
+ vm_exit_reason = EXIT_REASON_GDTR_IDTR;
+ /*
+ * FIXME: Decode the ModR/M to generate the correct exit
+ * qualification for memory operands.
+ */
+ break;
+
+ case x86_intercept_hlt:
+ if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING))
+ return X86EMUL_CONTINUE;
+
+ vm_exit_reason = EXIT_REASON_HLT;
break;
case x86_intercept_pause:
@@ -8052,17 +8113,24 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
* the PAUSE.
*/
if ((info->rep_prefix != REPE_PREFIX) ||
- !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING))
+ !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
return X86EMUL_CONTINUE;
+ vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
break;
/* TODO: check more intercepts... */
default:
- break;
+ return X86EMUL_UNHANDLEABLE;
}
- return X86EMUL_UNHANDLEABLE;
+ exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
+ if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
+ return X86EMUL_UNHANDLEABLE;
+
+ __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
+ exit_insn_len);
+ return X86EMUL_INTERCEPTED;
}
#ifdef CONFIG_X86_64
@@ -8084,8 +8152,8 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
return 0;
}
-static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
- bool *expired)
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
{
struct vcpu_vmx *vmx;
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
@@ -8124,18 +8192,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
return 0;
}
-static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif
-static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
- if (!kvm_pause_in_guest(vcpu->kvm))
- shrink_ple_window(vcpu);
-}
-
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8159,7 +8221,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
-static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
@@ -8170,7 +8232,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
}
#ifdef CONFIG_KVM_SMM
-static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
/* we need a nested vmexit to enter SMM, postpone if run is pending */
if (to_vmx(vcpu)->nested.nested_run_pending)
@@ -8178,7 +8240,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !is_smm(vcpu);
}
-static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8199,7 +8261,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
return 0;
}
-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -8220,18 +8282,18 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
return 0;
}
-static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
#endif
-static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
}
-static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
+void vmx_migrate_timers(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
@@ -8241,7 +8303,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
}
}
-static void vmx_hardware_unsetup(void)
+void vmx_hardware_unsetup(void)
{
kvm_set_posted_intr_wakeup_handler(NULL);
@@ -8251,18 +8313,7 @@ static void vmx_hardware_unsetup(void)
free_kvm_area();
}
-#define VMX_REQUIRED_APICV_INHIBITS \
-( \
- BIT(APICV_INHIBIT_REASON_DISABLE)| \
- BIT(APICV_INHIBIT_REASON_ABSENT) | \
- BIT(APICV_INHIBIT_REASON_HYPERV) | \
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
- BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
-)
-
-static void vmx_vm_destroy(struct kvm *kvm)
+void vmx_vm_destroy(struct kvm *kvm)
{
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
@@ -8313,148 +8364,6 @@ gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags
return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
}
-static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .name = KBUILD_MODNAME,
-
- .check_processor_compatibility = vmx_check_processor_compat,
-
- .hardware_unsetup = vmx_hardware_unsetup,
-
- .hardware_enable = vmx_hardware_enable,
- .hardware_disable = vmx_hardware_disable,
- .has_emulated_msr = vmx_has_emulated_msr,
-
- .vm_size = sizeof(struct kvm_vmx),
- .vm_init = vmx_vm_init,
- .vm_destroy = vmx_vm_destroy,
-
- .vcpu_precreate = vmx_vcpu_precreate,
- .vcpu_create = vmx_vcpu_create,
- .vcpu_free = vmx_vcpu_free,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .update_exception_bitmap = vmx_update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .is_valid_cr0 = vmx_is_valid_cr0,
- .set_cr0 = vmx_set_cr0,
- .is_valid_cr4 = vmx_is_valid_cr4,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .set_dr7 = vmx_set_dr7,
- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
- .get_if_flag = vmx_get_if_flag,
-
- .flush_tlb_all = vmx_flush_tlb_all,
- .flush_tlb_current = vmx_flush_tlb_current,
- .flush_tlb_gva = vmx_flush_tlb_gva,
- .flush_tlb_guest = vmx_flush_tlb_guest,
-
- .vcpu_pre_run = vmx_vcpu_pre_run,
- .vcpu_run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = vmx_skip_emulated_instruction,
- .update_emulated_instruction = vmx_update_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .inject_irq = vmx_inject_irq,
- .inject_nmi = vmx_inject_nmi,
- .inject_exception = vmx_inject_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = vmx_enable_nmi_window,
- .enable_irq_window = vmx_enable_irq_window,
- .update_cr8_intercept = vmx_update_cr8_intercept,
- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vmx_load_eoi_exitmap,
- .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
- .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
- .hwapic_irr_update = vmx_hwapic_irr_update,
- .hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
- .sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_interrupt = vmx_deliver_interrupt,
- .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
-
- .set_tss_addr = vmx_set_tss_addr,
- .set_identity_map_addr = vmx_set_identity_map_addr,
- .get_mt_mask = vmx_get_mt_mask,
-
- .get_exit_info = vmx_get_exit_info,
-
- .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
- .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
- .write_tsc_offset = vmx_write_tsc_offset,
- .write_tsc_multiplier = vmx_write_tsc_multiplier,
-
- .load_mmu_pgd = vmx_load_mmu_pgd,
-
- .check_intercept = vmx_check_intercept,
- .handle_exit_irqoff = vmx_handle_exit_irqoff,
-
- .sched_in = vmx_sched_in,
-
- .cpu_dirty_log_size = PML_ENTITY_NUM,
- .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
-
- .nested_ops = &vmx_nested_ops,
-
- .pi_update_irte = vmx_pi_update_irte,
- .pi_start_assignment = vmx_pi_start_assignment,
-
-#ifdef CONFIG_X86_64
- .set_hv_timer = vmx_set_hv_timer,
- .cancel_hv_timer = vmx_cancel_hv_timer,
-#endif
-
- .setup_mce = vmx_setup_mce,
-
-#ifdef CONFIG_KVM_SMM
- .smi_allowed = vmx_smi_allowed,
- .enter_smm = vmx_enter_smm,
- .leave_smm = vmx_leave_smm,
- .enable_smi_window = vmx_enable_smi_window,
-#endif
-
- .check_emulate_instruction = vmx_check_emulate_instruction,
- .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
- .migrate_timers = vmx_migrate_timers,
-
- .msr_filter_changed = vmx_msr_filter_changed,
- .complete_emulated_msr = kvm_complete_insn_gp,
-
- .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
-
- .get_untagged_addr = vmx_get_untagged_addr,
-};
-
static unsigned int vmx_handle_intel_pt_intr(void)
{
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
@@ -8500,18 +8409,16 @@ static void __init vmx_setup_me_spte_mask(void)
u64 me_mask = 0;
/*
- * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use
- * the former to avoid exposing shadow_phys_bits.
- *
* On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
- * shadow_phys_bits. On MKTME and/or TDX capable systems,
+ * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems,
* boot_cpu_data.x86_phys_bits holds the actual physical address
- * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR
- * reported by CPUID. Those bits between are KeyID bits.
+ * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
+ * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits.
*/
- if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
+ if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
- kvm_get_shadow_phys_bits() - 1);
+ kvm_host.maxphyaddr - 1);
+
/*
* Unlike SME, host kernel doesn't support setting up any
* MKTME KeyID on Intel platforms. No memory encryption
@@ -8520,9 +8427,7 @@ static void __init vmx_setup_me_spte_mask(void)
kvm_mmu_set_me_spte_mask(0, me_mask);
}
-static struct kvm_x86_init_ops vmx_init_ops __initdata;
-
-static __init int hardware_setup(void)
+__init int vmx_hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
@@ -8536,15 +8441,11 @@ static __init int hardware_setup(void)
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO;
- if (cpu_has_perf_global_ctrl_bug())
- pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
- "does not work properly. Using workaround\n");
-
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
if (boot_cpu_has(X86_FEATURE_MPX)) {
- rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
+ rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs);
WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
}
@@ -8591,16 +8492,16 @@ static __init int hardware_setup(void)
* using the APIC_ACCESS_ADDR VMCS field.
*/
if (!flexpriority_enabled)
- vmx_x86_ops.set_apic_access_page_addr = NULL;
+ vt_x86_ops.set_apic_access_page_addr = NULL;
if (!cpu_has_vmx_tpr_shadow())
- vmx_x86_ops.update_cr8_intercept = NULL;
+ vt_x86_ops.update_cr8_intercept = NULL;
#if IS_ENABLED(CONFIG_HYPERV)
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
&& enable_ept) {
- vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
- vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
+ vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
#endif
@@ -8615,7 +8516,7 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_apicv())
enable_apicv = 0;
if (!enable_apicv)
- vmx_x86_ops.sync_pir_to_irr = NULL;
+ vt_x86_ops.sync_pir_to_irr = NULL;
if (!enable_apicv || !cpu_has_vmx_ipiv())
enable_ipiv = false;
@@ -8633,6 +8534,8 @@ static __init int hardware_setup(void)
if (enable_ept)
kvm_mmu_set_ept_masks(enable_ept_ad_bits,
cpu_has_vmx_ept_execute_only());
+ else
+ vt_x86_ops.get_mt_mask = NULL;
/*
* Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
@@ -8650,9 +8553,6 @@ static __init int hardware_setup(void)
if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
enable_pml = 0;
- if (!enable_pml)
- vmx_x86_ops.cpu_dirty_log_size = 0;
-
if (!cpu_has_vmx_preemption_timer())
enable_preemption_timer = false;
@@ -8660,7 +8560,7 @@ static __init int hardware_setup(void)
u64 use_timer_freq = 5000ULL * 1000 * 1000;
cpu_preemption_timer_multi =
- vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ vmx_misc_preemption_timer_rate(vmcs_config.misc);
if (tsc_khz)
use_timer_freq = (u64)tsc_khz * 1000;
@@ -8676,8 +8576,8 @@ static __init int hardware_setup(void)
}
if (!enable_preemption_timer) {
- vmx_x86_ops.set_hv_timer = NULL;
- vmx_x86_ops.cancel_hv_timer = NULL;
+ vt_x86_ops.set_hv_timer = NULL;
+ vt_x86_ops.cancel_hv_timer = NULL;
}
kvm_caps.supported_mce_cap |= MCG_LMCE_P;
@@ -8688,9 +8588,9 @@ static __init int hardware_setup(void)
if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
if (pt_mode == PT_MODE_HOST_GUEST)
- vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
else
- vmx_init_ops.handle_intel_pt_intr = NULL;
+ vt_init_ops.handle_intel_pt_intr = NULL;
setup_default_sgx_lepubkeyhash();
@@ -8710,17 +8610,30 @@ static __init int hardware_setup(void)
kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+ /*
+ * On Intel CPUs that lack self-snoop feature, letting the guest control
+ * memory types may result in unexpected behavior. So always ignore guest
+ * PAT on those CPUs and map VM as writeback, not allowing userspace to
+ * disable the quirk.
+ *
+ * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is
+ * supported, UC is slow enough to cause issues with some older guests (e.g.
+ * an old version of bochs driver uses ioremap() instead of ioremap_wc() to
+ * map the video RAM, causing wayland desktop to fail to get started
+ * correctly). To avoid breaking those older guests that rely on KVM to force
+ * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the
+ * safer (for performance) default behavior.
+ *
+ * On top of this, non-coherent DMA devices need the guest to flush CPU
+ * caches properly. This also requires honoring guest PAT, and is forced
+ * independent of the quirk in vmx_ignore_guest_pat().
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+ kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
return r;
}
-static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- .hardware_setup = hardware_setup,
- .handle_intel_pt_intr = NULL,
-
- .runtime_ops = &vmx_x86_ops,
- .pmu_ops = &intel_pmu_ops,
-};
-
static void vmx_cleanup_l1d_flush(void)
{
if (vmx_l1d_flush_pages) {
@@ -8731,25 +8644,16 @@ static void vmx_cleanup_l1d_flush(void)
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
-static void __vmx_exit(void)
+void vmx_exit(void)
{
allow_smaller_maxphyaddr = false;
- cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
-
vmx_cleanup_l1d_flush();
-}
-static void vmx_exit(void)
-{
- kvm_exit();
kvm_x86_vendor_exit();
-
- __vmx_exit();
}
-module_exit(vmx_exit);
-static int __init vmx_init(void)
+int __init vmx_init(void)
{
int r, cpu;
@@ -8762,7 +8666,7 @@ static int __init vmx_init(void)
*/
hv_init_evmcs();
- r = kvm_x86_vendor_init(&vmx_init_ops);
+ r = kvm_x86_vendor_init(&vt_init_ops);
if (r)
return r;
@@ -8783,8 +8687,6 @@ static int __init vmx_init(void)
pi_init_cpu(cpu);
}
- cpu_emergency_register_virt_callback(vmx_emergency_disable);
-
vmx_check_vmcs12_offsets();
/*
@@ -8795,21 +8697,9 @@ static int __init vmx_init(void)
if (!enable_ept)
allow_smaller_maxphyaddr = true;
- /*
- * Common KVM initialization _must_ come last, after this, /dev/kvm is
- * exposed to userspace!
- */
- r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
- THIS_MODULE);
- if (r)
- goto err_kvm_init;
-
return 0;
-err_kvm_init:
- __vmx_exit();
err_l1d_flush:
kvm_x86_vendor_exit();
return r;
}
-module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 90f9e4434646..6d1e40ecc024 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -7,19 +7,17 @@
#include <asm/kvm.h>
#include <asm/intel_pt.h>
#include <asm/perf_event.h>
+#include <asm/posted_intr.h>
#include "capabilities.h"
#include "../kvm_cache_regs.h"
-#include "posted_intr.h"
+#include "pmu_intel.h"
#include "vmcs.h"
#include "vmx_ops.h"
#include "../cpuid.h"
#include "run_flags.h"
#include "../mmu.h"
-
-#define MSR_TYPE_R 1
-#define MSR_TYPE_W 2
-#define MSR_TYPE_RW 3
+#include "common.h"
#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
@@ -71,47 +69,6 @@ struct pt_desc {
struct pt_ctx guest;
};
-union vmx_exit_reason {
- struct {
- u32 basic : 16;
- u32 reserved16 : 1;
- u32 reserved17 : 1;
- u32 reserved18 : 1;
- u32 reserved19 : 1;
- u32 reserved20 : 1;
- u32 reserved21 : 1;
- u32 reserved22 : 1;
- u32 reserved23 : 1;
- u32 reserved24 : 1;
- u32 reserved25 : 1;
- u32 bus_lock_detected : 1;
- u32 enclave_mode : 1;
- u32 smi_pending_mtf : 1;
- u32 smi_from_vmx_root : 1;
- u32 reserved30 : 1;
- u32 failed_vmentry : 1;
- };
- u32 full;
-};
-
-struct lbr_desc {
- /* Basic info about guest LBR records. */
- struct x86_pmu_lbr records;
-
- /*
- * Emulate LBR feature via passthrough LBR registers when the
- * per-vcpu guest LBR event is scheduled on the current pcpu.
- *
- * The records may be inaccurate if the host reclaims the LBR.
- */
- struct perf_event *event;
-
- /* True if LBRs are marked as not intercepted in the MSR bitmap */
- bool msr_passthrough;
-};
-
-extern struct x86_pmu_lbr vmx_lbr_caps;
-
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -180,6 +137,7 @@ struct nested_vmx {
bool reload_vmcs01_apic_access_page;
bool update_vmcs01_cpu_dirty_logging;
bool update_vmcs01_apicv_status;
+ bool update_vmcs01_hwapic_isr;
/*
* Enlightened VMCS has been enabled. It does not mean that L1 has to
@@ -204,8 +162,6 @@ struct nested_vmx {
struct kvm_host_map virtual_apic_map;
struct kvm_host_map pi_desc_map;
- struct kvm_host_map msr_bitmap_map;
-
struct pi_desc *pi_desc;
bool pi_pending;
u16 posted_intr_nv;
@@ -253,20 +209,10 @@ struct nested_vmx {
struct vcpu_vmx {
struct kvm_vcpu vcpu;
+ struct vcpu_vt vt;
u8 fail;
u8 x2apic_msr_bitmap_mode;
- /*
- * If true, host state has been stored in vmx->loaded_vmcs for
- * the CPU registers that only need to be switched when transitioning
- * to/from the kernel, and the registers have been loaded with guest
- * values. If false, host state is loaded in the CPU registers
- * and vmx->loaded_vmcs->host_state is invalid.
- */
- bool guest_state_loaded;
-
- unsigned long exit_qualification;
- u32 exit_intr_info;
u32 idt_vectoring_info;
ulong rflags;
@@ -279,7 +225,6 @@ struct vcpu_vmx {
struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
bool guest_uret_msrs_loaded;
#ifdef CONFIG_X86_64
- u64 msr_host_kernel_gs_base;
u64 msr_guest_kernel_gs_base;
#endif
@@ -318,15 +263,6 @@ struct vcpu_vmx {
} seg[8];
} segment_cache;
int vpid;
- bool emulation_required;
-
- union vmx_exit_reason exit_reason;
-
- /* Posted interrupt descriptor */
- struct pi_desc pi_desc;
-
- /* Used if this vCPU is waiting for PI notification wakeup. */
- struct list_head pi_wakeup_list;
/* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested;
@@ -336,14 +272,15 @@ struct vcpu_vmx {
bool ple_window_dirty;
/* Support for PML */
-#define PML_ENTITY_NUM 512
+#define PML_LOG_NR_ENTRIES 512
+ /* PML is written backwards: this is the first entry written by the CPU */
+#define PML_HEAD_INDEX (PML_LOG_NR_ENTRIES-1)
+
struct page *pml_pg;
/* apic deadline value in host tsc */
u64 hv_deadline_tsc;
- unsigned long host_debugctlmsr;
-
/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
* msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
@@ -365,6 +302,9 @@ struct vcpu_vmx {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
} shadow_msr_intercept;
+
+ /* ve_info must be page aligned. */
+ struct vmx_ve_information *ve_info;
};
struct kvm_vmx {
@@ -377,6 +317,43 @@ struct kvm_vmx {
u64 *pid_table;
};
+static __always_inline struct vcpu_vt *to_vt(struct kvm_vcpu *vcpu)
+{
+ return &(container_of(vcpu, struct vcpu_vmx, vcpu)->vt);
+}
+
+static __always_inline struct kvm_vcpu *vt_to_vcpu(struct vcpu_vt *vt)
+{
+ return &(container_of(vt, struct vcpu_vmx, vt)->vcpu);
+}
+
+static __always_inline union vmx_exit_reason vmx_get_exit_reason(struct kvm_vcpu *vcpu)
+{
+ return to_vt(vcpu)->exit_reason;
+}
+
+static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1) &&
+ !WARN_ON_ONCE(is_td_vcpu(vcpu)))
+ vt->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ return vt->exit_qualification;
+}
+
+static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vt *vt = to_vt(vcpu);
+
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2) &&
+ !WARN_ON_ONCE(is_td_vcpu(vcpu)))
+ vt->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ return vt->exit_intr_info;
+}
+
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy);
int allocate_vpid(void);
@@ -386,6 +363,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
unsigned long fs_base, unsigned long gs_base);
int vmx_get_cpl(struct kvm_vcpu *vcpu);
+int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu);
bool vmx_emulation_required(struct kvm_vcpu *vcpu);
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -403,6 +381,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
@@ -577,7 +556,8 @@ static inline u8 vmx_get_rvi(void)
SECONDARY_EXEC_ENABLE_VMFUNC | \
SECONDARY_EXEC_BUS_LOCK_DETECTION | \
SECONDARY_EXEC_NOTIFY_VM_EXITING | \
- SECONDARY_EXEC_ENCLS_EXITING)
+ SECONDARY_EXEC_ENCLS_EXITING | \
+ SECONDARY_EXEC_EPT_VIOLATION_VE)
#define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \
@@ -660,45 +640,10 @@ static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
-static inline struct lbr_desc *vcpu_to_lbr_desc(struct kvm_vcpu *vcpu)
-{
- return &to_vmx(vcpu)->lbr_desc;
-}
-
-static inline struct x86_pmu_lbr *vcpu_to_lbr_records(struct kvm_vcpu *vcpu)
-{
- return &vcpu_to_lbr_desc(vcpu)->records;
-}
-
-static inline bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu)
-{
- return !!vcpu_to_lbr_records(vcpu)->nr;
-}
-
void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu);
int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu);
-static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1))
- vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- return vmx->exit_qualification;
-}
-
-static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2))
- vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- return vmx->exit_intr_info;
-}
-
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
void free_vmcs(struct vmcs *vmcs);
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
@@ -723,7 +668,7 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
return true;
return allow_smaller_maxphyaddr &&
- cpuid_maxphyaddr(vcpu) < kvm_get_shadow_phys_bits();
+ cpuid_maxphyaddr(vcpu) < kvm_host.maxphyaddr;
}
static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
@@ -751,4 +696,12 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && enable_ipiv;
}
+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
+int vmx_init(void);
+void vmx_exit(void);
+
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h
index eb48153bfd73..cdf8cbb69209 100644
--- a/arch/x86/kvm/vmx/vmx_onhyperv.h
+++ b/arch/x86/kvm/vmx/vmx_onhyperv.h
@@ -3,7 +3,7 @@
#ifndef __ARCH_X86_KVM_VMX_ONHYPERV_H__
#define __ARCH_X86_KVM_VMX_ONHYPERV_H__
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
#include <linux/jump_label.h>
@@ -104,6 +104,14 @@ static inline void evmcs_load(u64 phys_addr)
struct hv_vp_assist_page *vp_ap =
hv_get_vp_assist_page(smp_processor_id());
+ /*
+ * When enabling eVMCS, KVM verifies that every CPU has a valid hv_vp_assist_page()
+ * and aborts enabling the feature otherwise. CPU onlining path is also checked in
+ * vmx_hardware_enable().
+ */
+ if (KVM_BUG_ON(!vp_ap, kvm_get_running_vcpu()->kvm))
+ return;
+
if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
vp_ap->nested_control.features.directhypercall = 1;
vp_ap->current_nested_vmcs = phys_addr;
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 8060e5fc6dbd..96677576c836 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -15,7 +15,7 @@ void vmwrite_error(unsigned long field, unsigned long value);
void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
-void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
+void invept_error(unsigned long ext, u64 eptp);
#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/*
@@ -47,7 +47,7 @@ static __always_inline void vmcs_check16(unsigned long field)
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
"16-bit accessor invalid for 64-bit high field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
- "16-bit accessor invalid for 32-bit high field");
+ "16-bit accessor invalid for 32-bit field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
"16-bit accessor invalid for natural width field");
}
@@ -118,7 +118,7 @@ do_exception:
#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
- asm volatile("1: vmread %2, %1\n\t"
+ asm volatile("1: vmread %[field], %[output]\n\t"
".byte 0x3e\n\t" /* branch taken hint */
"ja 3f\n\t"
@@ -127,24 +127,26 @@ do_exception:
* @field, and bounce through the trampoline to preserve
* volatile registers.
*/
- "xorl %k1, %k1\n\t"
+ "xorl %k[output], %k[output]\n\t"
"2:\n\t"
- "push %1\n\t"
- "push %2\n\t"
+ "push %[output]\n\t"
+ "push %[field]\n\t"
"call vmread_error_trampoline\n\t"
/*
* Unwind the stack. Note, the trampoline zeros out the
* memory for @fault so that the result is '0' on error.
*/
- "pop %2\n\t"
- "pop %1\n\t"
+ "pop %[field]\n\t"
+ "pop %[output]\n\t"
"3:\n\t"
/* VMREAD faulted. As above, except push '1' for @fault. */
- _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1)
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[output])
- : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc");
+ : ASM_CALL_CONSTRAINT, [output] "=&r" (value)
+ : [field] "r" (field)
+ : "cc");
return value;
#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
@@ -312,13 +314,13 @@ static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva);
}
-static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
+static inline void __invept(unsigned long ext, u64 eptp)
{
struct {
- u64 eptp, gpa;
- } operand = {eptp, gpa};
-
- vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
+ u64 eptp;
+ u64 reserved_0;
+ } operand = { eptp, 0 };
+ vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp);
}
static inline void vpid_sync_vcpu_single(int vpid)
@@ -355,13 +357,13 @@ static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr)
static inline void ept_sync_global(void)
{
- __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0);
}
static inline void ept_sync_context(u64 eptp)
{
if (cpu_has_vmx_invept_context())
- __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp);
else
ept_sync_global();
}
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
new file mode 100644
index 000000000000..6bf8be570b2e
--- /dev/null
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -0,0 +1,233 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_X86_OPS_H
+#define __KVM_X86_VMX_X86_OPS_H
+
+#include <linux/kvm_host.h>
+
+#include "x86.h"
+
+__init int vmx_hardware_setup(void);
+
+extern struct kvm_x86_ops vt_x86_ops __initdata;
+extern struct kvm_x86_init_ops vt_init_ops __initdata;
+
+void vmx_hardware_unsetup(void);
+int vmx_check_processor_compat(void);
+int vmx_enable_virtualization_cpu(void);
+void vmx_disable_virtualization_cpu(void);
+void vmx_emergency_disable_virtualization_cpu(void);
+int vmx_vm_init(struct kvm *kvm);
+void vmx_vm_destroy(struct kvm *kvm);
+int vmx_vcpu_precreate(struct kvm *kvm);
+int vmx_vcpu_create(struct kvm_vcpu *vcpu);
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu);
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+void vmx_vcpu_free(struct kvm_vcpu *vcpu);
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_put(struct kvm_vcpu *vcpu);
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath);
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu);
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu);
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+#ifdef CONFIG_KVM_SMM
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram);
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu);
+#endif
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception);
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu);
+void vmx_migrate_timers(struct kvm_vcpu *vcpu);
+void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu);
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector);
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index);
+void vmx_msr_filter_changed(struct kvm_vcpu *vcpu);
+void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
+int vmx_get_feature_msr(u32 msr, u64 *data);
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
+void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int vmx_get_cpl(struct kvm_vcpu *vcpu);
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val);
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
+void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr);
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu);
+void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
+u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall);
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected);
+void vmx_inject_nmi(struct kvm_vcpu *vcpu);
+void vmx_inject_exception(struct kvm_vcpu *vcpu);
+void vmx_cancel_injection(struct kvm_vcpu *vcpu);
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu);
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu);
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr);
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu);
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr);
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code);
+
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu);
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_X86_64
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired);
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
+#endif
+void vmx_setup_mce(struct kvm_vcpu *vcpu);
+
+#ifdef CONFIG_KVM_INTEL_TDX
+void tdx_disable_virtualization_cpu(void);
+int tdx_vm_init(struct kvm *kvm);
+void tdx_mmu_release_hkid(struct kvm *kvm);
+void tdx_vm_destroy(struct kvm *kvm);
+int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
+
+int tdx_vcpu_create(struct kvm_vcpu *vcpu);
+void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+void tdx_vcpu_free(struct kvm_vcpu *vcpu);
+void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu);
+fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void tdx_vcpu_put(struct kvm_vcpu *vcpu);
+bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int tdx_handle_exit(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion fastpath);
+
+void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector);
+void tdx_inject_nmi(struct kvm_vcpu *vcpu);
+void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
+bool tdx_has_emulated_msr(u32 index);
+int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
+int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
+
+int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt);
+int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, void *private_spt);
+int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn);
+int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level, kvm_pfn_t pfn);
+
+void tdx_flush_tlb_current(struct kvm_vcpu *vcpu);
+void tdx_flush_tlb_all(struct kvm_vcpu *vcpu);
+void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn);
+#else
+static inline void tdx_disable_virtualization_cpu(void) {}
+static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
+static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
+static inline void tdx_vm_destroy(struct kvm *kvm) {}
+static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
+
+static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
+static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
+static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
+static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {}
+static inline int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
+static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+{
+ return EXIT_FASTPATH_NONE;
+}
+static inline void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) {}
+static inline void tdx_vcpu_put(struct kvm_vcpu *vcpu) {}
+static inline bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu) { return false; }
+static inline int tdx_handle_exit(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion fastpath) { return 0; }
+
+static inline void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector) {}
+static inline void tdx_inject_nmi(struct kvm_vcpu *vcpu) {}
+static inline void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1,
+ u64 *info2, u32 *intr_info, u32 *error_code) {}
+static inline bool tdx_has_emulated_msr(u32 index) { return false; }
+static inline int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; }
+static inline int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { return 1; }
+
+static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
+
+static inline int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level,
+ void *private_spt)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level,
+ void *private_spt)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level,
+ kvm_pfn_t pfn)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+ enum pg_level level,
+ kvm_pfn_t pfn)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void tdx_flush_tlb_current(struct kvm_vcpu *vcpu) {}
+static inline void tdx_flush_tlb_all(struct kvm_vcpu *vcpu) {}
+static inline void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) {}
+static inline int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn) { return 0; }
+#endif
+
+#endif /* __KVM_X86_VMX_X86_OPS_H */