diff options
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r-- | arch/x86/kvm/vmx/capabilities.h | 10 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/hyperv.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/hyperv.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/hyperv_evmcs.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/main.c | 170 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 465 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.h | 30 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/pmu_intel.c | 60 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/posted_intr.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/posted_intr.h | 101 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/sgx.c | 21 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmcs12.h | 14 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmenter.S | 2 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 1279 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 35 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx_onhyperv.h | 10 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx_ops.h | 36 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/x86_ops.h | 124 |
19 files changed, 1400 insertions, 971 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 41a4533f9989..cb6588238f46 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -54,9 +54,7 @@ struct nested_vmx_msrs { }; struct vmcs_config { - int size; - u32 basic_cap; - u32 revision_id; + u64 basic; u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; u32 cpu_based_2nd_exec_ctrl; @@ -76,7 +74,7 @@ extern struct vmx_capability vmx_capability __ro_after_init; static inline bool cpu_has_vmx_basic_inout(void) { - return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); + return vmcs_config.basic & VMX_BASIC_INOUT; } static inline bool cpu_has_virtual_nmis(void) @@ -225,7 +223,7 @@ static inline bool cpu_has_vmx_vmfunc(void) static inline bool cpu_has_vmx_shadow_vmcs(void) { /* check if the cpu supports writing r/o exit information fields */ - if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) + if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) return false; return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -367,7 +365,7 @@ static inline bool cpu_has_vmx_invvpid_global(void) static inline bool cpu_has_vmx_intel_pt(void) { - return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) && + return (vmcs_config.misc & VMX_MISC_INTEL_PT) && (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c index fab6a1ad98dc..fa41d036acd4 100644 --- a/arch/x86/kvm/vmx/hyperv.c +++ b/arch/x86/kvm/vmx/hyperv.c @@ -4,6 +4,7 @@ #include <linux/errno.h> #include <linux/smp.h> +#include "x86.h" #include "../cpuid.h" #include "hyperv.h" #include "nested.h" diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h index a87407412615..11a339009781 100644 --- a/arch/x86/kvm/vmx/hyperv.h +++ b/arch/x86/kvm/vmx/hyperv.h @@ -42,7 +42,7 @@ static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx) return vmx->nested.hv_evmcs; } -static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) +static inline bool guest_cpu_cap_has_evmcs(struct kvm_vcpu *vcpu) { /* * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h index a543fccfc574..6536290f4274 100644 --- a/arch/x86/kvm/vmx/hyperv_evmcs.h +++ b/arch/x86/kvm/vmx/hyperv_evmcs.h @@ -6,7 +6,7 @@ #ifndef __KVM_X86_VMX_HYPERV_EVMCS_H #define __KVM_X86_VMX_HYPERV_EVMCS_H -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include "capabilities.h" #include "vmcs12.h" diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c new file mode 100644 index 000000000000..43ee9ed11291 --- /dev/null +++ b/arch/x86/kvm/vmx/main.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/moduleparam.h> + +#include "x86_ops.h" +#include "vmx.h" +#include "nested.h" +#include "pmu.h" +#include "posted_intr.h" + +#define VMX_REQUIRED_APICV_INHIBITS \ + (BIT(APICV_INHIBIT_REASON_DISABLED) | \ + BIT(APICV_INHIBIT_REASON_ABSENT) | \ + BIT(APICV_INHIBIT_REASON_HYPERV) | \ + BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ + BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ + BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ + BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED)) + +struct kvm_x86_ops vt_x86_ops __initdata = { + .name = KBUILD_MODNAME, + + .check_processor_compatibility = vmx_check_processor_compat, + + .hardware_unsetup = vmx_hardware_unsetup, + + .enable_virtualization_cpu = vmx_enable_virtualization_cpu, + .disable_virtualization_cpu = vmx_disable_virtualization_cpu, + .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu, + + .has_emulated_msr = vmx_has_emulated_msr, + + .vm_size = sizeof(struct kvm_vmx), + .vm_init = vmx_vm_init, + .vm_destroy = vmx_vm_destroy, + + .vcpu_precreate = vmx_vcpu_precreate, + .vcpu_create = vmx_vcpu_create, + .vcpu_free = vmx_vcpu_free, + .vcpu_reset = vmx_vcpu_reset, + + .prepare_switch_to_guest = vmx_prepare_switch_to_guest, + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + + .update_exception_bitmap = vmx_update_exception_bitmap, + .get_feature_msr = vmx_get_feature_msr, + .get_msr = vmx_get_msr, + .set_msr = vmx_set_msr, + .get_segment_base = vmx_get_segment_base, + .get_segment = vmx_get_segment, + .set_segment = vmx_set_segment, + .get_cpl = vmx_get_cpl, + .get_cpl_no_cache = vmx_get_cpl_no_cache, + .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .is_valid_cr0 = vmx_is_valid_cr0, + .set_cr0 = vmx_set_cr0, + .is_valid_cr4 = vmx_is_valid_cr4, + .set_cr4 = vmx_set_cr4, + .set_efer = vmx_set_efer, + .get_idt = vmx_get_idt, + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, + .set_dr6 = vmx_set_dr6, + .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, + .cache_reg = vmx_cache_reg, + .get_rflags = vmx_get_rflags, + .set_rflags = vmx_set_rflags, + .get_if_flag = vmx_get_if_flag, + + .flush_tlb_all = vmx_flush_tlb_all, + .flush_tlb_current = vmx_flush_tlb_current, + .flush_tlb_gva = vmx_flush_tlb_gva, + .flush_tlb_guest = vmx_flush_tlb_guest, + + .vcpu_pre_run = vmx_vcpu_pre_run, + .vcpu_run = vmx_vcpu_run, + .handle_exit = vmx_handle_exit, + .skip_emulated_instruction = vmx_skip_emulated_instruction, + .update_emulated_instruction = vmx_update_emulated_instruction, + .set_interrupt_shadow = vmx_set_interrupt_shadow, + .get_interrupt_shadow = vmx_get_interrupt_shadow, + .patch_hypercall = vmx_patch_hypercall, + .inject_irq = vmx_inject_irq, + .inject_nmi = vmx_inject_nmi, + .inject_exception = vmx_inject_exception, + .cancel_injection = vmx_cancel_injection, + .interrupt_allowed = vmx_interrupt_allowed, + .nmi_allowed = vmx_nmi_allowed, + .get_nmi_mask = vmx_get_nmi_mask, + .set_nmi_mask = vmx_set_nmi_mask, + .enable_nmi_window = vmx_enable_nmi_window, + .enable_irq_window = vmx_enable_irq_window, + .update_cr8_intercept = vmx_update_cr8_intercept, + + .x2apic_icr_is_split = false, + .set_virtual_apic_mode = vmx_set_virtual_apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, + .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, + .load_eoi_exitmap = vmx_load_eoi_exitmap, + .apicv_pre_state_restore = vmx_apicv_pre_state_restore, + .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, + .hwapic_isr_update = vmx_hwapic_isr_update, + .sync_pir_to_irr = vmx_sync_pir_to_irr, + .deliver_interrupt = vmx_deliver_interrupt, + .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, + + .set_tss_addr = vmx_set_tss_addr, + .set_identity_map_addr = vmx_set_identity_map_addr, + .get_mt_mask = vmx_get_mt_mask, + + .get_exit_info = vmx_get_exit_info, + .get_entry_info = vmx_get_entry_info, + + .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, + + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + + .get_l2_tsc_offset = vmx_get_l2_tsc_offset, + .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, + .write_tsc_offset = vmx_write_tsc_offset, + .write_tsc_multiplier = vmx_write_tsc_multiplier, + + .load_mmu_pgd = vmx_load_mmu_pgd, + + .check_intercept = vmx_check_intercept, + .handle_exit_irqoff = vmx_handle_exit_irqoff, + + .cpu_dirty_log_size = PML_LOG_NR_ENTRIES, + .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, + + .nested_ops = &vmx_nested_ops, + + .pi_update_irte = vmx_pi_update_irte, + .pi_start_assignment = vmx_pi_start_assignment, + +#ifdef CONFIG_X86_64 + .set_hv_timer = vmx_set_hv_timer, + .cancel_hv_timer = vmx_cancel_hv_timer, +#endif + + .setup_mce = vmx_setup_mce, + +#ifdef CONFIG_KVM_SMM + .smi_allowed = vmx_smi_allowed, + .enter_smm = vmx_enter_smm, + .leave_smm = vmx_leave_smm, + .enable_smi_window = vmx_enable_smi_window, +#endif + + .check_emulate_instruction = vmx_check_emulate_instruction, + .apic_init_signal_blocked = vmx_apic_init_signal_blocked, + .migrate_timers = vmx_migrate_timers, + + .msr_filter_changed = vmx_msr_filter_changed, + .complete_emulated_msr = kvm_complete_insn_gp, + + .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, + + .get_untagged_addr = vmx_get_untagged_addr, +}; + +struct kvm_x86_init_ops vt_init_ops __initdata = { + .hardware_setup = vmx_hardware_setup, + .handle_intel_pt_intr = NULL, + + .runtime_ops = &vt_x86_ops, + .pmu_ops = &intel_pmu_ops, +}; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index d05ddf751491..5504d9e9fd32 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7,15 +7,16 @@ #include <asm/debugreg.h> #include <asm/mmu_context.h> +#include "x86.h" #include "cpuid.h" #include "hyperv.h" #include "mmu.h" #include "nested.h" #include "pmu.h" +#include "posted_intr.h" #include "sgx.h" #include "trace.h" #include "vmx.h" -#include "x86.h" #include "smm.h" static bool __read_mostly enable_shadow_vmcs = 1; @@ -230,11 +231,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - if (nested_vmx_is_evmptr12_valid(vmx)) { - kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); - vmx->nested.hv_evmcs = NULL; - } - + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); + vmx->nested.hv_evmcs = NULL; vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; if (hv_vcpu) { @@ -259,7 +257,7 @@ static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) * state. It is possible that the area will stay mapped as * vmx->nested.hv_evmcs but this shouldn't be a problem. */ - if (!guest_cpuid_has_evmcs(vcpu) || + if (!guest_cpu_cap_has_evmcs(vcpu) || !evmptr_is_valid(nested_get_evmptr(vcpu))) return false; @@ -316,6 +314,16 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vcpu->arch.regs_dirty = 0; } +static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); + vmx->nested.pi_desc = NULL; +} + /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. @@ -348,15 +356,8 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); vmx->nested.cached_shadow_vmcs12 = NULL; - /* - * Unpin physical memory we referred to in the vmcs02. The APIC access - * page's backing page (yeah, confusing) shouldn't actually be accessed, - * and if it is written, the contents are irrelevant. - */ - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); - vmx->nested.pi_desc = NULL; + + nested_put_vmcs12_pages(vcpu); kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); @@ -409,18 +410,40 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qualification; u32 vm_exit_reason; - unsigned long exit_qualification = vcpu->arch.exit_qualification; if (vmx->nested.pml_full) { vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; - exit_qualification &= INTR_INFO_UNBLOCK_NMI; + + /* + * It should be impossible to trigger a nested PML Full VM-Exit + * for anything other than an EPT Violation from L2. KVM *can* + * trigger nEPT page fault injection in response to an EPT + * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT + * tables also changed, but KVM should not treat EPT Misconfig + * VM-Exits as writes. + */ + WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION); + + /* + * PML Full and EPT Violation VM-Exits both use bit 12 to report + * "NMI unblocking due to IRET", i.e. the bit can be propagated + * as-is from the original EXIT_QUALIFICATION. + */ + exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; } else { - if (fault->error_code & PFERR_RSVD_MASK) + if (fault->error_code & PFERR_RSVD_MASK) { vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; - else + exit_qualification = 0; + } else { + exit_qualification = fault->exit_qualification; + exit_qualification |= vmx_get_exit_qual(vcpu) & + (EPT_VIOLATION_GVA_IS_VALID | + EPT_VIOLATION_GVA_TRANSLATED); vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + } /* * Although the caller (kvm_inject_emulated_page_fault) would @@ -601,7 +624,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; - struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; + struct kvm_host_map map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || @@ -624,10 +647,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) + if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) return false; - msr_bitmap_l1 = (unsigned long *)map->hva; + msr_bitmap_l1 = (unsigned long *)map.hva; /* * To keep the control flow simple, pay eight 8-byte writes (sixteen @@ -691,7 +714,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_FLUSH_CMD, MSR_TYPE_W); - kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; @@ -958,7 +981,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); goto fail; } - if (kvm_set_msr(vcpu, e.index, e.value)) { + if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); @@ -994,7 +1017,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, } } - if (kvm_get_msr(vcpu, msr_index, data)) { + if (kvm_get_msr_with_filter(vcpu, msr_index, data)) { pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, msr_index); return false; @@ -1089,9 +1112,9 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, /* * Emulated VMEntry does not fail here. Instead a less * accurate value will be returned by - * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() - * instead of reading the value from the vmcs02 VMExit - * MSR-store area. + * nested_vmx_get_vmexit_msr_value() by reading KVM's + * internal MSR state instead of reading the value from + * the vmcs02 VMExit MSR-store area. */ pr_warn_ratelimited( "Not enough msr entries in msr_autostore. Can't add msr %x\n", @@ -1174,11 +1197,14 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); /* - * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings - * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a - * full TLB flush from the guest's perspective. This is required even - * if VPID is disabled in the host as KVM may need to synchronize the - * MMU in response to the guest TLB flush. + * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the + * same VPID as the host, and so architecturally, linear and combined + * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM + * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, + * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This + * is required if VPID is disabled in KVM, as a TLB flush (there are no + * VPIDs) still occurs from L1's perspective, and KVM may need to + * synchronize the MMU in response to the guest TLB flush. * * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. * EPT is a special snowflake, as guest-physical mappings aren't @@ -1228,21 +1254,32 @@ static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) { - const u64 feature_and_reserved = - /* feature (except bit 48; see below) */ - BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | - /* reserved */ - BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | + VMX_BASIC_INOUT | + VMX_BASIC_TRUE_CTLS; + + const u64 reserved_bits = GENMASK_ULL(63, 56) | + GENMASK_ULL(47, 45) | + BIT_ULL(31); + u64 vmx_basic = vmcs_config.nested.basic; - if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + BUILD_BUG_ON(feature_bits & reserved_bits); + + /* + * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has + * inverted polarity), the incoming value must not set feature bits or + * reserved bits that aren't allowed/supported by KVM. Fields, i.e. + * multi-bit values, are explicitly checked below. + */ + if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits)) return -EINVAL; /* * KVM does not emulate a version of VMX that constrains physical * addresses of VMX structures (e.g. VMCS) to 32-bits. */ - if (data & BIT_ULL(48)) + if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EINVAL; if (vmx_basic_vmcs_revision_id(vmx_basic) != @@ -1311,16 +1348,29 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) { - const u64 feature_and_reserved_bits = - /* feature */ - BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | - BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | - /* reserved */ - GENMASK_ULL(13, 9) | BIT_ULL(31); + const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA | + VMX_MISC_ACTIVITY_HLT | + VMX_MISC_ACTIVITY_SHUTDOWN | + VMX_MISC_ACTIVITY_WAIT_SIPI | + VMX_MISC_INTEL_PT | + VMX_MISC_RDMSR_IN_SMM | + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | + VMX_MISC_VMXOFF_BLOCK_SMI | + VMX_MISC_ZERO_LEN_INS; + + const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9); + u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, vmcs_config.nested.misc_high); - if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + BUILD_BUG_ON(feature_bits & reserved_bits); + + /* + * The incoming value must not set feature bits or reserved bits that + * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are + * explicitly checked below. + */ + if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) return -EINVAL; if ((vmx->nested.msrs.pinbased_ctls_high & @@ -2039,7 +2089,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( bool evmcs_gpa_changed = false; u64 evmcs_gpa; - if (likely(!guest_cpuid_has_evmcs(vcpu))) + if (likely(!guest_cpu_cap_has_evmcs(vcpu))) return EVMPTRLD_DISABLED; evmcs_gpa = nested_get_evmptr(vcpu); @@ -2220,6 +2270,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); + if (vmx->ve_info) + vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); + /* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); @@ -2265,6 +2318,17 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); + /* + * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the + * same VPID as the host. Emulate this behavior by using vpid01 for L2 + * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter + * and VM-Exit are architecturally required to flush VPID=0, but *only* + * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the + * required flushes), but doing so would cause KVM to over-flush. E.g. + * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, + * and then runs L2 X again, then KVM can and should retain TLB entries + * for VPID12=1. + */ if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); @@ -2291,10 +2355,12 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 /* Posted interrupts setting is only taken from vmcs12. */ vmx->nested.pi_pending = false; - if (nested_cpu_has_posted_intr(vmcs12)) + if (nested_cpu_has_posted_intr(vmcs12)) { vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - else + } else { + vmx->nested.posted_intr_nv = -1; exec_control &= ~PIN_BASED_POSTED_INTR; + } pin_controls_set(vmx, exec_control); /* @@ -2400,7 +2466,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; - if (guest_efer != host_efer) + if (guest_efer != kvm_host.efer) exec_control |= VM_ENTRY_LOAD_IA32_EFER; } vm_entry_controls_set(vmx, exec_control); @@ -2413,7 +2479,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ exec_control = __vm_exit_controls_get(vmcs01); - if (cpu_has_load_ia32_efer() && guest_efer != host_efer) + if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; else exec_control &= ~VM_EXIT_LOAD_IA32_EFER; @@ -2444,6 +2510,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); @@ -2481,7 +2548,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - vmx->segment_cache.bitmask = 0; + vmx_segment_cache_clear(vmx); } if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & @@ -2903,7 +2970,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: case INTR_TYPE_SOFT_INTR: case INTR_TYPE_PRIV_SW_EXCEPTION: - if (CC(vmcs12->vm_entry_instruction_len > 15) || + if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || CC(vmcs12->vm_entry_instruction_len == 0 && CC(!nested_cpu_has_zero_length_injection(vcpu)))) return -EINVAL; @@ -2925,7 +2992,7 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, return -EINVAL; #ifdef CONFIG_KVM_HYPERV - if (guest_cpuid_has_evmcs(vcpu)) + if (guest_cpu_cap_has_evmcs(vcpu)) return nested_evmcs_check_controls(vmcs12); #endif @@ -2943,6 +3010,17 @@ static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, return 0; } +static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) +{ + /* + * Check that the given linear address is canonical after a VM exit + * from L2, based on HOST_CR4.LA57 value that will be loaded for L1. + */ + u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48; + + return !__is_canonical_address(la, l1_address_bits_on_exit); +} + static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -2953,8 +3031,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) return -EINVAL; - if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) + if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || + CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) return -EINVAL; if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && @@ -2988,12 +3066,12 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(vmcs12->host_ss_selector == 0 && !ia32e)) return -EINVAL; - if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) + if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || + CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) return -EINVAL; /* @@ -3111,7 +3189,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && - (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || + (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) return -EINVAL; @@ -3209,7 +3287,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) * L2 was running), map it here to make sure vmcs12 changes are * properly reflected. */ - if (guest_cpuid_has_evmcs(vcpu) && + if (guest_cpu_cap_has_evmcs(vcpu) && vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { enum nested_evmptrld_status evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, false); @@ -3364,7 +3442,7 @@ static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) if (!nested_cpu_has_pml(vmcs12)) return 0; - if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { + if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { vmx->nested.pml_full = true; return 1; } @@ -3403,14 +3481,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) return 1; } -static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) -{ - u8 rvi = vmx_get_rvi(); - u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); - - return ((rvi & 0xf0) > (vppr & 0xf0)); -} - static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); @@ -3430,7 +3500,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); enum vm_entry_failure_code entry_failure_code; - bool evaluate_pending_interrupts; union vmx_exit_reason exit_reason = { .basic = EXIT_REASON_INVALID_STATE, .failed_vmentry = 1, @@ -3449,13 +3518,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, kvm_service_local_tlb_flush_requests(vcpu); - evaluate_pending_interrupts = exec_controls_get(vmx) & - (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); - if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) - evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); - if (!evaluate_pending_interrupts) - evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); - if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); @@ -3538,9 +3600,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit - * unconditionally. + * unconditionally. Take care to pull data from vmcs01 as appropriate, + * e.g. when checking for interrupt windows, as vmcs02 is now loaded. */ - if (unlikely(evaluate_pending_interrupts)) + if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | + CPU_BASED_NMI_WINDOW_EXITING)) || + kvm_apic_has_pending_init_or_sipi(vcpu) || + kvm_apic_has_interrupt(vcpu)) kvm_make_request(KVM_REQ_EVENT, vcpu); /* @@ -3673,14 +3739,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; - /* Emulate processing of posted interrupts on VM-Enter. */ - if (nested_cpu_has_posted_intr(vmcs12) && - kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { - vmx->nested.pi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); - } - /* Hide L1D cache contents from the nested guest. */ vmx->vcpu.arch.l1tf_flush_l1d = true; @@ -3713,7 +3771,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) break; case GUEST_ACTIVITY_WAIT_SIPI: vmx->nested.nested_run_pending = 0; - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); break; default: break; @@ -3874,8 +3932,8 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) if (!pi_test_and_clear_on(vmx->nested.pi_desc)) return 0; - max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); - if (max_irr != 256) { + max_irr = pi_find_highest_vector(vmx->nested.pi_desc); + if (max_irr > 0) { vapic_page = vmx->nested.virtual_apic_map.hva; if (!vapic_page) goto mmio_needed; @@ -4006,10 +4064,46 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) to_vmx(vcpu)->nested.preemption_timer_expired; } -static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) +static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) { - return nested_vmx_preemption_timer_pending(vcpu) || - to_vmx(vcpu)->nested.mtf_pending; + struct vcpu_vmx *vmx = to_vmx(vcpu); + void *vapic = vmx->nested.virtual_apic_map.hva; + int max_irr, vppr; + + if (nested_vmx_preemption_timer_pending(vcpu) || + vmx->nested.mtf_pending) + return true; + + /* + * Virtual Interrupt Delivery doesn't require manual injection. Either + * the interrupt is already in GUEST_RVI and will be recognized by CPU + * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move + * the interrupt from the PIR to RVI prior to entering the guest. + */ + if (for_injection) + return false; + + if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || + __vmx_interrupt_blocked(vcpu)) + return false; + + if (!vapic) + return false; + + vppr = *((u32 *)(vapic + APIC_PROCPRI)); + + max_irr = vmx_get_rvi(); + if ((max_irr & 0xf0) > (vppr & 0xf0)) + return true; + + if (vmx->nested.pi_pending && vmx->nested.pi_desc && + pi_test_on(vmx->nested.pi_desc)) { + max_irr = pi_find_highest_vector(vmx->nested.pi_desc); + if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) + return true; + } + + return false; } /* @@ -4106,13 +4200,25 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) */ bool block_nested_exceptions = vmx->nested.nested_run_pending; /* - * New events (not exceptions) are only recognized at instruction + * Events that don't require injection, i.e. that are virtualized by + * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need + * to regain control in order to deliver the event, and hardware will + * handle event ordering, e.g. with respect to injected exceptions. + * + * But, new events (not exceptions) are only recognized at instruction * boundaries. If an event needs reinjection, then KVM is handling a - * VM-Exit that occurred _during_ instruction execution; new events are - * blocked until the instruction completes. + * VM-Exit that occurred _during_ instruction execution; new events, + * irrespective of whether or not they're injected, are blocked until + * the instruction completes. + */ + bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); + /* + * Inject events are blocked by nested VM-Enter, as KVM is responsible + * for managing priority between concurrent events, i.e. KVM needs to + * wait until after VM-Enter completes to deliver injected events. */ bool block_nested_events = block_nested_exceptions || - kvm_event_needs_reinjection(vcpu); + block_non_injected_events; if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &apic->pending_events)) { @@ -4222,11 +4328,71 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) } if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { + int irq; + + if (!nested_exit_on_intr(vcpu)) { + if (block_nested_events) + return -EBUSY; + + goto no_vmexit; + } + + if (!nested_exit_intr_ack_set(vcpu)) { + if (block_nested_events) + return -EBUSY; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; + } + + irq = kvm_cpu_get_extint(vcpu); + if (irq != -1) { + if (block_nested_events) + return -EBUSY; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); + return 0; + } + + irq = kvm_apic_has_interrupt(vcpu); + if (WARN_ON_ONCE(irq < 0)) + goto no_vmexit; + + /* + * If the IRQ is L2's PI notification vector, process posted + * interrupts for L2 instead of injecting VM-Exit, as the + * detection/morphing architecturally occurs when the IRQ is + * delivered to the CPU. Note, only interrupts that are routed + * through the local APIC trigger posted interrupt processing, + * and enabling posted interrupts requires ACK-on-exit. + */ + if (irq == vmx->nested.posted_intr_nv) { + /* + * Nested posted interrupts are delivered via RVI, i.e. + * aren't injected by KVM, and so can be queued even if + * manual event injection is disallowed. + */ + if (block_non_injected_events) + return -EBUSY; + + vmx->nested.pi_pending = true; + kvm_apic_clear_irr(vcpu, irq); + goto no_vmexit; + } + if (block_nested_events) return -EBUSY; - if (!nested_exit_on_intr(vcpu)) - goto no_vmexit; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); + + /* + * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must + * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI + * if APICv is active. + */ + kvm_apic_ack_interrupt(vcpu, irq); return 0; } @@ -4452,7 +4618,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) */ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 vm_exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) + unsigned long exit_qualification, u32 exit_insn_len) { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; @@ -4480,7 +4646,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vm_exit_reason, exit_intr_info); vmcs12->vm_exit_intr_info = exit_intr_info; - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vm_exit_instruction_len = exit_insn_len; vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); /* @@ -4640,7 +4806,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) return vmcs_read64(GUEST_IA32_EFER); if (cpu_has_load_ia32_efer()) - return host_efer; + return kvm_host.efer; for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) @@ -4651,7 +4817,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) if (efer_msr) return efer_msr->data; - return host_efer; + return kvm_host.efer; } static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) @@ -4744,7 +4910,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) goto vmabort; } - if (kvm_set_msr(vcpu, h.index, h.value)) { + if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) { pr_debug_ratelimited( "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", __func__, j, h.index, h.value); @@ -4764,8 +4930,9 @@ vmabort: * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, - u32 exit_intr_info, unsigned long exit_qualification) +void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification, + u32 exit_insn_len) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -4815,7 +4982,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (vm_exit_reason != -1) prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, - exit_intr_info, exit_qualification); + exit_intr_info, exit_qualification, + exit_insn_len); /* * Must happen outside of sync_vmcs02_to_vmcs12() as it will @@ -4860,7 +5028,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, * doesn't isolate different VMCSs, i.e. in this case, doesn't provide * separate modes for L2 vs L1. */ - if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL)) indirect_branch_prediction_barrier(); /* Update any VMCS fields that might have changed while L2 ran */ @@ -4883,11 +5051,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_update_cpu_dirty_logging(vcpu); } - /* Unpin physical memory we referred to in vmcs02 */ - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); - vmx->nested.pi_desc = NULL; + nested_put_vmcs12_pages(vcpu); if (vmx->nested.reload_vmcs01_apic_access_page) { vmx->nested.reload_vmcs01_apic_access_page = false; @@ -4899,22 +5063,19 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } + if (vmx->nested.update_vmcs01_hwapic_isr) { + vmx->nested.update_vmcs01_hwapic_isr = false; + kvm_apic_update_hwapic_isr(vcpu); + } + if ((vm_exit_reason != -1) && (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); if (likely(!vmx->fail)) { - if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - nested_exit_intr_ack_set(vcpu)) { - int irq = kvm_cpu_get_interrupt(vcpu); - WARN_ON(irq < 0); - vmcs12->vm_exit_intr_info = irq | - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; - } - if (vm_exit_reason != -1) trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, vmcs12->exit_qualification, @@ -4925,6 +5086,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, load_vmcs12_host_state(vcpu, vmcs12); + /* + * Process events if an injectable IRQ or NMI is pending, even + * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). + * If an event became pending while L2 was active, KVM needs to + * either inject the event or request an IRQ/NMI window. SMIs + * don't need to be processed as SMM is mutually exclusive with + * non-root mode. INIT/SIPI don't need to be checked as INIT + * is blocked post-VMXON, and SIPIs are ignored. + */ + if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); return; } @@ -5031,7 +5203,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, * non-canonical form. This is the only check on the memory * destination for long mode! */ - exn = is_noncanonical_address(*ret, vcpu); + exn = is_noncanonical_address(*ret, vcpu, 0); } else { /* * When not in long mode, the virtual/linear address is @@ -5157,9 +5329,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) goto out_shadow_vmcs; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); vmx->nested.vpid02 = allocate_vpid(); @@ -5828,6 +5999,12 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + /* + * Always flush the effective vpid02, i.e. never flush the current VPID + * and never explicitly flush vpid01. INVVPID targets a VPID, not a + * VMCS, and so whether or not the current vmcs12 has VPID enabled is + * irrelevant (and there may not be a loaded vmcs12). + */ vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: @@ -5836,7 +6013,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) * invalidation. */ if (!operand.vpid || - is_noncanonical_address(operand.gla, vcpu)) + is_noncanonical_invlpg_address(operand.gla, vcpu)) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_vcpu_addr(vpid02, operand.gla); @@ -6130,7 +6307,7 @@ static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, { u32 encls_leaf; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) return false; @@ -6208,6 +6385,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, else if (is_alignment_check(intr_info) && !vmx_guest_inject_ac(vcpu)) return true; + else if (is_ve_fault(intr_info)) + return true; return false; case EXIT_REASON_EXTERNAL_INTERRUPT: return true; @@ -6466,7 +6645,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); - if (guest_can_use(vcpu, X86_FEATURE_VMX) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; @@ -6607,7 +6786,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) return -EINVAL; } else { - if (!guest_can_use(vcpu, X86_FEATURE_VMX)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return -EINVAL; if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) @@ -6641,7 +6820,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && - (!guest_can_use(vcpu, X86_FEATURE_VMX) || + (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) || !vmx->nested.enlightened_vmcs_enabled)) return -EINVAL; @@ -6987,7 +7166,7 @@ static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, { msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; msrs->misc_low |= - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT | VMX_MISC_ACTIVITY_WAIT_SIPI; @@ -7002,12 +7181,10 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ - msrs->basic = - VMCS12_REVISION | - VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, + X86_MEMTYPE_WB); + msrs->basic |= VMX_BASIC_TRUE_CTLS; if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; } diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index cce4e2aa30fb..6eedcfc91070 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -26,8 +26,26 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu); -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, - u32 exit_intr_info, unsigned long exit_qualification); +void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification, + u32 exit_insn_len); + +static inline void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, + unsigned long exit_qualification) +{ + u32 exit_insn_len; + + if (to_vmx(vcpu)->fail || vm_exit_reason == -1 || + (vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + exit_insn_len = 0; + else + exit_insn_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + + __nested_vmx_vmexit(vcpu, vm_exit_reason, exit_intr_info, + exit_qualification, exit_insn_len); +} + void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu); int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); @@ -39,11 +57,17 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { + lockdep_assert_once(lockdep_is_held(&vcpu->mutex) || + !refcount_read(&vcpu->kvm->users_count)); + return to_vmx(vcpu)->nested.cached_vmcs12; } static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) { + lockdep_assert_once(lockdep_is_held(&vcpu->mutex) || + !refcount_read(&vcpu->kvm->users_count)); + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; } @@ -109,7 +133,7 @@ static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu) static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.msrs.misc_low & - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; } static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 12ade343a17e..77012b2eca0e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -110,7 +110,7 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) return 0; return vcpu->arch.perf_capabilities; @@ -160,7 +160,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; break; case MSR_IA32_DS_AREA: - ret = guest_cpuid_has(vcpu, X86_FEATURE_DS); + ret = guest_cpu_cap_has(vcpu, X86_FEATURE_DS); break; case MSR_PEBS_DATA_CFG: perf_capabilities = vcpu_get_perf_capabilities(vcpu); @@ -348,14 +348,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (data & pmu->fixed_ctr_ctrl_mask) + if (data & pmu->fixed_ctr_ctrl_rsvd) return 1; if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); break; case MSR_IA32_PEBS_ENABLE: - if (data & pmu->pebs_enable_mask) + if (data & pmu->pebs_enable_rsvd) return 1; if (pmu->pebs_enable != data) { @@ -365,13 +365,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_DS_AREA: - if (is_noncanonical_address(data, vcpu)) + if (is_noncanonical_msr_address(data, vcpu)) return 1; pmu->ds_area = data; break; case MSR_PEBS_DATA_CFG: - if (data & pmu->pebs_data_cfg_mask) + if (data & pmu->pebs_data_cfg_rsvd) return 1; pmu->pebs_data_cfg = data; @@ -436,8 +436,8 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index) }; u64 eventsel; - BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_PMC_MAX_FIXED); - BUILD_BUG_ON(index >= KVM_PMC_MAX_FIXED); + BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUTNERS); + BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUTNERS); /* * Yell if perf reports support for a fixed counter but perf doesn't @@ -448,6 +448,14 @@ static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index) return eventsel; } +static void intel_pmu_enable_fixed_counter_bits(struct kvm_pmu *pmu, u64 bits) +{ + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits); +} + static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -456,8 +464,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) union cpuid10_eax eax; union cpuid10_edx edx; u64 perf_capabilities; - u64 counter_mask; - int i; + u64 counter_rsvd; memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); @@ -501,22 +508,24 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) ((u64)1 << edx.split.bit_width_fixed) - 1; } - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) - pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); - counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | + intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL | + INTEL_FIXED_0_USER | + INTEL_FIXED_0_ENABLE_PMI); + + counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); - pmu->global_ctrl_mask = counter_mask; + pmu->global_ctrl_rsvd = counter_rsvd; /* * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET) * share reserved bit definitions. The kernel just happens to use * OVF_CTRL for the names. */ - pmu->global_status_mask = pmu->global_ctrl_mask + pmu->global_status_rsvd = pmu->global_ctrl_rsvd & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); if (vmx_pt_mode_is_host_guest()) - pmu->global_status_mask &= + pmu->global_status_rsvd &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); @@ -535,7 +544,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) perf_capabilities = vcpu_get_perf_capabilities(vcpu); if (cpuid_model_is_consistent(vcpu) && (perf_capabilities & PMU_CAP_LBR_FMT)) - x86_perf_get_lbr(&lbr_desc->records); + memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps)); else lbr_desc->records.nr = 0; @@ -544,15 +553,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (perf_capabilities & PERF_CAP_PEBS_FORMAT) { if (perf_capabilities & PERF_CAP_PEBS_BASELINE) { - pmu->pebs_enable_mask = counter_mask; + pmu->pebs_enable_rsvd = counter_rsvd; pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - pmu->fixed_ctr_ctrl_mask &= - ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4)); - } - pmu->pebs_data_cfg_mask = ~0xff00000full; + pmu->pebs_data_cfg_rsvd = ~0xff00000full; + intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE); } else { - pmu->pebs_enable_mask = + pmu->pebs_enable_rsvd = ~((1ull << pmu->nr_arch_gp_counters) - 1); } } @@ -564,14 +570,14 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { + for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; pmu->gp_counters[i].current_config = 0; } - for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { + for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUTNERS; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX; @@ -731,6 +737,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, - .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, + .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS, .MIN_NR_GP_COUNTERS = 1, }; diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index af662312fd07..ec08fa3caf43 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -107,7 +107,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * handle task migration (@cpu != vcpu->cpu). */ new.ndst = dest; - new.sn = 0; + __pi_clear_sn(&new); /* * Restore the notification vector; in the blocking case, the @@ -157,7 +157,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); - WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); + WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking"); old.control = READ_ONCE(pi_desc->control); do { diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 26992076552e..ad9116a99bcc 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -2,97 +2,8 @@ #ifndef __KVM_X86_VMX_POSTED_INTR_H #define __KVM_X86_VMX_POSTED_INTR_H -#define POSTED_INTR_ON 0 -#define POSTED_INTR_SN 1 - -#define PID_TABLE_ENTRY_VALID 1 - -/* Posted-Interrupt Descriptor */ -struct pi_desc { - u32 pir[8]; /* Posted interrupt requested */ - union { - struct { - /* bit 256 - Outstanding Notification */ - u16 on : 1, - /* bit 257 - Suppress Notification */ - sn : 1, - /* bit 271:258 - Reserved */ - rsvd_1 : 14; - /* bit 279:272 - Notification Vector */ - u8 nv; - /* bit 287:280 - Reserved */ - u8 rsvd_2; - /* bit 319:288 - Notification Destination */ - u32 ndst; - }; - u64 control; - }; - u32 rsvd[6]; -} __aligned(64); - -static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) -{ - return test_and_set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) -{ - return test_and_clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) -{ - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); -} - -static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) -{ - return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); -} - -static inline void pi_set_sn(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_set_on(struct pi_desc *pi_desc) -{ - set_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_on(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline void pi_clear_sn(struct pi_desc *pi_desc) -{ - clear_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_on(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_ON, - (unsigned long *)&pi_desc->control); -} - -static inline bool pi_test_sn(struct pi_desc *pi_desc) -{ - return test_bit(POSTED_INTR_SN, - (unsigned long *)&pi_desc->control); -} +#include <linux/bitmap.h> +#include <asm/posted_intr.h> void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); @@ -103,4 +14,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void vmx_pi_start_assignment(struct kvm *kvm); +static inline int pi_find_highest_vector(struct pi_desc *pi_desc) +{ + int vec; + + vec = find_last_bit((unsigned long *)pi_desc->pir, 256); + return vec < 256 ? vec : -1; +} + #endif /* __KVM_X86_VMX_POSTED_INTR_H */ diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 6fef01e0536e..9961e07cf071 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -4,12 +4,11 @@ #include <asm/sgx.h> -#include "cpuid.h" +#include "x86.h" #include "kvm_cache_regs.h" #include "nested.h" #include "sgx.h" #include "vmx.h" -#include "x86.h" bool __read_mostly enable_sgx = 1; module_param_named(sgx, enable_sgx, bool, 0444); @@ -38,7 +37,7 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, fault = true; } else if (likely(is_64_bit_mode(vcpu))) { *gva = vmx_get_untagged_addr(vcpu, *gva, 0); - fault = is_noncanonical_address(*gva, vcpu); + fault = is_noncanonical_address(*gva, vcpu, 0); } else { *gva &= 0xffffffff; fault = (s.unusable) || @@ -123,7 +122,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) * likely than a bad userspace address. */ if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) && - guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) { + guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) { memset(&ex, 0, sizeof(ex)); ex.vector = PF_VECTOR; ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | @@ -274,7 +273,7 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu) * simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to * enforce restriction of access to the PROVISIONKEY. */ - contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT); + contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL); if (!contents) return -ENOMEM; @@ -366,7 +365,7 @@ static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) return true; if (leaf >= EAUG && leaf <= EMODT) - return guest_cpuid_has(vcpu, X86_FEATURE_SGX2); + return guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2); return false; } @@ -382,8 +381,8 @@ int handle_encls(struct kvm_vcpu *vcpu) { u32 leaf = (u32)kvm_rax_read(vcpu); - if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX) || - !guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { + if (!enable_sgx || !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || + !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) { kvm_queue_exception(vcpu, UD_VECTOR); } else if (!encls_leaf_enabled_in_guest(vcpu, leaf) || !sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) { @@ -480,15 +479,15 @@ void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (!cpu_has_vmx_encls_vmexit()) return; - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) && sgx_enabled_in_guest_bios(vcpu)) { - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) { bitmap &= ~GENMASK_ULL(ETRACK, ECREATE); if (sgx_intercept_encls_ecreate(vcpu)) bitmap |= (1 << ECREATE); } - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) bitmap &= ~GENMASK_ULL(EMODT, EAUG); /* diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 7c1996b433e2..b25625314658 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -140,6 +140,11 @@ static inline bool is_nm_fault(u32 intr_info) return is_exception_n(intr_info, NM_VECTOR); } +static inline bool is_ve_fault(u32 intr_info) +{ + return is_exception_n(intr_info, VE_VECTOR); +} + /* Undocumented: icebp/int1 */ static inline bool is_icebp(u32 intr_info) { diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 01936013428b..56fd150a6f24 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -188,12 +188,13 @@ struct __packed vmcs12 { }; /* - * VMCS12_REVISION is an arbitrary id that should be changed if the content or - * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and - * VMPTRLD verifies that the VMCS region that L1 is loading contains this id. + * VMCS12_REVISION is KVM's arbitrary ID for the layout of struct vmcs12. KVM + * enumerates this value to L1 via MSR_IA32_VMX_BASIC, and checks the revision + * ID during nested VMPTRLD to verify that L1 is loading a VMCS that adhere's + * to KVM's virtual CPU definition. * - * IMPORTANT: Changing this value will break save/restore compatibility with - * older kvm releases. + * DO NOT change this value, as it will break save/restore compatibility with + * older KVM releases. */ #define VMCS12_REVISION 0x11e57ed0 @@ -206,7 +207,8 @@ struct __packed vmcs12 { #define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE /* - * For save/restore compatibility, the vmcs12 field offsets must not change. + * For save/restore compatibility, the vmcs12 field offsets must not change, + * although appending fields and/or filling gaps is obviously allowed. */ #define CHECK_OFFSET(field, loc) \ ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 2bfbf758d061..f6986dee6f8c 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -275,6 +275,8 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) call vmx_spec_ctrl_restore_host + CLEAR_BRANCH_HISTORY_VMEXIT + /* Put return value in AX */ mov %_ASM_BX, %_ASM_AX diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c37a89eda90f..5c5766467a61 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -68,10 +68,13 @@ #include "vmcs12.h" #include "vmx.h" #include "x86.h" +#include "x86_ops.h" #include "smm.h" #include "vmx_onhyperv.h" +#include "posted_intr.h" MODULE_AUTHOR("Qumranet"); +MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions"); MODULE_LICENSE("GPL"); #ifdef MODULE @@ -214,9 +217,13 @@ module_param(ple_window_shrink, uint, 0444); static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); -/* Default is SYSTEM mode, 1 for host-guest mode */ +/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ int __read_mostly pt_mode = PT_MODE_SYSTEM; +#ifdef CONFIG_BROKEN module_param(pt_mode, int, S_IRUGO); +#endif + +struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); @@ -255,7 +262,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) return 0; } - if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { + if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; return 0; } @@ -400,7 +407,7 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) * and VM-Exit. */ vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && - (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && + (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && !boot_cpu_has_bug(X86_BUG_MDS) && !boot_cpu_has_bug(X86_BUG_TAA); @@ -476,10 +483,9 @@ noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) ext, vpid, gva); } -noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) +noinline void invept_error(unsigned long ext, u64 eptp) { - vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", - ext, eptp, gpa); + vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp); } static DEFINE_PER_CPU(struct vmcs *, vmxarea); @@ -520,16 +526,10 @@ static const struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; -static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) -{ - vmx->segment_cache.bitmask = 0; -} static unsigned long host_idt_base; #if IS_ENABLED(CONFIG_HYPERV) -static struct kvm_x86_ops vmx_x86_ops __initdata; - static bool __read_mostly enlightened_vmcs = true; module_param(enlightened_vmcs, bool, 0444); @@ -579,9 +579,8 @@ static __init void hv_init_evmcs(void) } if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) - vmx_x86_ops.enable_l2_tlb_flush + vt_x86_ops.enable_l2_tlb_flush = hv_enable_l2_tlb_flush; - } else { enlightened_vmcs = false; } @@ -753,7 +752,7 @@ fault: return -EIO; } -static void vmx_emergency_disable(void) +void vmx_emergency_disable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; @@ -872,6 +871,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); /* + * #VE isn't used for VMX. To test against unexpected changes + * related to #VE for VMX, intercept unexpected #VE and warn on it. + */ + if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) + eb |= 1u << VE_VECTOR; + /* * Guest access to VMware backdoor ports could legitimately * trigger #GP because of TSS I/O permission bitmap. * We intercept those #GP and allow access to them anyway @@ -1116,12 +1121,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) * atomically, since it's faster than switching it manually. */ if (cpu_has_load_ia32_efer() || - (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { + (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) { if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; - if (guest_efer != host_efer) + if (guest_efer != kvm_host.efer) add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, host_efer, false); + guest_efer, kvm_host.efer, false); else clear_atomic_switch_msr(vmx, MSR_EFER); return false; @@ -1134,7 +1139,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) clear_atomic_switch_msr(vmx, MSR_EFER); guest_efer &= ~ignore_bits; - guest_efer |= host_efer & ignore_bits; + guest_efer |= kvm_host.efer & ignore_bits; vmx->guest_uret_msrs[i].data = guest_efer; vmx->guest_uret_msrs[i].mask = ~ignore_bits; @@ -1404,6 +1409,38 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) } #endif +static void grow_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned int old = vmx->ple_window; + + vmx->ple_window = __grow_ple_window(old, ple_window, + ple_window_grow, + ple_window_max); + + if (vmx->ple_window != old) { + vmx->ple_window_dirty = true; + trace_kvm_ple_window_update(vcpu->vcpu_id, + vmx->ple_window, old); + } +} + +static void shrink_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned int old = vmx->ple_window; + + vmx->ple_window = __shrink_ple_window(old, ple_window, + ple_window_shrink, + ple_window); + + if (vmx->ple_window != old) { + vmx->ple_window_dirty = true; + trace_kvm_ple_window_update(vcpu->vcpu_id, + vmx->ple_window, old); + } +} + void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, struct loaded_vmcs *buddy) { @@ -1440,7 +1477,8 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, * performs IBPB on nested VM-Exit (a single nested transition * may switch the active VMCS multiple times). */ - if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) + if (static_branch_likely(&switch_vcpu_ibpb) && + (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))) indirect_branch_prediction_barrier(); } @@ -1475,18 +1513,17 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. */ -static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) + shrink_ple_window(vcpu); vmx_vcpu_load_vmcs(vcpu, cpu, NULL); vmx_vcpu_pi_load(vcpu, cpu); - - vmx->host_debugctlmsr = get_debugctlmsr(); } -static void vmx_vcpu_put(struct kvm_vcpu *vcpu) +void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); @@ -1545,7 +1582,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmx->emulation_required = vmx_emulation_required(vcpu); } -static bool vmx_get_if_flag(struct kvm_vcpu *vcpu) +bool vmx_get_if_flag(struct kvm_vcpu *vcpu) { return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; } @@ -1596,7 +1633,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) * result in a #GP unless the same write also clears TraceEn. */ if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && - ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) + (data & RTIT_CTL_TRACEEN) && + data != vmx->pt_desc.guest.ctl) return 1; /* @@ -1651,8 +1689,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) return 0; } -static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, - void *insn, int insn_len) +int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) { /* * Emulation of instructions in SGX enclaves is impossible as RIP does @@ -1665,6 +1703,12 @@ static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, kvm_queue_exception(vcpu, UD_VECTOR); return X86EMUL_PROPAGATE_FAULT; } + + /* Check that emulation is possible during event vectoring */ + if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + !kvm_can_emulate_event_vectoring(emul_type)) + return X86EMUL_UNHANDLEABLE_VECTORING; + return X86EMUL_CONTINUE; } @@ -1736,7 +1780,7 @@ rip_updated: * Recognizes a pending MTF VM-exit and records the nested state for later * delivery. */ -static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) +void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -1767,7 +1811,7 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) } } -static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) +int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) { vmx_update_emulated_instruction(vcpu); return skip_emulated_instruction(vcpu); @@ -1786,7 +1830,7 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); } -static void vmx_inject_exception(struct kvm_vcpu *vcpu) +void vmx_inject_exception(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; @@ -1868,8 +1912,8 @@ static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); vmx_setup_uret_msr(vmx, MSR_TSC_AUX, - guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID)); + guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || + guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID)); /* * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new @@ -1907,12 +1951,12 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) return kvm_caps.default_tsc_scaling_ratio; } -static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) +void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) { vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); } -static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) +void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) { vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); } @@ -1955,15 +1999,15 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, return !(msr->data & ~valid_bits); } -static int vmx_get_msr_feature(struct kvm_msr_entry *msr) +int vmx_get_feature_msr(u32 msr, u64 *data) { - switch (msr->index) { + switch (msr) { case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!nested) return 1; - return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + return vmx_get_vmx_msr(&vmcs_config.nested, msr, data); default: - return KVM_MSR_RET_INVALID; + return KVM_MSR_RET_UNSUPPORTED; } } @@ -1972,7 +2016,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; @@ -2022,7 +2066,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) + !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -2038,13 +2082,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) return 1; msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; break; case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: - if (!guest_can_use(vcpu, X86_FEATURE_VMX)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data)) @@ -2057,7 +2101,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * sanity checking and refuse to boot. Filter all unsupported * features out. */ - if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu)) + if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu)) nested_evmcs_filter_control_msr(vcpu, msr_info->index, &msr_info->data); #endif @@ -2127,7 +2171,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, u64 data) { #ifdef CONFIG_X86_64 - if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return (u32)data; #endif return (unsigned long)data; @@ -2138,7 +2182,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated u64 debugctl = 0; if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && - (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) + (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && @@ -2153,7 +2197,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmx_uret_msr *msr; @@ -2242,9 +2286,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) + !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX))) return 1; - if (is_noncanonical_address(data & PAGE_MASK, vcpu) || + if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) || (data & MSR_IA32_BNDCFGS_RSVD)) return 1; @@ -2344,7 +2388,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * behavior, but it's close enough. */ if (!msr_info->host_initiated && - (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || + (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) || ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) return 1; @@ -2354,7 +2398,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: if (!msr_info->host_initiated) return 1; /* they are read-only */ - if (!guest_can_use(vcpu, X86_FEATURE_VMX)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_RTIT_CTL: @@ -2409,7 +2453,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (index >= 2 * vmx->pt_desc.num_address_ranges) return 1; - if (is_noncanonical_address(data, vcpu)) + if (is_noncanonical_msr_address(data, vcpu)) return 1; if (index % 2) vmx->pt_desc.guest.addr_b[index / 2] = data; @@ -2417,8 +2461,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx->pt_desc.guest.addr_a[index / 2] = data; break; case MSR_IA32_PERF_CAPABILITIES: - if (data && !vcpu_to_pmu(vcpu)->version) - return 1; if (data & PMU_CAP_LBR_FMT) { if ((data & PMU_CAP_LBR_FMT) != (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) @@ -2430,9 +2472,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((data & PERF_CAP_PEBS_MASK) != (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) return 1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS)) return 1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64)) return 1; if (!cpuid_model_is_consistent(vcpu)) return 1; @@ -2456,7 +2498,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return ret; } -static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { unsigned long guest_owned_bits; @@ -2510,30 +2552,6 @@ static bool cpu_has_sgx(void) return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); } -/* - * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they - * can't be used due to errata where VM Exit may incorrectly clear - * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the - * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. - */ -static bool cpu_has_perf_global_ctrl_bug(void) -{ - if (boot_cpu_data.x86 == 0x6) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_NEHALEM_EP: /* AAK155 */ - case INTEL_FAM6_NEHALEM: /* AAP115 */ - case INTEL_FAM6_WESTMERE: /* AAT100 */ - case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */ - case INTEL_FAM6_NEHALEM_EX: /* BA97 */ - return true; - default: - break; - } - } - - return false; -} - static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { u32 vmx_msr_low, vmx_msr_high; @@ -2561,18 +2579,45 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) return ctl_opt & allowed; } +#define vmx_check_entry_exit_pairs(pairs, entry_controls, exit_controls) \ +({ \ + int i, r = 0; \ + \ + BUILD_BUG_ON(sizeof(pairs[0].entry_control) != sizeof(entry_controls)); \ + BUILD_BUG_ON(sizeof(pairs[0].exit_control) != sizeof(exit_controls)); \ + \ + for (i = 0; i < ARRAY_SIZE(pairs); i++) { \ + typeof(entry_controls) n_ctrl = pairs[i].entry_control; \ + typeof(exit_controls) x_ctrl = pairs[i].exit_control; \ + \ + if (!(entry_controls & n_ctrl) == !(exit_controls & x_ctrl)) \ + continue; \ + \ + pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, " \ + "entry = %llx (%llx), exit = %llx (%llx)\n", \ + (u64)(entry_controls & n_ctrl), (u64)n_ctrl, \ + (u64)(exit_controls & x_ctrl), (u64)x_ctrl); \ + \ + if (error_on_inconsistent_vmcs_config) \ + r = -EIO; \ + \ + entry_controls &= ~n_ctrl; \ + exit_controls &= ~x_ctrl; \ + } \ + r; \ +}) + static int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { - u32 vmx_msr_low, vmx_msr_high; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; + u64 basic_msr; u64 misc_msr; - int i; /* * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. @@ -2604,6 +2649,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_cpu_based_2nd_exec_control)) return -EIO; } + if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE)) + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; + #ifndef CONFIG_X86_64 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) @@ -2628,6 +2676,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, return -EIO; vmx_cap->ept = 0; + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; } if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && vmx_cap->vpid) { @@ -2672,46 +2721,54 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, &_vmentry_control)) return -EIO; - for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { - u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; - u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; - - if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) - continue; - - pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", - _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); - - if (error_on_inconsistent_vmcs_config) - return -EIO; + if (vmx_check_entry_exit_pairs(vmcs_entry_exit_pairs, + _vmentry_control, _vmexit_control)) + return -EIO; - _vmentry_control &= ~n_ctrl; - _vmexit_control &= ~x_ctrl; + /* + * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they + * can't be used due to an errata where VM Exit may incorrectly clear + * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the + * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. + */ + switch (boot_cpu_data.x86_vfm) { + case INTEL_NEHALEM_EP: /* AAK155 */ + case INTEL_NEHALEM: /* AAP115 */ + case INTEL_WESTMERE: /* AAT100 */ + case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */ + case INTEL_NEHALEM_EX: /* BA97 */ + _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + "does not work properly. Using workaround\n"); + break; + default: + break; } - rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); + rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ - if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) + if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) return -EIO; #ifdef CONFIG_X86_64 - /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ - if (vmx_msr_high & (1u<<16)) + /* + * KVM expects to be able to shove all legal physical addresses into + * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always + * 0 for processors that support Intel 64 architecture". + */ + if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EIO; #endif /* Require Write-Back (WB) memory type for VMCS accesses. */ - if (((vmx_msr_high >> 18) & 15) != 6) + if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) return -EIO; rdmsrl(MSR_IA32_VMX_MISC, misc_msr); - vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; - - vmcs_conf->revision_id = vmx_msr_low; - + vmcs_conf->basic = basic_msr; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; @@ -2757,7 +2814,7 @@ static bool kvm_is_vmx_supported(void) return supported; } -static int vmx_check_processor_compat(void) +int vmx_check_processor_compat(void) { int cpu = raw_smp_processor_id(); struct vmcs_config vmcs_conf; @@ -2799,7 +2856,7 @@ fault: return -EFAULT; } -static int vmx_hardware_enable(void) +int vmx_enable_virtualization_cpu(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); @@ -2823,9 +2880,6 @@ static int vmx_hardware_enable(void) return r; } - if (enable_ept) - ept_sync_global(); - return 0; } @@ -2839,7 +2893,7 @@ static void vmclear_local_loaded_vmcss(void) __loaded_vmcs_clear(v); } -static void vmx_hardware_disable(void) +void vmx_disable_virtualization_cpu(void) { vmclear_local_loaded_vmcss(); @@ -2861,13 +2915,13 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) if (!pages) return NULL; vmcs = page_address(pages); - memset(vmcs, 0, vmcs_config.size); + memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic)); /* KVM supports Enlightened VMCS v1 only */ if (kvm_is_using_evmcs()) vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else - vmcs->hdr.revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); if (shadow) vmcs->hdr.shadow_vmcs = 1; @@ -2960,7 +3014,7 @@ static __init int alloc_kvm_area(void) * physical CPU. */ if (kvm_is_using_evmcs()) - vmcs->hdr.revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic); per_cpu(vmxarea, cpu) = vmcs; } @@ -3153,7 +3207,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) +void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3178,12 +3232,12 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) return nested_get_vpid02(vcpu); return to_vmx(vcpu)->vpid; } -static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) +void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; u64 root_hpa = mmu->root.hpa; @@ -3199,7 +3253,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) vpid_sync_context(vmx_get_current_vpid(vcpu)); } -static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { /* * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in @@ -3208,7 +3262,7 @@ static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); } -static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) +void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) { /* * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a @@ -3253,7 +3307,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ CPU_BASED_CR3_STORE_EXITING) -static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (is_guest_mode(vcpu)) return nested_guest_cr0_valid(vcpu, cr0); @@ -3374,8 +3428,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) return eptp; } -static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, - int root_level) +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; @@ -3404,8 +3457,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, vmcs_writel(GUEST_CR3, guest_cr3); } - -static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { /* * We operate under the default treatment of SMM, so VMX cannot be @@ -3482,7 +3534,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(GUEST_CR4, hw_cr4); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) @@ -3521,7 +3573,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) var->g = (ar >> 15) & 1; } -static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment s; @@ -3532,16 +3584,29 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) return vmx_read_guest_seg_base(to_vmx(vcpu), seg); } -int vmx_get_cpl(struct kvm_vcpu *vcpu) +static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache) { struct vcpu_vmx *vmx = to_vmx(vcpu); + int ar; if (unlikely(vmx->rmode.vm86_active)) return 0; - else { - int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); - return VMX_AR_DPL(ar); - } + + if (no_cache) + ar = vmcs_read32(GUEST_SS_AR_BYTES); + else + ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); + return VMX_AR_DPL(ar); +} + +int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + return __vmx_get_cpl(vcpu, false); +} + +int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu) +{ + return __vmx_get_cpl(vcpu, true); } static u32 vmx_segment_access_rights(struct kvm_segment *var) @@ -3598,14 +3663,14 @@ void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); } -static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { __vmx_set_segment(vcpu, var, seg); to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); } -static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) +void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); @@ -3613,25 +3678,25 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) *l = (ar >> 13) & 1; } -static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { dt->size = vmcs_read32(GUEST_IDTR_LIMIT); dt->address = vmcs_readl(GUEST_IDTR_BASE); } -static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { vmcs_write32(GUEST_IDTR_LIMIT, dt->size); vmcs_writel(GUEST_IDTR_BASE, dt->address); } -static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { dt->size = vmcs_read32(GUEST_GDTR_LIMIT); dt->address = vmcs_readl(GUEST_GDTR_BASE); } -static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) +void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { vmcs_write32(GUEST_GDTR_LIMIT, dt->size); vmcs_writel(GUEST_GDTR_BASE, dt->address); @@ -4099,27 +4164,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) } } -static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - void *vapic_page; - u32 vppr; - int rvi; - - if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || - !nested_cpu_has_vid(get_vmcs12(vcpu)) || - WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) - return false; - - rvi = vmx_get_rvi(); - - vapic_page = vmx->nested.virtual_apic_map.hva; - vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); - - return ((rvi & 0xf0) > (vppr & 0xf0)); -} - -static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) +void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 i; @@ -4199,6 +4244,13 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated + * and freed, and must not be accessed outside of vcpu->mutex. The + * vCPU's cached PI NV is valid if and only if posted interrupts + * enabled in its vmcs12, i.e. checking the vector also checks that + * L1 has enabled posted interrupts for L2. + */ if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { /* @@ -4263,8 +4315,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) return 0; } -static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, - int trig_mode, int vector) +void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) { struct kvm_vcpu *vcpu = apic->vcpu; @@ -4348,7 +4400,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) } if (cpu_has_load_ia32_efer()) - vmcs_write64(HOST_IA32_EFER, host_efer); + vmcs_write64(HOST_IA32_EFER, kvm_host.efer); } void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) @@ -4397,9 +4449,6 @@ static u32 vmx_vmentry_ctrl(void) VM_ENTRY_LOAD_IA32_EFER | VM_ENTRY_IA32E_MODE); - if (cpu_has_perf_global_ctrl_bug()) - vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - return vmentry_ctrl; } @@ -4417,16 +4466,12 @@ static u32 vmx_vmexit_ctrl(void) if (vmx_pt_mode_is_system()) vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); - - if (cpu_has_perf_global_ctrl_bug()) - vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmexit_ctrl & ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); } -static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) +void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4536,7 +4581,8 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, * Update the nested MSR settings so that a nested VMM can/can't set * controls for features that are/aren't exposed to the guest. */ - if (nested) { + if (nested && + kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { /* * All features that can be added or removed to VMX MSRs must * be supported in the first place for nested virtualization. @@ -4562,10 +4608,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, bool __enabled; \ \ if (cpu_has_vmx_##name()) { \ - if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \ - __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \ - else \ - __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \ + __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \ vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ __enabled, exiting); \ } \ @@ -4592,6 +4635,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE; enable_unrestricted_guest = 0; } if (!enable_unrestricted_guest) @@ -4640,8 +4684,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) */ if (cpu_has_vmx_rdtscp()) { bool rdpid_or_rdtscp_enabled = - guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || - guest_cpuid_has(vcpu, X86_FEATURE_RDPID); + guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) || + guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID); vmx_adjust_secondary_exec_control(vmx, &exec_control, SECONDARY_EXEC_ENABLE_RDTSCP, @@ -4690,7 +4734,7 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) return 0; } -static int vmx_vcpu_precreate(struct kvm *kvm) +int vmx_vcpu_precreate(struct kvm *kvm) { return vmx_alloc_ipiv_pid_table(kvm); } @@ -4715,8 +4759,12 @@ static void init_vmcs(struct vcpu_vmx *vmx) exec_controls_set(vmx, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) + if (cpu_has_secondary_exec_ctrls()) { secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); + if (vmx->ve_info) + vmcs_write64(VE_INFORMATION_ADDRESS, + __pa(vmx->ve_info)); + } if (cpu_has_tertiary_exec_ctrls()) tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); @@ -4787,7 +4835,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (enable_pml) { vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); } vmx_write_encls_bitmap(&vmx->vcpu, NULL); @@ -4821,7 +4869,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) init_vmcs(vmx); - if (nested) + if (nested && + kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); vcpu_setup_sgx_lepubkeyhash(vcpu); @@ -4834,7 +4883,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; #endif - vcpu->arch.microcode_version = 0x100000000ULL; + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) + vcpu->arch.microcode_version = 0x100000000ULL; vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; /* @@ -4842,10 +4892,10 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) * or POSTED_INTR_WAKEUP_VECTOR. */ vmx->pi_desc.nv = POSTED_INTR_VECTOR; - vmx->pi_desc.sn = 1; + __pi_set_sn(&vmx->pi_desc); } -static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) +void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4860,9 +4910,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); - vmx_segment_cache_clear(vmx); - kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); - seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); @@ -4889,6 +4936,9 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_writel(GUEST_IDTR_BASE, 0); vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + vmx_segment_cache_clear(vmx); + kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); @@ -4904,12 +4954,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_update_fb_clear_dis(vcpu, vmx); } -static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) +void vmx_enable_irq_window(struct kvm_vcpu *vcpu) { exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); } -static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) +void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) { if (!enable_vnmi || vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { @@ -4920,7 +4970,7 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); } -static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) +void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) { struct vcpu_vmx *vmx = to_vmx(vcpu); uint32_t intr; @@ -4948,7 +4998,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) vmx_clear_hlt(vcpu); } -static void vmx_inject_nmi(struct kvm_vcpu *vcpu) +void vmx_inject_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5026,7 +5076,7 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) GUEST_INTR_STATE_NMI)); } -static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; @@ -5038,17 +5088,22 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !vmx_nmi_blocked(vcpu); } +bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) +{ + return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); +} + bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) return false; - return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); + return __vmx_interrupt_blocked(vcpu); } -static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) return -EBUSY; @@ -5063,7 +5118,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !vmx_interrupt_blocked(vcpu); } -static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) +int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { void __user *ret; @@ -5083,7 +5138,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) return init_rmode_tss(kvm, ret); } -static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) +int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; return 0; @@ -5171,6 +5226,12 @@ bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); } +static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.guest_fpu.fpstate->xfd && + !kvm_is_cr0_bit_set(vcpu, X86_CR0_TS); +} + static int handle_exception_nmi(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5197,13 +5258,24 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) * point. */ if (is_nm_fault(intr_info)) { - kvm_queue_exception(vcpu, NM_VECTOR); + kvm_queue_exception_p(vcpu, NM_VECTOR, + is_xfd_nm_fault(vcpu) ? vcpu->arch.guest_fpu.xfd_err : 0); return 1; } if (is_invalid_opcode(intr_info)) return handle_ud(vcpu); + if (WARN_ON_ONCE(is_ve_fault(intr_info))) { + struct vmx_ve_information *ve_info = vmx->ve_info; + + WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION, + "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason); + dump_vmcs(vcpu); + kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE"); + return 1; + } + error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); @@ -5369,8 +5441,7 @@ static int handle_io(struct kvm_vcpu *vcpu) return kvm_fast_pio(vcpu, size, port, in); } -static void -vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) { /* * Patch in the VMCALL instruction: @@ -5576,7 +5647,7 @@ out: return kvm_complete_insn_gp(vcpu, err); } -static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) +void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { get_debugreg(vcpu->arch.db[0], 0); get_debugreg(vcpu->arch.db[1], 1); @@ -5595,7 +5666,13 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) set_debugreg(DR6_RESERVED, 6); } -static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + lockdep_assert_irqs_disabled(); + set_debugreg(vcpu->arch.dr6, 6); +} + +void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); } @@ -5762,13 +5839,12 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0; /* ept page table entry is present? */ - error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) + error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) ? PFERR_PRESENT_MASK : 0; - error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; - - vcpu->arch.exit_qualification = exit_qualification; + if (error_code & EPT_VIOLATION_GVA_IS_VALID) + error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? + PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; /* * Check that the GPA doesn't exceed physical memory limits, as that is @@ -5817,11 +5893,35 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } -static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +/* + * Returns true if emulation is required (due to the vCPU having invalid state + * with unsrestricted guest mode disabled) and KVM can't faithfully emulate the + * current vCPU state. + */ +static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - return vmx->emulation_required && !vmx->rmode.vm86_active && + if (!vmx->emulation_required) + return false; + + /* + * It is architecturally impossible for emulation to be required when a + * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if + * guest state is invalid and unrestricted guest is disabled, i.e. KVM + * should synthesize VM-Fail instead emulation L2 code. This path is + * only reachable if userspace modifies L2 guest state after KVM has + * performed the nested VM-Enter consistency checks. + */ + if (vmx->nested.nested_run_pending) + return true; + + /* + * KVM only supports emulating exceptions if the vCPU is in Real Mode. + * If emulation is required, KVM can't perform a successful VM-Enter to + * inject the exception. + */ + return !vmx->rmode.vm86_active && (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); } @@ -5844,7 +5944,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (vmx_emulation_required_with_pending_exception(vcpu)) { + if (vmx_unhandleable_emulation_required(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5866,9 +5966,9 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) return 1; } -static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) +int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) { - if (vmx_emulation_required_with_pending_exception(vcpu)) { + if (vmx_unhandleable_emulation_required(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5876,38 +5976,6 @@ static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) return 1; } -static void grow_ple_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned int old = vmx->ple_window; - - vmx->ple_window = __grow_ple_window(old, ple_window, - ple_window_grow, - ple_window_max); - - if (vmx->ple_window != old) { - vmx->ple_window_dirty = true; - trace_kvm_ple_window_update(vcpu->vcpu_id, - vmx->ple_window, old); - } -} - -static void shrink_ple_window(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned int old = vmx->ple_window; - - vmx->ple_window = __shrink_ple_window(old, ple_window, - ple_window_shrink, - ple_window); - - if (vmx->ple_window != old) { - vmx->ple_window_dirty = true; - trace_kvm_ple_window_update(vcpu->vcpu_id, - vmx->ple_window, old); - } -} - /* * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE * exiting, so only get here on cpu with PAUSE-Loop-Exiting. @@ -5943,7 +6011,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) } operand; int gpr_index; - if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -6033,7 +6101,7 @@ static int handle_preemption_timer(struct kvm_vcpu *vcpu) /* * When nested=0, all VMX instruction VM Exits filter here. The handlers - * are overwritten by nested_vmx_setup() when nested=1. + * are overwritten by nested_vmx_hardware_setup() when nested=1. */ static int handle_vmx_instruction(struct kvm_vcpu *vcpu) { @@ -6154,9 +6222,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, - u64 *info1, u64 *info2, - u32 *intr_info, u32 *error_code) +void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6176,6 +6243,15 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, } } +void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code) +{ + *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); + if (is_exception_with_error_code(*intr_info)) + *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE); + else + *error_code = 0; +} + static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { if (vmx->pml_pg) { @@ -6187,32 +6263,40 @@ static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + u16 pml_idx, pml_tail_index; u64 *pml_buf; - u16 pml_idx; + int i; pml_idx = vmcs_read16(GUEST_PML_INDEX); /* Do nothing if PML buffer is empty */ - if (pml_idx == (PML_ENTITY_NUM - 1)) + if (pml_idx == PML_HEAD_INDEX) return; + /* + * PML index always points to the next available PML buffer entity + * unless PML log has just overflowed. + */ + pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1; - /* PML index always points to next available PML buffer entity */ - if (pml_idx >= PML_ENTITY_NUM) - pml_idx = 0; - else - pml_idx++; - + /* + * PML log is written backwards: the CPU first writes the entry 511 + * then the entry 510, and so on. + * + * Read the entries in the same order they were written, to ensure that + * the dirty ring is filled in the same order the CPU wrote them. + */ pml_buf = page_address(vmx->pml_pg); - for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { + + for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) { u64 gpa; - gpa = pml_buf[pml_idx]; + gpa = pml_buf[i]; WARN_ON(gpa & (PAGE_SIZE - 1)); kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); } /* reset PML index */ - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX); } static void vmx_dump_sel(char *name, uint32_t sel) @@ -6414,6 +6498,24 @@ void dump_vmcs(struct kvm_vcpu *vcpu) if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) pr_err("Virtual processor ID = 0x%04x\n", vmcs_read16(VIRTUAL_PROCESSOR_ID)); + if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) { + struct vmx_ve_information *ve_info = vmx->ve_info; + u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS); + + /* + * If KVM is dumping the VMCS, then something has gone wrong + * already. Derefencing an address from the VMCS, which could + * very well be corrupted, is a terrible idea. The virtual + * address is known so use it. + */ + pr_err("VE info address = 0x%016llx%s\n", ve_info_pa, + ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)"); + pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n", + ve_info->exit_reason, ve_info->delivery, + ve_info->exit_qualification, + ve_info->guest_linear_address, + ve_info->guest_physical_address, ve_info->eptp_index); + } } /* @@ -6510,33 +6612,15 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return 0; } - /* - * Note: - * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by - * delivery event since it indicates guest is accessing MMIO. - * The vm-exit can be triggered again after return to guest that - * will cause infinite loop. - */ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI && exit_reason.basic != EXIT_REASON_EPT_VIOLATION && exit_reason.basic != EXIT_REASON_PML_FULL && exit_reason.basic != EXIT_REASON_APIC_ACCESS && exit_reason.basic != EXIT_REASON_TASK_SWITCH && - exit_reason.basic != EXIT_REASON_NOTIFY)) { - int ndata = 3; - - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.data[0] = vectoring_info; - vcpu->run->internal.data[1] = exit_reason.full; - vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu); - if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { - vcpu->run->internal.data[ndata++] = - vmcs_read64(GUEST_PHYSICAL_ADDRESS); - } - vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; - vcpu->run->internal.ndata = ndata; + exit_reason.basic != EXIT_REASON_NOTIFY && + exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) { + kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA); return 0; } @@ -6599,7 +6683,7 @@ unexpected_vmexit: return 0; } -static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) +int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { int ret = __vmx_handle_exit(vcpu, exit_fastpath); @@ -6639,9 +6723,10 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) bool flush_l1d; /* - * Clear the per-vcpu flush bit, it gets set again - * either from vcpu_run() or from one of the unsafe - * VMEXIT handlers. + * Clear the per-vcpu flush bit, it gets set again if the vCPU + * is reloaded, i.e. if the vCPU is scheduled out or if KVM + * exits to userspace, or if KVM reaches one of the unsafe + * VMEXIT handlers, e.g. if KVM calls into the emulator. */ flush_l1d = vcpu->arch.l1tf_flush_l1d; vcpu->arch.l1tf_flush_l1d = false; @@ -6687,7 +6772,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) : "eax", "ebx", "ecx", "edx"); } -static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) +void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int tpr_threshold; @@ -6757,14 +6842,16 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) vmx_update_msr_bitmap_x2apic(vcpu); } -static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) +void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot; + struct page *refcounted_page; unsigned long mmu_seq; kvm_pfn_t pfn; + bool writable; /* Defer reload until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { @@ -6800,37 +6887,58 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) * controls the APIC-access page memslot, and only deletes the memslot * if APICv is permanently inhibited, i.e. the memslot won't reappear. */ - pfn = gfn_to_pfn_memslot(slot, gfn); + pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page); if (is_error_noslot_pfn(pfn)) return; read_lock(&vcpu->kvm->mmu_lock); - if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) { + if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - read_unlock(&vcpu->kvm->mmu_lock); - goto out; - } + else + vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); - vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); - read_unlock(&vcpu->kvm->mmu_lock); + /* + * Do not pin the APIC access page in memory so that it can be freely + * migrated, the MMU notifier will call us again if it is migrated or + * swapped out. KVM backs the memslot with anonymous memory, the pfn + * should always point at a refcounted page (if the pfn is valid). + */ + if (!WARN_ON_ONCE(!refcounted_page)) + kvm_release_page_clean(refcounted_page); /* * No need for a manual TLB flush at this point, KVM has already done a * flush if there were SPTEs pointing at the previous page. */ -out: - /* - * Do not pin apic access page in memory, the MMU notifier - * will call us again if it is migrated or swapped out. - */ - kvm_release_pfn_clean(pfn); + read_unlock(&vcpu->kvm->mmu_lock); } -static void vmx_hwapic_isr_update(int max_isr) +void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { u16 status; u8 old; + /* + * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI + * is only relevant for if and only if Virtual Interrupt Delivery is + * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's + * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested + * VM-Exit, otherwise L1 with run with a stale SVI. + */ + if (is_guest_mode(vcpu)) { + /* + * KVM is supposed to forward intercepted L2 EOIs to L1 if VID + * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC. + * Note, userspace can stuff state while L2 is active; assert + * that VID is disabled if and only if the vCPU is in KVM_RUN + * to avoid false positives if userspace is setting APIC state. + */ + WARN_ON_ONCE(vcpu->wants_to_run && + nested_cpu_has_vid(get_vmcs12(vcpu))); + to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true; + return; + } + if (max_isr == -1) max_isr = 0; @@ -6860,21 +6968,7 @@ static void vmx_set_rvi(int vector) } } -static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) -{ - /* - * When running L2, updating RVI is only relevant when - * vmcs12 virtual-interrupt-delivery enabled. - * However, it can be enabled only when L1 also - * intercepts external-interrupts and in that case - * we should not update vmcs02 RVI but instead intercept - * interrupt. Therefore, do nothing when running L2. - */ - if (!is_guest_mode(vcpu)) - vmx_set_rvi(max_irr); -} - -static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) +int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; @@ -6920,7 +7014,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } -static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) return; @@ -6931,7 +7025,7 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } -static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) +void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6949,37 +7043,34 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) * MSR value is not clobbered by the host activity before the guest * has chance to consume it. * - * Do not blindly read xfd_err here, since this exception might - * be caused by L1 interception on a platform which doesn't - * support xfd at all. + * Update the guest's XFD_ERR if and only if XFD is enabled, as the #NM + * interception may have been caused by L1 interception. Per the SDM, + * XFD_ERR is not modified for non-XFD #NM, i.e. if CR0.TS=1. * - * Do it conditionally upon guest_fpu::xfd. xfd_err matters - * only when xfd contains a non-zero value. - * - * Queuing exception is done in vmx_handle_exit. See comment there. + * Note, XFD_ERR is updated _before_ the #NM interception check, i.e. + * unlike CR2 and DR6, the value is not a payload that is attached to + * the #NM exception. */ - if (vcpu->arch.guest_fpu.fpstate->xfd) + if (is_xfd_nm_fault(vcpu)) rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } -static void handle_exception_irqoff(struct vcpu_vmx *vmx) +static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) { - u32 intr_info = vmx_get_intr_info(&vmx->vcpu); - /* if exit due to PF check for async PF */ if (is_page_fault(intr_info)) - vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); + vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* if exit due to NM, handle before interrupts are enabled */ else if (is_nm_fault(intr_info)) - handle_nm_fault_irqoff(&vmx->vcpu); + handle_nm_fault_irqoff(vcpu); /* Handle machine checks before interrupts are enabled */ else if (is_machine_check(intr_info)) kvm_machine_check(); } -static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) +static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, + u32 intr_info) { - u32 intr_info = vmx_get_intr_info(vcpu); unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, @@ -6996,7 +7087,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) vcpu->arch.at_instruction_boundary = true; } -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) +void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7004,16 +7095,16 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) return; if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) - handle_external_interrupt_irqoff(vcpu); + handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) - handle_exception_irqoff(vmx); + handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); } /* * The kvm parameter can be NULL (module initialization, or invocation before * VM creation). Be sure to check the kvm parameter before using it. */ -static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) +bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) { switch (index) { case MSR_IA32_SMBASE: @@ -7111,13 +7202,17 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; - case INTR_TYPE_HARD_EXCEPTION: - if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(error_code_field); - kvm_requeue_exception_e(vcpu, vector, err); - } else - kvm_requeue_exception(vcpu, vector); + case INTR_TYPE_HARD_EXCEPTION: { + u32 error_code = 0; + + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) + error_code = vmcs_read32(error_code_field); + + kvm_requeue_exception(vcpu, vector, + idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK, + error_code); break; + } case INTR_TYPE_SOFT_INTR: vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); fallthrough; @@ -7136,7 +7231,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) IDT_VECTORING_ERROR_CODE); } -static void vmx_cancel_injection(struct kvm_vcpu *vcpu) +void vmx_cancel_injection(struct kvm_vcpu *vcpu) { __vmx_complete_interrupts(vcpu, vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), @@ -7244,6 +7339,8 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); + case EXIT_REASON_HLT: + return handle_fastpath_hlt(vcpu); default: return EXIT_FASTPATH_NONE; } @@ -7306,7 +7403,7 @@ out: guest_state_exit_irqoff(); } -static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) +fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; @@ -7371,10 +7468,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) vmx->loaded_vmcs->host_state.cr4 = cr4; } - /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - set_debugreg(vcpu->arch.dr6, 6); - /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -7410,8 +7503,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) } /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (vmx->host_debugctlmsr) - update_debugctlmsr(vmx->host_debugctlmsr); + if (vcpu->arch.host_debugctl) + update_debugctlmsr(vcpu->arch.host_debugctl); #ifndef CONFIG_X86_64 /* @@ -7461,7 +7554,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); } -static void vmx_vcpu_free(struct kvm_vcpu *vcpu) +void vmx_vcpu_free(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7470,9 +7563,10 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu) free_vpid(vmx->vpid); nested_vmx_free_vcpu(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); + free_page((unsigned long)vmx->ve_info); } -static int vmx_vcpu_create(struct kvm_vcpu *vcpu) +int vmx_vcpu_create(struct kvm_vcpu *vcpu) { struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; @@ -7563,6 +7657,20 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu) goto free_vmcs; } + err = -ENOMEM; + if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) { + struct page *page; + + BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE); + + /* ve_info must be page aligned. */ + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!page) + goto free_vmcs; + + vmx->ve_info = page_to_virt(page); + } + if (vmx_can_use_ipiv(vcpu)) WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); @@ -7581,7 +7689,7 @@ free_vpid: #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" -static int vmx_vm_init(struct kvm *kvm) +int vmx_vm_init(struct kvm *kvm) { if (!ple_gap) kvm->arch.pause_in_guest = true; @@ -7612,41 +7720,25 @@ static int vmx_vm_init(struct kvm *kvm) return 0; } -static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) +u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { - /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in - * memory aliases with conflicting memory types and sometimes MCEs. - * We have to be careful as to what are honored and when. - * - * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to - * UC. The effective memory type is UC or WC depending on guest PAT. - * This was historically the source of MCEs and we want to be - * conservative. - * - * When there is no need to deal with noncoherent DMA (e.g., no VT-d - * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The - * EPT memory type is set to WB. The effective memory type is forced - * WB. - * - * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The - * EPT memory type is used to emulate guest CD/MTRR. + /* + * Force UC for host MMIO regions, as allowing the guest to access MMIO + * with cacheable accesses will result in Machine Checks. */ - if (is_mmio) return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; + /* + * Force WB and ignore guest PAT if the VM does NOT have a non-coherent + * device attached. Letting the guest control memory types on Intel + * CPUs may result in unexpected behavior, and so KVM's ABI is to trust + * the guest to behave only as a last resort. + */ if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; - if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) { - if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT; - else - return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) | - VMX_EPT_IPAT_BIT; - } - - return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; + return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); } static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) @@ -7784,7 +7876,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } -static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7793,12 +7885,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be * set if and only if XSAVE is supported. */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES); - - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX); - kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM); + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE)) + guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES); vmx_setup_uret_msrs(vmx); @@ -7806,7 +7894,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmcs_set_secondary_exec_control(vmx, vmx_secondary_exec_control(vmx)); - if (guest_can_use(vcpu, X86_FEATURE_VMX)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; @@ -7815,25 +7903,25 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); - if (guest_can_use(vcpu, X86_FEATURE_VMX)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) nested_vmx_cr_fixed1_bits_update(vcpu); if (boot_cpu_has(X86_FEATURE_INTEL_PT) && - guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) + guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT)) update_intel_pt_cfg(vcpu); if (boot_cpu_has(X86_FEATURE_RTM)) { struct vmx_uret_msr *msr; msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); if (msr) { - bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); + bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM); vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); } } if (kvm_cpu_cap_has(X86_FEATURE_XFD)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); if (boot_cpu_has(X86_FEATURE_IBPB)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, @@ -7841,17 +7929,17 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, - !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); set_cr4_guest_host_mask(vmx); vmx_write_encls_bitmap(vcpu, NULL); - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; else vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; - if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC)) vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_LC_ENABLED; else @@ -7862,10 +7950,9 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx_update_exception_bitmap(vcpu); } -static u64 vmx_get_perf_capabilities(void) +static __init u64 vmx_get_perf_capabilities(void) { u64 perf_cap = PMU_CAP_FW_WRITES; - struct x86_pmu_lbr lbr; u64 host_perf_cap = 0; if (!enable_pmu) @@ -7875,15 +7962,43 @@ static u64 vmx_get_perf_capabilities(void) rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { - x86_perf_get_lbr(&lbr); - if (lbr.nr) + x86_perf_get_lbr(&vmx_lbr_caps); + + /* + * KVM requires LBR callstack support, as the overhead due to + * context switching LBRs without said support is too high. + * See intel_pmu_create_guest_lbr_event() for more info. + */ + if (!vmx_lbr_caps.has_callstack) + memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps)); + else if (vmx_lbr_caps.nr) perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; } if (vmx_pebs_supported()) { perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; - if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4) - perf_cap &= ~PERF_CAP_PEBS_BASELINE; + + /* + * Disallow adaptive PEBS as it is functionally broken, can be + * used by the guest to read *host* LBRs, and can be used to + * bypass userspace event filters. To correctly and safely + * support adaptive PEBS, KVM needs to: + * + * 1. Account for the ADAPTIVE flag when (re)programming fixed + * counters. + * + * 2. Gain support from perf (or take direct control of counter + * programming) to support events without adaptive PEBS + * enabled for the hardware counter. + * + * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with + * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. + * + * 4. Document which PMU events are effectively exposed to the + * guest via adaptive PEBS, and make adaptive PEBS mutually + * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. + */ + perf_cap &= ~PERF_CAP_PEBS_BASELINE; } return perf_cap; @@ -7918,6 +8033,7 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); kvm_cpu_cap_clear(X86_FEATURE_SGX1); kvm_cpu_cap_clear(X86_FEATURE_SGX2); + kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA); } if (vmx_umip_emulated()) @@ -7938,66 +8054,86 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } -static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, - struct x86_instruction_info *info) +static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + unsigned long *exit_qualification) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); unsigned short port; - bool intercept; int size; + bool imm; + + /* + * If the 'use IO bitmaps' VM-execution control is 0, IO instruction + * VM-exits depend on the 'unconditional IO exiting' VM-execution + * control. + * + * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. + */ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); if (info->intercept == x86_intercept_in || info->intercept == x86_intercept_ins) { port = info->src_val; size = info->dst_bytes; + imm = info->src_type == OP_IMM; } else { port = info->dst_val; size = info->src_bytes; + imm = info->dst_type == OP_IMM; } - /* - * If the 'use IO bitmaps' VM-execution control is 0, IO instruction - * VM-exits depend on the 'unconditional IO exiting' VM-execution - * control. - * - * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. - */ - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) - intercept = nested_cpu_has(vmcs12, - CPU_BASED_UNCOND_IO_EXITING); - else - intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); - /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ - return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; + *exit_qualification = ((unsigned long)port << 16) | (size - 1); + + if (info->intercept == x86_intercept_ins || + info->intercept == x86_intercept_outs) + *exit_qualification |= BIT(4); + + if (info->rep_prefix) + *exit_qualification |= BIT(5); + + if (imm) + *exit_qualification |= BIT(6); + + return nested_vmx_check_io_bitmaps(vcpu, port, size); } -static int vmx_check_intercept(struct kvm_vcpu *vcpu, - struct x86_instruction_info *info, - enum x86_intercept_stage stage, - struct x86_exception *exception) +int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage, + struct x86_exception *exception) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned long exit_qualification = 0; + u32 vm_exit_reason; + u64 exit_insn_len; switch (info->intercept) { - /* - * RDPID causes #UD if disabled through secondary execution controls. - * Because it is marked as EmulateOnUD, we need to intercept it here. - * Note, RDPID is hidden behind ENABLE_RDTSCP. - */ case x86_intercept_rdpid: + /* + * RDPID causes #UD if not enabled through secondary execution + * controls (ENABLE_RDTSCP). Note, the implicit MSR access to + * TSC_AUX is NOT subject to interception, i.e. checking only + * the dedicated execution control is architecturally correct. + */ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { exception->vector = UD_VECTOR; exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; } - break; + return X86EMUL_CONTINUE; case x86_intercept_in: case x86_intercept_ins: case x86_intercept_out: case x86_intercept_outs: - return vmx_check_intercept_io(vcpu, info); + if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification)) + return X86EMUL_CONTINUE; + + vm_exit_reason = EXIT_REASON_IO_INSTRUCTION; + break; case x86_intercept_lgdt: case x86_intercept_lidt: @@ -8010,7 +8146,24 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) return X86EMUL_CONTINUE; - /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ + if (info->intercept == x86_intercept_lldt || + info->intercept == x86_intercept_ltr || + info->intercept == x86_intercept_sldt || + info->intercept == x86_intercept_str) + vm_exit_reason = EXIT_REASON_LDTR_TR; + else + vm_exit_reason = EXIT_REASON_GDTR_IDTR; + /* + * FIXME: Decode the ModR/M to generate the correct exit + * qualification for memory operands. + */ + break; + + case x86_intercept_hlt: + if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING)) + return X86EMUL_CONTINUE; + + vm_exit_reason = EXIT_REASON_HLT; break; case x86_intercept_pause: @@ -8023,17 +8176,24 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, * the PAUSE. */ if ((info->rep_prefix != REPE_PREFIX) || - !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) + !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING)) return X86EMUL_CONTINUE; + vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION; break; /* TODO: check more intercepts... */ default: - break; + return X86EMUL_UNHANDLEABLE; } - return X86EMUL_UNHANDLEABLE; + exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip); + if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH) + return X86EMUL_UNHANDLEABLE; + + __nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification, + exit_insn_len); + return X86EMUL_INTERCEPTED; } #ifdef CONFIG_X86_64 @@ -8055,8 +8215,8 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, return 0; } -static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, - bool *expired) +int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired) { struct vcpu_vmx *vmx; u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; @@ -8095,18 +8255,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, return 0; } -static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) +void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) { to_vmx(vcpu)->hv_deadline_tsc = -1; } #endif -static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - if (!kvm_pause_in_guest(vcpu->kvm)) - shrink_ple_window(vcpu); -} - void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8130,7 +8284,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } -static void vmx_setup_mce(struct kvm_vcpu *vcpu) +void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= @@ -8141,7 +8295,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) } #ifdef CONFIG_KVM_SMM -static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ if (to_vmx(vcpu)->nested.nested_run_pending) @@ -8149,7 +8303,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) return !is_smm(vcpu); } -static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) +int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8170,7 +8324,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) return 0; } -static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) +int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; @@ -8191,18 +8345,18 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) return 0; } -static void vmx_enable_smi_window(struct kvm_vcpu *vcpu) +void vmx_enable_smi_window(struct kvm_vcpu *vcpu) { /* RSM will cause a vmexit anyway. */ } #endif -static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) +bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) { return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); } -static void vmx_migrate_timers(struct kvm_vcpu *vcpu) +void vmx_migrate_timers(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; @@ -8212,7 +8366,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu) } } -static void vmx_hardware_unsetup(void) +void vmx_hardware_unsetup(void) { kvm_set_posted_intr_wakeup_handler(NULL); @@ -8222,18 +8376,7 @@ static void vmx_hardware_unsetup(void) free_kvm_area(); } -#define VMX_REQUIRED_APICV_INHIBITS \ -( \ - BIT(APICV_INHIBIT_REASON_DISABLE)| \ - BIT(APICV_INHIBIT_REASON_ABSENT) | \ - BIT(APICV_INHIBIT_REASON_HYPERV) | \ - BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ - BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ - BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ - BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \ -) - -static void vmx_vm_destroy(struct kvm *kvm) +void vmx_vm_destroy(struct kvm *kvm) { struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); @@ -8284,148 +8427,6 @@ gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63)); } -static struct kvm_x86_ops vmx_x86_ops __initdata = { - .name = KBUILD_MODNAME, - - .check_processor_compatibility = vmx_check_processor_compat, - - .hardware_unsetup = vmx_hardware_unsetup, - - .hardware_enable = vmx_hardware_enable, - .hardware_disable = vmx_hardware_disable, - .has_emulated_msr = vmx_has_emulated_msr, - - .vm_size = sizeof(struct kvm_vmx), - .vm_init = vmx_vm_init, - .vm_destroy = vmx_vm_destroy, - - .vcpu_precreate = vmx_vcpu_precreate, - .vcpu_create = vmx_vcpu_create, - .vcpu_free = vmx_vcpu_free, - .vcpu_reset = vmx_vcpu_reset, - - .prepare_switch_to_guest = vmx_prepare_switch_to_guest, - .vcpu_load = vmx_vcpu_load, - .vcpu_put = vmx_vcpu_put, - - .update_exception_bitmap = vmx_update_exception_bitmap, - .get_msr_feature = vmx_get_msr_feature, - .get_msr = vmx_get_msr, - .set_msr = vmx_set_msr, - .get_segment_base = vmx_get_segment_base, - .get_segment = vmx_get_segment, - .set_segment = vmx_set_segment, - .get_cpl = vmx_get_cpl, - .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .is_valid_cr0 = vmx_is_valid_cr0, - .set_cr0 = vmx_set_cr0, - .is_valid_cr4 = vmx_is_valid_cr4, - .set_cr4 = vmx_set_cr4, - .set_efer = vmx_set_efer, - .get_idt = vmx_get_idt, - .set_idt = vmx_set_idt, - .get_gdt = vmx_get_gdt, - .set_gdt = vmx_set_gdt, - .set_dr7 = vmx_set_dr7, - .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, - .cache_reg = vmx_cache_reg, - .get_rflags = vmx_get_rflags, - .set_rflags = vmx_set_rflags, - .get_if_flag = vmx_get_if_flag, - - .flush_tlb_all = vmx_flush_tlb_all, - .flush_tlb_current = vmx_flush_tlb_current, - .flush_tlb_gva = vmx_flush_tlb_gva, - .flush_tlb_guest = vmx_flush_tlb_guest, - - .vcpu_pre_run = vmx_vcpu_pre_run, - .vcpu_run = vmx_vcpu_run, - .handle_exit = vmx_handle_exit, - .skip_emulated_instruction = vmx_skip_emulated_instruction, - .update_emulated_instruction = vmx_update_emulated_instruction, - .set_interrupt_shadow = vmx_set_interrupt_shadow, - .get_interrupt_shadow = vmx_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, - .inject_irq = vmx_inject_irq, - .inject_nmi = vmx_inject_nmi, - .inject_exception = vmx_inject_exception, - .cancel_injection = vmx_cancel_injection, - .interrupt_allowed = vmx_interrupt_allowed, - .nmi_allowed = vmx_nmi_allowed, - .get_nmi_mask = vmx_get_nmi_mask, - .set_nmi_mask = vmx_set_nmi_mask, - .enable_nmi_window = vmx_enable_nmi_window, - .enable_irq_window = vmx_enable_irq_window, - .update_cr8_intercept = vmx_update_cr8_intercept, - .set_virtual_apic_mode = vmx_set_virtual_apic_mode, - .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, - .load_eoi_exitmap = vmx_load_eoi_exitmap, - .apicv_pre_state_restore = vmx_apicv_pre_state_restore, - .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, - .hwapic_irr_update = vmx_hwapic_irr_update, - .hwapic_isr_update = vmx_hwapic_isr_update, - .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, - .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_interrupt = vmx_deliver_interrupt, - .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, - - .set_tss_addr = vmx_set_tss_addr, - .set_identity_map_addr = vmx_set_identity_map_addr, - .get_mt_mask = vmx_get_mt_mask, - - .get_exit_info = vmx_get_exit_info, - - .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, - - .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - - .get_l2_tsc_offset = vmx_get_l2_tsc_offset, - .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, - .write_tsc_offset = vmx_write_tsc_offset, - .write_tsc_multiplier = vmx_write_tsc_multiplier, - - .load_mmu_pgd = vmx_load_mmu_pgd, - - .check_intercept = vmx_check_intercept, - .handle_exit_irqoff = vmx_handle_exit_irqoff, - - .sched_in = vmx_sched_in, - - .cpu_dirty_log_size = PML_ENTITY_NUM, - .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - - .nested_ops = &vmx_nested_ops, - - .pi_update_irte = vmx_pi_update_irte, - .pi_start_assignment = vmx_pi_start_assignment, - -#ifdef CONFIG_X86_64 - .set_hv_timer = vmx_set_hv_timer, - .cancel_hv_timer = vmx_cancel_hv_timer, -#endif - - .setup_mce = vmx_setup_mce, - -#ifdef CONFIG_KVM_SMM - .smi_allowed = vmx_smi_allowed, - .enter_smm = vmx_enter_smm, - .leave_smm = vmx_leave_smm, - .enable_smi_window = vmx_enable_smi_window, -#endif - - .check_emulate_instruction = vmx_check_emulate_instruction, - .apic_init_signal_blocked = vmx_apic_init_signal_blocked, - .migrate_timers = vmx_migrate_timers, - - .msr_filter_changed = vmx_msr_filter_changed, - .complete_emulated_msr = kvm_complete_insn_gp, - - .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, - - .get_untagged_addr = vmx_get_untagged_addr, -}; - static unsigned int vmx_handle_intel_pt_intr(void) { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); @@ -8471,18 +8472,16 @@ static void __init vmx_setup_me_spte_mask(void) u64 me_mask = 0; /* - * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use - * the former to avoid exposing shadow_phys_bits. - * * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to - * shadow_phys_bits. On MKTME and/or TDX capable systems, + * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems, * boot_cpu_data.x86_phys_bits holds the actual physical address - * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR - * reported by CPUID. Those bits between are KeyID bits. + * w/o the KeyID bits, and kvm_host.maxphyaddr equals to + * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits. */ - if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits()) + if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr) me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, - kvm_get_shadow_phys_bits() - 1); + kvm_host.maxphyaddr - 1); + /* * Unlike SME, host kernel doesn't support setting up any * MKTME KeyID on Intel platforms. No memory encryption @@ -8491,9 +8490,7 @@ static void __init vmx_setup_me_spte_mask(void) kvm_mmu_set_me_spte_mask(0, me_mask); } -static struct kvm_x86_init_ops vmx_init_ops __initdata; - -static __init int hardware_setup(void) +__init int vmx_hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; @@ -8507,10 +8504,6 @@ static __init int hardware_setup(void) if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; - if (cpu_has_perf_global_ctrl_bug()) - pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " - "does not work properly. Using workaround\n"); - if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@ -8562,16 +8555,16 @@ static __init int hardware_setup(void) * using the APIC_ACCESS_ADDR VMCS field. */ if (!flexpriority_enabled) - vmx_x86_ops.set_apic_access_page_addr = NULL; + vt_x86_ops.set_apic_access_page_addr = NULL; if (!cpu_has_vmx_tpr_shadow()) - vmx_x86_ops.update_cr8_intercept = NULL; + vt_x86_ops.update_cr8_intercept = NULL; #if IS_ENABLED(CONFIG_HYPERV) if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH && enable_ept) { - vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; - vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; + vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; + vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; } #endif @@ -8586,7 +8579,7 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_apicv()) enable_apicv = 0; if (!enable_apicv) - vmx_x86_ops.sync_pir_to_irr = NULL; + vt_x86_ops.sync_pir_to_irr = NULL; if (!enable_apicv || !cpu_has_vmx_ipiv()) enable_ipiv = false; @@ -8622,7 +8615,7 @@ static __init int hardware_setup(void) enable_pml = 0; if (!enable_pml) - vmx_x86_ops.cpu_dirty_log_size = 0; + vt_x86_ops.cpu_dirty_log_size = 0; if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; @@ -8631,7 +8624,7 @@ static __init int hardware_setup(void) u64 use_timer_freq = 5000ULL * 1000 * 1000; cpu_preemption_timer_multi = - vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + vmx_misc_preemption_timer_rate(vmcs_config.misc); if (tsc_khz) use_timer_freq = (u64)tsc_khz * 1000; @@ -8647,8 +8640,8 @@ static __init int hardware_setup(void) } if (!enable_preemption_timer) { - vmx_x86_ops.set_hv_timer = NULL; - vmx_x86_ops.cancel_hv_timer = NULL; + vt_x86_ops.set_hv_timer = NULL; + vt_x86_ops.cancel_hv_timer = NULL; } kvm_caps.supported_mce_cap |= MCG_LMCE_P; @@ -8659,9 +8652,9 @@ static __init int hardware_setup(void) if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; if (pt_mode == PT_MODE_HOST_GUEST) - vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; + vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; else - vmx_init_ops.handle_intel_pt_intr = NULL; + vt_init_ops.handle_intel_pt_intr = NULL; setup_default_sgx_lepubkeyhash(); @@ -8684,14 +8677,6 @@ static __init int hardware_setup(void) return r; } -static struct kvm_x86_init_ops vmx_init_ops __initdata = { - .hardware_setup = hardware_setup, - .handle_intel_pt_intr = NULL, - - .runtime_ops = &vmx_x86_ops, - .pmu_ops = &intel_pmu_ops, -}; - static void vmx_cleanup_l1d_flush(void) { if (vmx_l1d_flush_pages) { @@ -8706,17 +8691,15 @@ static void __vmx_exit(void) { allow_smaller_maxphyaddr = false; - cpu_emergency_unregister_virt_callback(vmx_emergency_disable); - vmx_cleanup_l1d_flush(); } -static void vmx_exit(void) +static void __exit vmx_exit(void) { kvm_exit(); + __vmx_exit(); kvm_x86_vendor_exit(); - __vmx_exit(); } module_exit(vmx_exit); @@ -8733,7 +8716,7 @@ static int __init vmx_init(void) */ hv_init_evmcs(); - r = kvm_x86_vendor_init(&vmx_init_ops); + r = kvm_x86_vendor_init(&vt_init_ops); if (r) return r; @@ -8754,8 +8737,6 @@ static int __init vmx_init(void) pi_init_cpu(cpu); } - cpu_emergency_register_virt_callback(vmx_emergency_disable); - vmx_check_vmcs12_offsets(); /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 65786dbe7d60..951e44dc9d0e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -7,18 +7,15 @@ #include <asm/kvm.h> #include <asm/intel_pt.h> #include <asm/perf_event.h> +#include <asm/posted_intr.h> #include "capabilities.h" #include "../kvm_cache_regs.h" -#include "posted_intr.h" #include "vmcs.h" #include "vmx_ops.h" #include "../cpuid.h" #include "run_flags.h" - -#define MSR_TYPE_R 1 -#define MSR_TYPE_W 2 -#define MSR_TYPE_RW 3 +#include "../mmu.h" #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) @@ -109,6 +106,8 @@ struct lbr_desc { bool msr_passthrough; }; +extern struct x86_pmu_lbr vmx_lbr_caps; + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -177,6 +176,7 @@ struct nested_vmx { bool reload_vmcs01_apic_access_page; bool update_vmcs01_cpu_dirty_logging; bool update_vmcs01_apicv_status; + bool update_vmcs01_hwapic_isr; /* * Enlightened VMCS has been enabled. It does not mean that L1 has to @@ -201,8 +201,6 @@ struct nested_vmx { struct kvm_host_map virtual_apic_map; struct kvm_host_map pi_desc_map; - struct kvm_host_map msr_bitmap_map; - struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; @@ -333,14 +331,15 @@ struct vcpu_vmx { bool ple_window_dirty; /* Support for PML */ -#define PML_ENTITY_NUM 512 +#define PML_LOG_NR_ENTRIES 512 + /* PML is written backwards: this is the first entry written by the CPU */ +#define PML_HEAD_INDEX (PML_LOG_NR_ENTRIES-1) + struct page *pml_pg; /* apic deadline value in host tsc */ u64 hv_deadline_tsc; - unsigned long host_debugctlmsr; - /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included @@ -362,6 +361,9 @@ struct vcpu_vmx { DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS); DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS); } shadow_msr_intercept; + + /* ve_info must be page aligned. */ + struct vmx_ve_information *ve_info; }; struct kvm_vmx { @@ -383,6 +385,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, unsigned long fs_base, unsigned long gs_base); int vmx_get_cpl(struct kvm_vcpu *vcpu); +int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu); bool vmx_emulation_required(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); @@ -400,6 +403,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); +bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); @@ -574,7 +578,8 @@ static inline u8 vmx_get_rvi(void) SECONDARY_EXEC_ENABLE_VMFUNC | \ SECONDARY_EXEC_BUS_LOCK_DETECTION | \ SECONDARY_EXEC_NOTIFY_VM_EXITING | \ - SECONDARY_EXEC_ENCLS_EXITING) + SECONDARY_EXEC_ENCLS_EXITING | \ + SECONDARY_EXEC_EPT_VIOLATION_VE) #define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0 #define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \ @@ -719,7 +724,8 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) if (!enable_ept) return true; - return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; + return allow_smaller_maxphyaddr && + cpuid_maxphyaddr(vcpu) < kvm_host.maxphyaddr; } static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu) @@ -747,4 +753,9 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && enable_ipiv; } +static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h index eb48153bfd73..cdf8cbb69209 100644 --- a/arch/x86/kvm/vmx/vmx_onhyperv.h +++ b/arch/x86/kvm/vmx/vmx_onhyperv.h @@ -3,7 +3,7 @@ #ifndef __ARCH_X86_KVM_VMX_ONHYPERV_H__ #define __ARCH_X86_KVM_VMX_ONHYPERV_H__ -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> #include <asm/mshyperv.h> #include <linux/jump_label.h> @@ -104,6 +104,14 @@ static inline void evmcs_load(u64 phys_addr) struct hv_vp_assist_page *vp_ap = hv_get_vp_assist_page(smp_processor_id()); + /* + * When enabling eVMCS, KVM verifies that every CPU has a valid hv_vp_assist_page() + * and aborts enabling the feature otherwise. CPU onlining path is also checked in + * vmx_hardware_enable(). + */ + if (KVM_BUG_ON(!vp_ap, kvm_get_running_vcpu()->kvm)) + return; + if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall) vp_ap->nested_control.features.directhypercall = 1; vp_ap->current_nested_vmcs = phys_addr; diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 8060e5fc6dbd..96677576c836 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -15,7 +15,7 @@ void vmwrite_error(unsigned long field, unsigned long value); void vmclear_error(struct vmcs *vmcs, u64 phys_addr); void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); void invvpid_error(unsigned long ext, u16 vpid, gva_t gva); -void invept_error(unsigned long ext, u64 eptp, gpa_t gpa); +void invept_error(unsigned long ext, u64 eptp); #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT /* @@ -47,7 +47,7 @@ static __always_inline void vmcs_check16(unsigned long field) BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001, "16-bit accessor invalid for 64-bit high field"); BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000, - "16-bit accessor invalid for 32-bit high field"); + "16-bit accessor invalid for 32-bit field"); BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000, "16-bit accessor invalid for natural width field"); } @@ -118,7 +118,7 @@ do_exception: #else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ - asm volatile("1: vmread %2, %1\n\t" + asm volatile("1: vmread %[field], %[output]\n\t" ".byte 0x3e\n\t" /* branch taken hint */ "ja 3f\n\t" @@ -127,24 +127,26 @@ do_exception: * @field, and bounce through the trampoline to preserve * volatile registers. */ - "xorl %k1, %k1\n\t" + "xorl %k[output], %k[output]\n\t" "2:\n\t" - "push %1\n\t" - "push %2\n\t" + "push %[output]\n\t" + "push %[field]\n\t" "call vmread_error_trampoline\n\t" /* * Unwind the stack. Note, the trampoline zeros out the * memory for @fault so that the result is '0' on error. */ - "pop %2\n\t" - "pop %1\n\t" + "pop %[field]\n\t" + "pop %[output]\n\t" "3:\n\t" /* VMREAD faulted. As above, except push '1' for @fault. */ - _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1) + _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %[output]) - : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc"); + : ASM_CALL_CONSTRAINT, [output] "=&r" (value) + : [field] "r" (field) + : "cc"); return value; #endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ @@ -312,13 +314,13 @@ static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva) vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva); } -static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa) +static inline void __invept(unsigned long ext, u64 eptp) { struct { - u64 eptp, gpa; - } operand = {eptp, gpa}; - - vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa); + u64 eptp; + u64 reserved_0; + } operand = { eptp, 0 }; + vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp); } static inline void vpid_sync_vcpu_single(int vpid) @@ -355,13 +357,13 @@ static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr) static inline void ept_sync_global(void) { - __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0); + __invept(VMX_EPT_EXTENT_GLOBAL, 0); } static inline void ept_sync_context(u64 eptp) { if (cpu_has_vmx_invept_context()) - __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); + __invept(VMX_EPT_EXTENT_CONTEXT, eptp); else ept_sync_global(); } diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h new file mode 100644 index 000000000000..430773a5ef8e --- /dev/null +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_X86_OPS_H +#define __KVM_X86_VMX_X86_OPS_H + +#include <linux/kvm_host.h> + +#include "x86.h" + +__init int vmx_hardware_setup(void); + +extern struct kvm_x86_ops vt_x86_ops __initdata; +extern struct kvm_x86_init_ops vt_init_ops __initdata; + +void vmx_hardware_unsetup(void); +int vmx_check_processor_compat(void); +int vmx_enable_virtualization_cpu(void); +void vmx_disable_virtualization_cpu(void); +void vmx_emergency_disable_virtualization_cpu(void); +int vmx_vm_init(struct kvm *kvm); +void vmx_vm_destroy(struct kvm *kvm); +int vmx_vcpu_precreate(struct kvm *kvm); +int vmx_vcpu_create(struct kvm_vcpu *vcpu); +int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu); +fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit); +void vmx_vcpu_free(struct kvm_vcpu *vcpu); +void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); +void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void vmx_vcpu_put(struct kvm_vcpu *vcpu); +int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath); +void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu); +int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu); +void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu); +int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +#ifdef CONFIG_KVM_SMM +int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection); +int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram); +int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram); +void vmx_enable_smi_window(struct kvm_vcpu *vcpu); +#endif +int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len); +int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage, + struct x86_exception *exception); +bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu); +void vmx_migrate_timers(struct kvm_vcpu *vcpu); +void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); +void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu); +void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); +int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu); +void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector); +void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu); +bool vmx_has_emulated_msr(struct kvm *kvm, u32 index); +void vmx_msr_filter_changed(struct kvm_vcpu *vcpu); +void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu); +void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); +int vmx_get_feature_msr(u32 msr, u64 *data); +int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg); +void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +int vmx_get_cpl(struct kvm_vcpu *vcpu); +void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); +bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); +void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); +void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val); +void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val); +void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu); +void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg); +unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu); +void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); +bool vmx_get_if_flag(struct kvm_vcpu *vcpu); +void vmx_flush_tlb_all(struct kvm_vcpu *vcpu); +void vmx_flush_tlb_current(struct kvm_vcpu *vcpu); +void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr); +void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu); +void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); +u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); +void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall); +void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected); +void vmx_inject_nmi(struct kvm_vcpu *vcpu); +void vmx_inject_exception(struct kvm_vcpu *vcpu); +void vmx_cancel_injection(struct kvm_vcpu *vcpu); +int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection); +int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection); +bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); +void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked); +void vmx_enable_nmi_window(struct kvm_vcpu *vcpu); +void vmx_enable_irq_window(struct kvm_vcpu *vcpu); +void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr); +void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu); +void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); +void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); +int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); +int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr); +u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + +void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code); +void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code); + +u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu); +u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu); +void vmx_write_tsc_offset(struct kvm_vcpu *vcpu); +void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu); +void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); +#ifdef CONFIG_X86_64 +int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, + bool *expired); +void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu); +#endif +void vmx_setup_mce(struct kvm_vcpu *vcpu); + +#endif /* __KVM_X86_VMX_X86_OPS_H */ |