diff options
Diffstat (limited to 'arch/x86/kvm/vmx/nested.c')
| -rw-r--r-- | arch/x86/kvm/vmx/nested.c | 1372 |
1 files changed, 923 insertions, 449 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0c62352dda6a..40777278eabb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1,27 +1,30 @@ // SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/objtool.h> #include <linux/percpu.h> #include <asm/debugreg.h> #include <asm/mmu_context.h> +#include <asm/msr.h> +#include "x86.h" #include "cpuid.h" -#include "evmcs.h" #include "hyperv.h" #include "mmu.h" #include "nested.h" #include "pmu.h" +#include "posted_intr.h" #include "sgx.h" #include "trace.h" #include "vmx.h" -#include "x86.h" +#include "smm.h" static bool __read_mostly enable_shadow_vmcs = 1; module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); -static bool __read_mostly nested_early_check = 0; -module_param(nested_early_check, bool, S_IRUGO); +static bool __ro_after_init warn_on_missed_cc; +module_param(warn_on_missed_cc, bool, 0444); #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK @@ -178,7 +181,7 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu, * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all * fields and thus must be synced. */ - if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID) + if (nested_vmx_is_evmptr12_set(to_vmx(vcpu))) to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; return kvm_skip_emulated_instruction(vcpu); @@ -193,7 +196,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) * can't be done if there isn't a current VMCS. */ if (vmx->nested.current_vmptr == INVALID_GPA && - !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + !nested_vmx_is_evmptr12_valid(vmx)) return nested_vmx_failInvalid(vcpu); return nested_vmx_failValid(vcpu, vm_instruction_error); @@ -203,7 +206,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); + pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) @@ -225,14 +228,47 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) { +#ifdef CONFIG_KVM_HYPERV + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { - kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); - vmx->nested.hv_evmcs = NULL; + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); + vmx->nested.hv_evmcs = NULL; + vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; + + if (hv_vcpu) { + hv_vcpu->nested.pa_page_gpa = INVALID_GPA; + hv_vcpu->nested.vm_id = 0; + hv_vcpu->nested.vp_id = 0; } +#endif +} - vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; +static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) +{ +#ifdef CONFIG_KVM_HYPERV + struct vcpu_vmx *vmx = to_vmx(vcpu); + /* + * When Enlightened VMEntry is enabled on the calling CPU we treat + * memory area pointer by vmptr as Enlightened VMCS (as there's no good + * way to distinguish it from VMCS12) and we must not corrupt it by + * writing to the non-existent 'launch_state' field. The area doesn't + * have to be the currently active EVMCS on the calling CPU and there's + * nothing KVM has to do to transition it from 'active' to 'non-active' + * state. It is possible that the area will stay mapped as + * vmx->nested.hv_evmcs but this shouldn't be a problem. + */ + if (!guest_cpu_cap_has_evmcs(vcpu) || + !evmptr_is_valid(nested_get_evmptr(vcpu))) + return false; + + if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) + nested_release_evmcs(vcpu); + + return true; +#else + return false; +#endif } static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, @@ -240,7 +276,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, { struct vmcs_host_state *dest, *src; - if (unlikely(!vmx->guest_state_loaded)) + if (unlikely(!vmx->vt.guest_state_loaded)) return; src = &prev->host_state; @@ -266,7 +302,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) cpu = get_cpu(); prev = vmx->loaded_vmcs; vmx->loaded_vmcs = vmcs; - vmx_vcpu_load_vmcs(vcpu, cpu, prev); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_sync_vmcs_host_state(vmx, prev); put_cpu(); @@ -279,6 +315,16 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vcpu->arch.regs_dirty = 0; } +static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); + vmx->nested.pi_desc = NULL; +} + /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. @@ -311,15 +357,8 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); vmx->nested.cached_shadow_vmcs12 = NULL; - /* - * Unpin physical memory we referred to in the vmcs02. The APIC access - * page's backing page (yeah, confusing) shouldn't actually be accessed, - * and if it is written, the contents are irrelevant. - */ - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); - vmx->nested.pi_desc = NULL; + + nested_put_vmcs12_pages(vcpu); kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); @@ -350,6 +389,7 @@ static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, gpa_t addr) { + unsigned long roots = 0; uint i; struct kvm_mmu_root_info *cached_root; @@ -360,8 +400,10 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, eptp)) - vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); + roots |= KVM_MMU_ROOT_PREVIOUS(i); } + if (roots) + kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); } static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, @@ -369,18 +411,40 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long exit_qualification; u32 vm_exit_reason; - unsigned long exit_qualification = vcpu->arch.exit_qualification; if (vmx->nested.pml_full) { vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; - exit_qualification &= INTR_INFO_UNBLOCK_NMI; + + /* + * It should be impossible to trigger a nested PML Full VM-Exit + * for anything other than an EPT Violation from L2. KVM *can* + * trigger nEPT page fault injection in response to an EPT + * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT + * tables also changed, but KVM should not treat EPT Misconfig + * VM-Exits as writes. + */ + WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION); + + /* + * PML Full and EPT Violation VM-Exits both use bit 12 to report + * "NMI unblocking due to IRET", i.e. the bit can be propagated + * as-is from the original EXIT_QUALIFICATION. + */ + exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; } else { - if (fault->error_code & PFERR_RSVD_MASK) + if (fault->error_code & PFERR_RSVD_MASK) { vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; - else + exit_qualification = 0; + } else { + exit_qualification = fault->exit_qualification; + exit_qualification |= vmx_get_exit_qual(vcpu) & + (EPT_VIOLATION_GVA_IS_VALID | + EPT_VIOLATION_GVA_TRANSLATED); vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + } /* * Although the caller (kvm_inject_emulated_page_fault) would @@ -491,6 +555,9 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) return -EINVAL; + if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4)) + return -EINVAL; + return 0; } @@ -561,8 +628,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; - struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; + struct kvm_host_map map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || @@ -577,15 +643,18 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature * and tells KVM (L0) there were no changes in MSR bitmap for L2. */ - if (!vmx->nested.force_msr_bitmap_recalc && evmcs && - evmcs->hv_enlightenments_control.msr_bitmap && - evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) - return true; + if (!vmx->nested.force_msr_bitmap_recalc) { + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); + + if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && + evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) + return true; + } - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) + if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) return false; - msr_bitmap_l1 = (unsigned long *)map->hva; + msr_bitmap_l1 = (unsigned long *)map.hva; /* * To keep the control flow simple, pay eight 8-byte writes (sixteen @@ -646,7 +715,34 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PRED_CMD, MSR_TYPE_W); - kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_FLUSH_CMD, MSR_TYPE_W); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_APERF, MSR_TYPE_R); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_MPERF, MSR_TYPE_R); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_U_CET, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_S_CET, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL0_SSP, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL1_SSP, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL2_SSP, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL3_SSP, MSR_TYPE_RW); + + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; @@ -668,7 +764,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; - kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + kvm_read_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), VMCS12_SIZE); } @@ -687,7 +783,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, vmcs12->vmcs_link_pointer, VMCS12_SIZE)) return; - kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), + kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), VMCS12_SIZE); } @@ -756,12 +852,30 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, return 0; } +static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, + vmx->nested.msrs.misc_high); + + return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; +} + static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, u32 count, u64 addr) { if (count == 0) return 0; + /* + * Exceeding the limit results in architecturally _undefined_ behavior, + * i.e. KVM is allowed to do literally anything in response to a bad + * limit. Immediately generate a consistency check so that code that + * consumes the count doesn't need to worry about extreme edge cases. + */ + if (count > nested_vmx_max_atomic_switch_msrs(vcpu)) + return -EINVAL; + if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) return -EINVAL; @@ -872,15 +986,6 @@ static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, return 0; } -static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, - vmx->nested.msrs.misc_high); - - return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; -} - /* * Load guest's/host's msr at nested entry/exit. * return 0 for success, entry index for failure. @@ -897,7 +1002,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { - if (unlikely(i >= max_msr_list_size)) + if (WARN_ON_ONCE(i >= max_msr_list_size)) goto fail; if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), @@ -913,7 +1018,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); goto fail; } - if (kvm_set_msr(vcpu, e.index, e.value)) { + if (kvm_emulate_msr_write(vcpu, e.index, e.value)) { pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); @@ -949,7 +1054,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, } } - if (kvm_get_msr(vcpu, msr_index, data)) { + if (kvm_emulate_msr_read(vcpu, msr_index, data)) { pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, msr_index); return false; @@ -985,7 +1090,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); for (i = 0; i < count; i++) { - if (unlikely(i >= max_msr_list_size)) + if (WARN_ON_ONCE(i >= max_msr_list_size)) return -EINVAL; if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) @@ -1044,9 +1149,9 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, /* * Emulated VMEntry does not fail here. Instead a less * accurate value will be returned by - * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() - * instead of reading the value from the vmcs02 VMExit - * MSR-store area. + * nested_vmx_get_vmexit_msr_value() by reading KVM's + * internal MSR state instead of reading the value from + * the vmcs02 VMExit MSR-store area. */ pr_warn_ratelimited( "Not enough msr entries in msr_autostore. Can't add msr %x\n", @@ -1071,7 +1176,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, bool reload_pdptrs, enum vm_entry_failure_code *entry_failure_code) { - if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { + if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -1125,12 +1230,18 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* Handle pending Hyper-V TLB flush requests */ + kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); + /* - * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings - * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a - * full TLB flush from the guest's perspective. This is required even - * if VPID is disabled in the host as KVM may need to synchronize the - * MMU in response to the guest TLB flush. + * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the + * same VPID as the host, and so architecturally, linear and combined + * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM + * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, + * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This + * is required if VPID is disabled in KVM, as a TLB flush (there are no + * VPIDs) still occurs from L1's perspective, and KVM may need to + * synchronize the MMU in response to the guest TLB flush. * * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. * EPT is a special snowflake, as guest-physical mappings aren't @@ -1180,21 +1291,33 @@ static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) { - const u64 feature_and_reserved = - /* feature (except bit 48; see below) */ - BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | - /* reserved */ - BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | + VMX_BASIC_INOUT | + VMX_BASIC_TRUE_CTLS | + VMX_BASIC_NO_HW_ERROR_CODE_CC; + + const u64 reserved_bits = GENMASK_ULL(63, 57) | + GENMASK_ULL(47, 45) | + BIT_ULL(31); + u64 vmx_basic = vmcs_config.nested.basic; - if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + BUILD_BUG_ON(feature_bits & reserved_bits); + + /* + * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has + * inverted polarity), the incoming value must not set feature bits or + * reserved bits that aren't allowed/supported by KVM. Fields, i.e. + * multi-bit values, are explicitly checked below. + */ + if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits)) return -EINVAL; /* * KVM does not emulate a version of VMX that constrains physical * addresses of VMX structures (e.g. VMCS) to 32-bits. */ - if (data & BIT_ULL(48)) + if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) return -EINVAL; if (vmx_basic_vmcs_revision_id(vmx_basic) != @@ -1263,16 +1386,29 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) { - const u64 feature_and_reserved_bits = - /* feature */ - BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | - BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | - /* reserved */ - GENMASK_ULL(13, 9) | BIT_ULL(31); + const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA | + VMX_MISC_ACTIVITY_HLT | + VMX_MISC_ACTIVITY_SHUTDOWN | + VMX_MISC_ACTIVITY_WAIT_SIPI | + VMX_MISC_INTEL_PT | + VMX_MISC_RDMSR_IN_SMM | + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | + VMX_MISC_VMXOFF_BLOCK_SMI | + VMX_MISC_ZERO_LEN_INS; + + const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9); + u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, vmcs_config.nested.misc_high); - if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + BUILD_BUG_ON(feature_bits & reserved_bits); + + /* + * The incoming value must not set feature bits or reserved bits that + * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are + * explicitly checked below. + */ + if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) return -EINVAL; if ((vmx->nested.msrs.pinbased_ctls_high & @@ -1555,14 +1691,23 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) { +#ifdef CONFIG_KVM_HYPERV struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ vmcs12->tpr_threshold = evmcs->tpr_threshold; vmcs12->guest_rip = evmcs->guest_rip; if (unlikely(!(hv_clean_fields & + HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { + hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; + hv_vcpu->nested.vm_id = evmcs->hv_vm_id; + hv_vcpu->nested.vp_id = evmcs->hv_vp_id; + } + + if (unlikely(!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { vmcs12->guest_rsp = evmcs->guest_rsp; vmcs12->guest_rflags = evmcs->guest_rflags; @@ -1787,12 +1932,16 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields */ return; +#else /* CONFIG_KVM_HYPERV */ + KVM_BUG_ON(1, vmx->vcpu.kvm); +#endif /* CONFIG_KVM_HYPERV */ } static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) { +#ifdef CONFIG_KVM_HYPERV struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; - struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); /* * Should not be changed by KVM: @@ -1961,6 +2110,9 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; return; +#else /* CONFIG_KVM_HYPERV */ + KVM_BUG_ON(1, vmx->vcpu.kvm); +#endif /* CONFIG_KVM_HYPERV */ } /* @@ -1970,14 +2122,16 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( struct kvm_vcpu *vcpu, bool from_launch) { +#ifdef CONFIG_KVM_HYPERV struct vcpu_vmx *vmx = to_vmx(vcpu); bool evmcs_gpa_changed = false; u64 evmcs_gpa; - if (likely(!guest_cpuid_has_evmcs(vcpu))) + if (likely(!guest_cpu_cap_has_evmcs(vcpu))) return EVMPTRLD_DISABLED; - if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) { + evmcs_gpa = nested_get_evmptr(vcpu); + if (!evmptr_is_valid(evmcs_gpa)) { nested_release_evmcs(vcpu); return EVMPTRLD_DISABLED; } @@ -2050,13 +2204,16 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( } return EVMPTRLD_SUCCEEDED; +#else + return EVMPTRLD_DISABLED; +#endif } void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) copy_vmcs12_to_enlightened(vmx); else copy_vmcs12_to_shadow(vmx); @@ -2142,14 +2299,8 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) return; vmx->nested.vmcs02_initialized = true; - /* - * We don't care what the EPTP value is we just need to guarantee - * it's valid so we don't get a false positive when doing early - * consistency checks. - */ - if (enable_ept && nested_early_check) - vmcs_write64(EPT_POINTER, - construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); + if (vmx->ve_info) + vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); /* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc()) @@ -2196,6 +2347,17 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); + /* + * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the + * same VPID as the host. Emulate this behavior by using vpid01 for L2 + * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter + * and VM-Exit are architecturally required to flush VPID=0, but *only* + * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the + * required flushes), but doing so would cause KVM to over-flush. E.g. + * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, + * and then runs L2 X again, then KVM can and should retain TLB entries + * for VPID12=1. + */ if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); @@ -2210,7 +2372,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); - if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) prepare_vmcs02_early_rare(vmx, vmcs12); /* @@ -2222,10 +2384,12 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 /* Posted interrupts setting is only taken from vmcs12. */ vmx->nested.pi_pending = false; - if (nested_cpu_has_posted_intr(vmcs12)) + if (nested_cpu_has_posted_intr(vmcs12)) { vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - else + } else { + vmx->nested.posted_intr_nv = -1; exec_control &= ~PIN_BASED_POSTED_INTR; + } pin_controls_set(vmx, exec_control); /* @@ -2275,7 +2439,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_RDTSCP | - SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_ENABLE_XSAVES | SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -2296,8 +2460,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() * will not have to rewrite the controls just for this bit. */ - if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && - (vmcs12->guest_cr4 & X86_CR4_UMIP)) + if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) exec_control |= SECONDARY_EXEC_DESC; if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) @@ -2332,7 +2495,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; - if (guest_efer != host_efer) + if (guest_efer != kvm_host.efer) exec_control |= VM_ENTRY_LOAD_IA32_EFER; } vm_entry_controls_set(vmx, exec_control); @@ -2345,7 +2508,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ exec_control = __vm_exit_controls_get(vmcs01); - if (cpu_has_load_ia32_efer() && guest_efer != host_efer) + if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; else exec_control &= ~VM_EXIT_LOAD_IA32_EFER; @@ -2370,12 +2533,39 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 } } +static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet, + u64 *ssp, u64 *ssp_tbl) +{ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || + guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + *s_cet = vmcs_readl(GUEST_S_CET); + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { + *ssp = vmcs_readl(GUEST_SSP); + *ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE); + } +} + +static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet, + u64 ssp, u64 ssp_tbl) +{ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || + guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + vmcs_writel(GUEST_S_CET, s_cet); + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { + vmcs_writel(GUEST_SSP, ssp); + vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl); + } +} + static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { - struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; + struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { + vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); @@ -2413,7 +2603,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - vmx->segment_cache.bitmask = 0; + vmx_segment_cache_clear(vmx); } if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & @@ -2485,6 +2675,10 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) + vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet, + vmcs12->guest_ssp, vmcs12->guest_ssp_tbl); + set_cr4_guest_host_mask(vmx); } @@ -2504,25 +2698,33 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, enum vm_entry_failure_code *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); bool load_guest_pdptrs_vmcs12 = false; - if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { prepare_vmcs02_rare(vmx, vmcs12); vmx->nested.dirty_vmcs12 = false; - load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) || - !(vmx->nested.hv_evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); + load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || + !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); } if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & + vmx_get_supported_debugctl(vcpu, false)); } else { kvm_set_dr(vcpu, 7, vcpu->arch.dr7); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); + vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); } + + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) + vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet, + vmx->nested.pre_vmenter_ssp, + vmx->nested.pre_vmenter_ssp_tbl); + if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); @@ -2541,7 +2743,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); vcpu->arch.pat = vmcs12->guest_ia32_pat; } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); + vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat); } vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( @@ -2563,12 +2765,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, nested_ept_init_mmu_context(vcpu); /* - * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those - * bits which we consider mandatory enabled. - * The CR0_READ_SHADOW is what L2 should have expected to read given - * the specifications by L1; It's not enough to take - * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we - * have more bits than L1 expected. + * Override the CR0/CR4 read shadows after setting the effective guest + * CR0/CR4. The common helpers also set the shadows, but they don't + * account for vmcs12's cr0/4_guest_host_mask. */ vmx_set_cr0(vcpu, vmcs12->guest_cr0); vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); @@ -2620,9 +2819,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && - WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->guest_ia32_perf_global_ctrl))) { + kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && + WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; return -EINVAL; } @@ -2636,9 +2835,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * bits when it changes a field in eVMCS. Mark all fields as clean * here. */ - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) - vmx->nested.hv_evmcs->hv_clean_fields |= - HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; + if (nested_vmx_is_evmptr12_valid(vmx)) + evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; return 0; } @@ -2689,7 +2887,7 @@ static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) } /* Reserved bits should not be set */ - if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) + if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ @@ -2757,6 +2955,10 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, } } + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) && + CC(!vmcs12->tsc_multiplier)) + return -EINVAL; + return 0; } @@ -2801,7 +3003,6 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, u8 vector = intr_info & INTR_INFO_VECTOR_MASK; u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; - bool should_have_error_code; bool urg = nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST); bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; @@ -2818,12 +3019,19 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) return -EINVAL; - /* VM-entry interruption-info field: deliver error code */ - should_have_error_code = - intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && - x86_exception_has_error_code(vector); - if (CC(has_error_code != should_have_error_code)) - return -EINVAL; + /* + * Cannot deliver error code in real mode or if the interrupt + * type is not hardware exception. For other cases, do the + * consistency check only if the vCPU doesn't enumerate + * VMX_BASIC_NO_HW_ERROR_CODE_CC. + */ + if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) { + if (CC(has_error_code)) + return -EINVAL; + } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) { + if (CC(has_error_code != x86_exception_has_error_code(vector))) + return -EINVAL; + } /* VM-entry exception error code */ if (CC(has_error_code && @@ -2839,7 +3047,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, case INTR_TYPE_SOFT_EXCEPTION: case INTR_TYPE_SOFT_INTR: case INTR_TYPE_PRIV_SW_EXCEPTION: - if (CC(vmcs12->vm_entry_instruction_len > 15) || + if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || CC(vmcs12->vm_entry_instruction_len == 0 && CC(!nested_cpu_has_zero_length_injection(vcpu)))) return -EINVAL; @@ -2860,8 +3068,42 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, nested_check_vm_entry_controls(vcpu, vmcs12)) return -EINVAL; - if (guest_cpuid_has_evmcs(vcpu)) +#ifdef CONFIG_KVM_HYPERV + if (guest_cpu_cap_has_evmcs(vcpu)) return nested_evmcs_check_controls(vmcs12); +#endif + + return 0; +} + +static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva; + u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0; + + /* + * Don't bother with the consistency checks if KVM isn't configured to + * WARN on missed consistency checks, as KVM needs to rely on hardware + * to fully detect an illegal vTPR vs. TRP Threshold combination due to + * the vTPR being writable by L1 at all times (it's an in-memory value, + * not a VMCS field). I.e. even if the check passes now, it might fail + * at the actual VM-Enter. + * + * Keying off the module param also allows treating an invalid vAPIC + * mapping as a consistency check failure without increasing the risk + * of breaking a "real" VM. + */ + if (!warn_on_missed_cc) + return 0; + + if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) && + nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) && + !nested_cpu_has_vid(vmcs12) && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && + (CC(!vapic) || + CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0))))) + return -EINVAL; return 0; } @@ -2877,18 +3119,42 @@ static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, return 0; } +static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) +{ + /* + * Check that the given linear address is canonical after a VM exit + * from L2, based on HOST_CR4.LA57 value that will be loaded for L1. + */ + u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48; + + return !__is_canonical_address(la, l1_address_bits_on_exit); +} + +static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet, + u64 ssp, u64 ssp_tbl) +{ + if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) || + CC(is_noncanonical_msr_address(ssp_tbl, vcpu))) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - bool ia32e; + bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || - CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) + CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) return -EINVAL; - if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) + if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP))) + return -EINVAL; + + if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || + CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) return -EINVAL; if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && @@ -2900,12 +3166,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, vmcs12->host_ia32_perf_global_ctrl))) return -EINVAL; -#ifdef CONFIG_X86_64 - ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); -#else - ia32e = false; -#endif - if (ia32e) { if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) return -EINVAL; @@ -2928,12 +3188,12 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(vmcs12->host_ss_selector == 0 && !ia32e)) return -EINVAL; - if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || - CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) + if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || + CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || + CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) return -EINVAL; /* @@ -2949,6 +3209,27 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, return -EINVAL; } + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) { + if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet, + vmcs12->host_ssp, + vmcs12->host_ssp_tbl)) + return -EINVAL; + + /* + * IA32_S_CET and SSP must be canonical if the host will + * enter 64-bit mode after VM-exit; otherwise, higher + * 32-bits must be all 0s. + */ + if (ia32e) { + if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) || + CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu))) + return -EINVAL; + } else { + if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32)) + return -EINVAL; + } + } + return 0; } @@ -2999,7 +3280,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, enum vm_entry_failure_code *entry_failure_code) { - bool ia32e; + bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -3007,8 +3288,12 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) return -EINVAL; + if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP))) + return -EINVAL; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && - CC(!kvm_dr7_valid(vmcs12->guest_dr7))) + (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || + CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) return -EINVAL; if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && @@ -3025,6 +3310,13 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, vmcs12->guest_ia32_perf_global_ctrl))) return -EINVAL; + if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) + return -EINVAL; + + if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || + CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) + return -EINVAL; + /* * If the load IA32_EFER VM-entry control is 1, the following checks * are performed on the field for the IA32_EFER MSR: @@ -3036,7 +3328,6 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, */ if (to_vmx(vcpu)->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { - ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || CC(((vmcs12->guest_cr0 & X86_CR0_PG) && @@ -3045,94 +3336,34 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && - (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || + (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) return -EINVAL; - if (nested_check_guest_non_reg_state(vmcs12)) - return -EINVAL; - - return 0; -} - -static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long cr3, cr4; - bool vm_fail; - - if (!nested_early_check) - return 0; - - if (vmx->msr_autoload.host.nr) - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); - if (vmx->msr_autoload.guest.nr) - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); - - preempt_disable(); - - vmx_prepare_switch_to_guest(vcpu); - - /* - * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, - * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to - * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. - * there is no need to preserve other bits or save/restore the field. - */ - vmcs_writel(GUEST_RFLAGS, 0); - - cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { - vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->host_state.cr3 = cr3; - } - - cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { - vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->host_state.cr4 = cr4; - } - - vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, - __vmx_vcpu_run_flags(vmx)); - - if (vmx->msr_autoload.host.nr) - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); - if (vmx->msr_autoload.guest.nr) - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); - - if (vm_fail) { - u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); - - preempt_enable(); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) { + if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet, + vmcs12->guest_ssp, + vmcs12->guest_ssp_tbl)) + return -EINVAL; - trace_kvm_nested_vmenter_failed( - "early hardware check VM-instruction error: ", error); - WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + /* + * Guest SSP must have 63:N bits identical, rather than + * be canonical (i.e., 63:N-1 bits identical), where N is + * the CPU's maximum linear-address width. Similar to + * is_noncanonical_msr_address(), use the host's + * linear-address width. + */ + if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1))) + return -EINVAL; } - /* - * VMExit clears RFLAGS.IF and DR7, even on a consistency check. - */ - if (hw_breakpoint_active()) - set_debugreg(__this_cpu_read(cpu_dr7), 7); - local_irq_enable(); - preempt_enable(); - - /* - * A non-failing VMEntry means we somehow entered guest mode with - * an illegal RIP, and that's just the tip of the iceberg. There - * is no telling what memory has been modified or what state has - * been exposed to unknown code. Hitting this all but guarantees - * a (very critical) hardware issue. - */ - WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & - VMX_EXIT_REASONS_FAILED_VMENTRY)); + if (nested_check_guest_non_reg_state(vmcs12)) + return -EINVAL; return 0; } +#ifdef CONFIG_KVM_HYPERV static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3142,7 +3373,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) * L2 was running), map it here to make sure vmcs12 changes are * properly reflected. */ - if (guest_cpuid_has_evmcs(vcpu) && + if (guest_cpu_cap_has_evmcs(vcpu) && vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { enum nested_evmptrld_status evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, false); @@ -3160,6 +3391,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) return true; } +#endif static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { @@ -3251,6 +3483,13 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) { +#ifdef CONFIG_KVM_HYPERV + /* + * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy + * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory + * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post + * migration. + */ if (!nested_get_evmcs_page(vcpu)) { pr_debug_ratelimited("%s: enlightened vmptrld failed\n", __func__); @@ -3261,6 +3500,7 @@ static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) return false; } +#endif if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) return false; @@ -3288,7 +3528,7 @@ static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) if (!nested_cpu_has_pml(vmcs12)) return 0; - if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { + if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { vmx->nested.pml_full = true; return 1; } @@ -3327,14 +3567,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) return 1; } -static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) -{ - u8 rvi = vmx_get_rvi(); - u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); - - return ((rvi & 0xf0) > (vppr & 0xf0)); -} - static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); @@ -3354,7 +3586,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); enum vm_entry_failure_code entry_failure_code; - bool evaluate_pending_interrupts; union vmx_exit_reason exit_reason = { .basic = EXIT_REASON_INVALID_STATE, .failed_vmentry = 1, @@ -3373,38 +3604,33 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, kvm_service_local_tlb_flush_requests(vcpu); - evaluate_pending_interrupts = exec_controls_get(vmx) & - (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); - if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) - evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); - if (!evaluate_pending_interrupts) - evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); - if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) - vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) + vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, + &vmx->nested.pre_vmenter_ssp, + &vmx->nested.pre_vmenter_ssp_tbl); + /* - * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* - * nested early checks are disabled. In the event of a "late" VM-Fail, - * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its - * software model to the pre-VMEntry host state. When EPT is disabled, - * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes - * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing - * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to - * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested - * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is - * guaranteed to be overwritten with a shadow CR3 prior to re-entering - * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as - * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks - * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail - * path would need to manually save/restore vmcs01.GUEST_CR3. + * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled. In the + * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but + * not KVM, KVM must unwind its software model to the pre-VM-Entry host + * state. When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not + * L1's "real" CR3, which causes nested_vmx_restore_host_state() to + * corrupt vcpu->arch.cr3. Stuffing vmcs01.GUEST_CR3 results in the + * unwind naturally setting arch.cr3 to the correct value. Smashing + * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind, + * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be + * overwritten with a shadow CR3 prior to re-entering L1. */ - if (!enable_ept && !nested_early_check) + if (!enable_ept) vmcs_writel(GUEST_CR3, vcpu->arch.cr3); vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); @@ -3417,7 +3643,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, return NVMX_VMENTRY_KVM_INTERNAL_ERROR; } - if (nested_vmx_check_vmentry_hw(vcpu)) { + if (nested_vmx_check_controls_late(vcpu, vmcs12)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); return NVMX_VMENTRY_VMFAIL; } @@ -3462,9 +3688,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit - * unconditionally. + * unconditionally. Take care to pull data from vmcs01 as appropriate, + * e.g. when checking for interrupt windows, as vmcs02 is now loaded. */ - if (unlikely(evaluate_pending_interrupts)) + if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | + CPU_BASED_NMI_WINDOW_EXITING)) || + kvm_apic_has_pending_init_or_sipi(vcpu) || + kvm_apic_has_interrupt(vcpu)) kvm_make_request(KVM_REQ_EVENT, vcpu); /* @@ -3504,7 +3734,7 @@ vmentry_fail_vmexit: load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason.full; - if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) vmx->nested.need_vmcs12_to_shadow_sync = true; return NVMX_VMENTRY_VMEXIT; } @@ -3530,12 +3760,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + kvm_pmu_branch_retired(vcpu); if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) return nested_vmx_failInvalid(vcpu); - if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && + if (CC(!nested_vmx_is_evmptr12_valid(vmx) && vmx->nested.current_vmptr == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); @@ -3550,8 +3780,10 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (CC(vmcs12->hdr.shadow_vmcs)) return nested_vmx_failInvalid(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { - copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); + if (nested_vmx_is_evmptr12_valid(vmx)) { + struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); + + copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); /* Enlightened VMCS doesn't have launch state */ vmcs12->launch_state = !launch; } else if (enable_shadow_vmcs) { @@ -3595,16 +3827,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; - /* Emulate processing of posted interrupts on VM-Enter. */ - if (nested_cpu_has_posted_intr(vmcs12) && - kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { - vmx->nested.pi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); - } - /* Hide L1D cache contents from the nested guest. */ - vmx->vcpu.arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); /* * Must happen outside of nested_vmx_enter_non_root_mode() as it will @@ -3635,7 +3859,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) break; case GUEST_ACTIVITY_WAIT_SIPI: vmx->nested.nested_run_pending = 0; - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); break; default: break; @@ -3796,8 +4020,8 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) if (!pi_test_and_clear_on(vmx->nested.pi_desc)) return 0; - max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); - if (max_irr != 256) { + max_irr = pi_find_highest_vector(vmx->nested.pi_desc); + if (max_irr > 0) { vapic_page = vmx->nested.virtual_apic_map.hva; if (!vapic_page) goto mmio_needed; @@ -3839,7 +4063,12 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) exit_qual = 0; } - if (ex->has_error_code) { + /* + * Unlike AMD's Paged Real Mode, which reports an error code on #PF + * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the + * "has error code" flags on VM-Exit if the CPU is in Real Mode. + */ + if (ex->has_error_code && is_protmode(vcpu)) { /* * Intel CPUs do not generate error codes with bits 31:16 set, * and more importantly VMX disallows setting bits 31:16 in the @@ -3923,10 +4152,46 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) to_vmx(vcpu)->nested.preemption_timer_expired; } -static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) +static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) { - return nested_vmx_preemption_timer_pending(vcpu) || - to_vmx(vcpu)->nested.mtf_pending; + struct vcpu_vmx *vmx = to_vmx(vcpu); + void *vapic = vmx->nested.virtual_apic_map.hva; + int max_irr, vppr; + + if (nested_vmx_preemption_timer_pending(vcpu) || + vmx->nested.mtf_pending) + return true; + + /* + * Virtual Interrupt Delivery doesn't require manual injection. Either + * the interrupt is already in GUEST_RVI and will be recognized by CPU + * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move + * the interrupt from the PIR to RVI prior to entering the guest. + */ + if (for_injection) + return false; + + if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || + __vmx_interrupt_blocked(vcpu)) + return false; + + if (!vapic) + return false; + + vppr = *((u32 *)(vapic + APIC_PROCPRI)); + + max_irr = vmx_get_rvi(); + if ((max_irr & 0xf0) > (vppr & 0xf0)) + return true; + + if (vmx->nested.pi_pending && vmx->nested.pi_desc && + pi_test_on(vmx->nested.pi_desc)) { + max_irr = pi_find_highest_vector(vmx->nested.pi_desc); + if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) + return true; + } + + return false; } /* @@ -4023,13 +4288,25 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) */ bool block_nested_exceptions = vmx->nested.nested_run_pending; /* - * New events (not exceptions) are only recognized at instruction + * Events that don't require injection, i.e. that are virtualized by + * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need + * to regain control in order to deliver the event, and hardware will + * handle event ordering, e.g. with respect to injected exceptions. + * + * But, new events (not exceptions) are only recognized at instruction * boundaries. If an event needs reinjection, then KVM is handling a - * VM-Exit that occurred _during_ instruction execution; new events are - * blocked until the instruction completes. + * VM-Exit that occurred _during_ instruction execution; new events, + * irrespective of whether or not they're injected, are blocked until + * the instruction completes. + */ + bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); + /* + * Inject events are blocked by nested VM-Enter, as KVM is responsible + * for managing priority between concurrent events, i.e. KVM needs to + * wait until after VM-Enter completes to deliver injected events. */ bool block_nested_events = block_nested_exceptions || - kvm_event_needs_reinjection(vcpu); + block_non_injected_events; if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &apic->pending_events)) { @@ -4139,11 +4416,71 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) } if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { + int irq; + + if (!nested_exit_on_intr(vcpu)) { + if (block_nested_events) + return -EBUSY; + + goto no_vmexit; + } + + if (!nested_exit_intr_ack_set(vcpu)) { + if (block_nested_events) + return -EBUSY; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + return 0; + } + + irq = kvm_cpu_get_extint(vcpu); + if (irq != -1) { + if (block_nested_events) + return -EBUSY; + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); + return 0; + } + + irq = kvm_apic_has_interrupt(vcpu); + if (WARN_ON_ONCE(irq < 0)) + goto no_vmexit; + + /* + * If the IRQ is L2's PI notification vector, process posted + * interrupts for L2 instead of injecting VM-Exit, as the + * detection/morphing architecturally occurs when the IRQ is + * delivered to the CPU. Note, only interrupts that are routed + * through the local APIC trigger posted interrupt processing, + * and enabling posted interrupts requires ACK-on-exit. + */ + if (irq == vmx->nested.posted_intr_nv) { + /* + * Nested posted interrupts are delivered via RVI, i.e. + * aren't injected by KVM, and so can be queued even if + * manual event injection is disallowed. + */ + if (block_non_injected_events) + return -EBUSY; + + vmx->nested.pi_pending = true; + kvm_apic_clear_irr(vcpu, irq); + goto no_vmexit; + } + if (block_nested_events) return -EBUSY; - if (!nested_exit_on_intr(vcpu)) - goto no_vmexit; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); + + nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); + + /* + * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must + * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI + * if APICv is active. + */ + kvm_apic_ack_interrupt(vcpu, irq); return 0; } @@ -4271,12 +4608,12 @@ static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, cpu = get_cpu(); vmx->loaded_vmcs = &vmx->nested.vmcs02; - vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); + vmx_vcpu_load_vmcs(vcpu, cpu); sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); + vmx_vcpu_load_vmcs(vcpu, cpu); put_cpu(); } @@ -4290,11 +4627,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = - !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); + !nested_vmx_is_evmptr12_valid(vmx); vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); @@ -4349,11 +4686,21 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + /* + * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. + * Writes to DEBUGCTL that aren't intercepted by L1 are immediately + * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into + * vmcs02 doesn't strictly track vmcs12. + */ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + vmcs12->guest_dr7 = vcpu->arch.dr7; if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; + + vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, + &vmcs12->guest_ssp, + &vmcs12->guest_ssp_tbl); } /* @@ -4369,11 +4716,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) */ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 vm_exit_reason, u32 exit_intr_info, - unsigned long exit_qualification) + unsigned long exit_qualification, u32 exit_insn_len) { /* update exit information fields: */ vmcs12->vm_exit_reason = vm_exit_reason; - if (to_vmx(vcpu)->exit_reason.enclave_mode) + if (vmx_get_exit_reason(vcpu).enclave_mode) vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; vmcs12->exit_qualification = exit_qualification; @@ -4397,7 +4744,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vm_exit_reason, exit_intr_info); vmcs12->vm_exit_intr_info = exit_intr_info; - vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs12->vm_exit_instruction_len = exit_insn_len; vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); /* @@ -4449,7 +4796,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * CR0_GUEST_HOST_MASK is already set in the original vmcs01 * (KVM doesn't change it); */ - vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; + vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmx_set_cr0(vcpu, vmcs12->host_cr0); /* Same as above - no reason to call set_cr4_guest_host_mask(). */ @@ -4479,14 +4826,26 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) vmcs_write64(GUEST_BNDCFGS, 0); + /* + * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set. + * otherwise CET state should be retained across VM-exit, i.e., + * guest values should be propagated from vmcs12 to vmcs01. + */ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) + vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, + vmcs12->host_ssp_tbl); + else + vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, + vmcs12->guest_ssp_tbl); + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; } if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && - intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) - WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl)); + kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) + WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl)); /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ @@ -4539,13 +4898,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); kvm_set_dr(vcpu, 7, 0x400); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + vmx_guest_debugctl_write(vcpu, 0); if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); - to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); + to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); } static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) @@ -4557,7 +4916,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) return vmcs_read64(GUEST_IA32_EFER); if (cpu_has_load_ia32_efer()) - return host_efer; + return kvm_host.efer; for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) @@ -4568,7 +4927,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) if (efer_msr) return efer_msr->data; - return host_efer; + return kvm_host.efer; } static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) @@ -4594,13 +4953,16 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); } + /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ + vmx_reload_guest_debugctl(vcpu); + /* * Note that calling vmx_set_{efer,cr0,cr4} is important as they * handle a variety of side effects to KVM's software model. */ vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); - vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; + vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); @@ -4661,7 +5023,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) goto vmabort; } - if (kvm_set_msr(vcpu, h.index, h.value)) { + if (kvm_emulate_msr_write(vcpu, h.index, h.value)) { pr_debug_ratelimited( "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", __func__, j, h.index, h.value); @@ -4681,8 +5043,9 @@ vmabort: * and modify vmcs12 to make it see what it would expect to see there if * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) */ -void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, - u32 exit_intr_info, unsigned long exit_qualification) +void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, + u32 exit_intr_info, unsigned long exit_qualification, + u32 exit_insn_len) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -4693,6 +5056,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); +#ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { /* * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map @@ -4702,6 +5066,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, */ (void)nested_get_evmcs_page(vcpu); } +#endif /* Service pending TLB flush requests for L2 before switching to L1. */ kvm_service_local_tlb_flush_requests(vcpu); @@ -4730,7 +5095,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (vm_exit_reason != -1) prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, - exit_intr_info, exit_qualification); + exit_intr_info, exit_qualification, + exit_insn_len); /* * Must happen outside of sync_vmcs02_to_vmcs12() as it will @@ -4746,12 +5112,13 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, /* * The only expected VM-instruction error is "VM entry with * invalid control field(s)." Anything else indicates a - * problem with L0. And we should never get here with a - * VMFail of any type if early consistency checks are enabled. + * problem with L0. */ WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != VMXERR_ENTRY_INVALID_CONTROL_FIELD); - WARN_ON_ONCE(nested_early_check); + + /* VM-Fail at VM-Entry means KVM missed a consistency check. */ + WARN_ON_ONCE(warn_on_missed_cc); } /* @@ -4767,6 +5134,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_switch_vmcs(vcpu, &vmx->vmcs01); + kvm_nested_vmexit_handle_ibrs(vcpu); + /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); @@ -4787,11 +5156,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_update_cpu_dirty_logging(vcpu); } - /* Unpin physical memory we referred to in vmcs02 */ - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); - vmx->nested.pi_desc = NULL; + nested_put_vmcs12_pages(vcpu); if (vmx->nested.reload_vmcs01_apic_access_page) { vmx->nested.reload_vmcs01_apic_access_page = false; @@ -4803,22 +5168,19 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } + if (vmx->nested.update_vmcs01_hwapic_isr) { + vmx->nested.update_vmcs01_hwapic_isr = false; + kvm_apic_update_hwapic_isr(vcpu); + } + if ((vm_exit_reason != -1) && - (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) + (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) vmx->nested.need_vmcs12_to_shadow_sync = true; /* in case we halted in L2 */ - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); if (likely(!vmx->fail)) { - if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - nested_exit_intr_ack_set(vcpu)) { - int irq = kvm_cpu_get_interrupt(vcpu); - WARN_ON(irq < 0); - vmcs12->vm_exit_intr_info = irq | - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; - } - if (vm_exit_reason != -1) trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, vmcs12->exit_qualification, @@ -4829,6 +5191,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, load_vmcs12_host_state(vcpu, vmcs12); + /* + * Process events if an injectable IRQ or NMI is pending, even + * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). + * If an event became pending while L2 was active, KVM needs to + * either inject the event or request an IRQ/NMI window. SMIs + * don't need to be processed as SMM is mutually exclusive with + * non-root mode. INIT/SIPI don't need to be checked as INIT + * is blocked post-VMXON, and SIPIs are ignored. + */ + if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); return; } @@ -4854,6 +5227,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) { + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); } @@ -4929,11 +5303,12 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, else *ret = off; + *ret = vmx_get_untagged_addr(vcpu, *ret, 0); /* Long mode: #GP(0)/#SS(0) if the memory address is in a * non-canonical form. This is the only check on the memory * destination for long mode! */ - exn = is_noncanonical_address(*ret, vcpu); + exn = is_noncanonical_address(*ret, vcpu, 0); } else { /* * When not in long mode, the virtual/linear address is @@ -5059,9 +5434,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) goto out_shadow_vmcs; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED); - vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; + hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED); vmx->nested.vpid02 = allocate_vpid(); @@ -5099,24 +5473,35 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; /* - * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks - * that have higher priority than VM-Exit (see Intel SDM's pseudocode - * for VMXON), as KVM must load valid CR0/CR4 values into hardware while - * running the guest, i.e. KVM needs to check the _guest_ values. + * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter + * the guest and so cannot rely on hardware to perform the check, + * which has higher priority than VM-Exit (see Intel SDM's pseudocode + * for VMXON). * - * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and - * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real - * Mode, but KVM will never take the guest out of those modes. + * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 + * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't + * force any of the relevant guest state. For a restricted guest, KVM + * does force CR0.PE=1, but only to also force VM86 in order to emulate + * Real Mode, and so there's no need to check CR0.PE manually. */ - if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || - !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } /* - * CPL=0 and all other checks that are lower priority than VM-Exit must - * be checked manually. + * The CPL is checked for "not in VMX operation" and for "in VMX root", + * and has higher priority than the VM-Fail due to being post-VMXON, + * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, + * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits + * from L2 to L1, i.e. there's no need to check for the vCPU being in + * VMX non-root. + * + * Forwarding the VM-Exit unconditionally, i.e. without performing the + * #UD checks (see above), is functionally ok because KVM doesn't allow + * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's + * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are + * missed by hardware due to shadowing CR0 and/or CR4. */ if (vmx_get_cpl(vcpu)) { kvm_inject_gp(vcpu, 0); @@ -5126,6 +5511,17 @@ static int handle_vmxon(struct kvm_vcpu *vcpu) if (vmx->nested.vmxon) return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + /* + * Invalid CR0/CR4 generates #GP. These checks are performed if and + * only if the vCPU isn't already in VMX operation, i.e. effectively + * have lower priority than the VM-Fail above. + */ + if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || + !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { + kvm_inject_gp(vcpu, 0); + return 1; + } + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); @@ -5205,7 +5601,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 zero = 0; gpa_t vmptr; - u64 evmcs_gpa; int r; if (!nested_vmx_check_permission(vcpu)) @@ -5220,27 +5615,23 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (vmptr == vmx->nested.vmxon_ptr) return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - /* - * When Enlightened VMEntry is enabled on the calling CPU we treat - * memory area pointer by vmptr as Enlightened VMCS (as there's no good - * way to distinguish it from VMCS12) and we must not corrupt it by - * writing to the non-existent 'launch_state' field. The area doesn't - * have to be the currently active EVMCS on the calling CPU and there's - * nothing KVM has to do to transition it from 'active' to 'non-active' - * state. It is possible that the area will stay mapped as - * vmx->nested.hv_evmcs but this shouldn't be a problem. - */ - if (likely(!guest_cpuid_has_evmcs(vcpu) || - !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { + if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vcpu); - kvm_vcpu_write_guest(vcpu, - vmptr + offsetof(struct vmcs12, - launch_state), - &zero, sizeof(zero)); - } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { - nested_release_evmcs(vcpu); + /* + * Silently ignore memory errors on VMCLEAR, Intel's pseudocode + * for VMCLEAR includes a "ensure that data for VMCS referenced + * by the operand is in memory" clause that guards writes to + * memory, i.e. doing nothing for I/O is architecturally valid. + * + * FIXME: Suppress failures if and only if no memslot is found, + * i.e. exit to userspace if __copy_to_user() fails. + */ + (void)kvm_vcpu_write_guest(vcpu, + vmptr + offsetof(struct vmcs12, + launch_state), + &zero, sizeof(zero)); } return nested_vmx_succeed(vcpu); @@ -5279,7 +5670,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); - if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { + if (!nested_vmx_is_evmptr12_valid(vmx)) { /* * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMREAD sets the ALU flags for VMfailInvalid. @@ -5317,7 +5708,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* Read the field, zero-extended to a u64 value */ - value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset); + value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); } /* @@ -5505,7 +5896,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); /* Forbid normal VMPTRLD if Enlightened version was used */ - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) return 1; if (vmx->nested.current_vmptr != vmptr) { @@ -5568,7 +5959,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) + if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) return 1; if (get_vmx_mem_address(vcpu, exit_qual, instr_info, @@ -5713,11 +6104,21 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + /* + * Always flush the effective vpid02, i.e. never flush the current VPID + * and never explicitly flush vpid01. INVVPID targets a VPID, not a + * VMCS, and so whether or not the current vmcs12 has VPID enabled is + * irrelevant (and there may not be a loaded vmcs12). + */ vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: + /* + * LAM doesn't apply to addresses that are inputs to TLB + * invalidation. + */ if (!operand.vpid || - is_noncanonical_address(operand.gla, vcpu)) + is_noncanonical_invlpg_address(operand.gla, vcpu)) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_vcpu_addr(vpid02, operand.gla); @@ -5793,11 +6194,10 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) u32 function = kvm_rax_read(vcpu); /* - * VMFUNC is only supported for nested guests, but we always enable the - * secondary control for simplicity; for non-nested mode, fake that we - * didn't by injecting #UD. + * VMFUNC should never execute cleanly while L1 is active; KVM supports + * VMFUNC for nested VMs, but not for L1. */ - if (!is_guest_mode(vcpu)) { + if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -5832,7 +6232,7 @@ fail: * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode * EXIT_REASON_VMFUNC as the exit reason. */ - nested_vmx_vmexit(vcpu, vmx->exit_reason.full, + nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, vmx_get_intr_info(vcpu), vmx_get_exit_qual(vcpu)); return 1; @@ -5903,19 +6303,26 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, union vmx_exit_reason exit_reason) { - u32 msr_index = kvm_rcx_read(vcpu); + u32 msr_index; gpa_t bitmap; if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return true; + if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || + exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) + msr_index = vmx_get_exit_qual(vcpu); + else + msr_index = kvm_rcx_read(vcpu); + /* * The MSR_BITMAP page is divided into four 1024-byte bitmaps, * for the four combinations of read/write and low/high MSR numbers. * First we need to figure out which of the four to use: */ bitmap = vmcs12->msr_bitmap; - if (exit_reason.basic == EXIT_REASON_MSR_WRITE) + if (exit_reason.basic == EXIT_REASON_MSR_WRITE || + exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) bitmap += 2048; if (msr_index >= 0xc0000000) { msr_index -= 0xc0000000; @@ -6012,7 +6419,7 @@ static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, { u32 encls_leaf; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) return false; @@ -6090,6 +6497,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, else if (is_alignment_check(intr_info) && !vmx_guest_inject_ac(vcpu)) return true; + else if (is_ve_fault(intr_info)) + return true; return false; case EXIT_REASON_EXTERNAL_INTERRUPT: return true; @@ -6128,6 +6537,13 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, * Handle L2's bus locks in L0 directly. */ return true; +#ifdef CONFIG_KVM_HYPERV + case EXIT_REASON_VMCALL: + /* Hyper-V L2 TLB flush hypercall is handled by L0 */ + return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && + nested_evmcs_l2_tlb_flush_enabled(vcpu) && + kvm_hv_is_tlb_flush_hcall(vcpu); +#endif default: break; } @@ -6205,6 +6621,8 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: + case EXIT_REASON_MSR_READ_IMM: + case EXIT_REASON_MSR_WRITE_IMM: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); case EXIT_REASON_INVALID_STATE: return true; @@ -6239,14 +6657,17 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: return true; - case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: + case EXIT_REASON_XSAVES: + case EXIT_REASON_XRSTORS: /* - * This should never happen, since it is not possible to - * set XSS to a non-zero value---neither in L1 nor in L2. - * If if it were, XSS would have to be checked against - * the XSS exit bitmap in vmcs12. + * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize + * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap + * verbatim, i.e. any exit is due to L1's bitmap. WARN if + * XSAVES isn't enabled, as the CPU is supposed to inject #UD + * in that case, before consulting the XSS-bitmap. */ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES)); + return true; case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, @@ -6256,6 +6677,14 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, case EXIT_REASON_NOTIFY: /* Notify VM exit is not exposed to L1 */ return false; + case EXIT_REASON_SEAMCALL: + case EXIT_REASON_TDCALL: + /* + * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't + * virtualized by KVM for L1 hypervisors, i.e. L1 should + * never want or expect such an exit. + */ + return false; default: return true; } @@ -6268,7 +6697,7 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - union vmx_exit_reason exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx->vt.exit_reason; unsigned long exit_qual; u32 exit_intr_info; @@ -6341,7 +6770,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); - if (nested_vmx_allowed(vcpu) && + if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; @@ -6350,7 +6779,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ - if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) + if (nested_vmx_is_evmptr12_set(vmx)) kvm_state.flags |= KVM_STATE_NESTED_EVMCS; if (is_guest_mode(vcpu) && @@ -6406,7 +6835,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, } else { copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); if (!vmx->nested.need_vmcs12_to_shadow_sync) { - if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) + if (nested_vmx_is_evmptr12_valid(vmx)) /* * L1 hypervisor is not obliged to keep eVMCS * clean fields data always up-to-date while @@ -6440,9 +6869,6 @@ out: return kvm_state.size; } -/* - * Forcibly leave nested mode in order to be able to reset the VCPU later on. - */ void vmx_leave_nested(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { @@ -6479,13 +6905,13 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, * code was changed such that flag signals vmcs12 should * be copied into eVMCS in guest memory. * - * To preserve backwards compatability, allow user + * To preserve backwards compatibility, allow user * to set this flag even when there is no VMXON region. */ if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) return -EINVAL; } else { - if (!nested_vmx_allowed(vcpu)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) return -EINVAL; if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) @@ -6519,7 +6945,8 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && - (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) + (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) || + !vmx->nested.enlightened_vmcs_enabled)) return -EINVAL; vmx_leave_nested(vcpu); @@ -6549,6 +6976,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); +#ifdef CONFIG_KVM_HYPERV } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { /* * nested_vmx_handle_enlightened_vmptrld() cannot be called @@ -6558,6 +6986,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, */ vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); +#endif } else { return -EINVAL; } @@ -6678,36 +7107,9 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void) return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; } -/* - * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be - * returned for the various VMX controls MSRs when nested VMX is enabled. - * The same values should also be used to verify that vmcs12 control fields are - * valid during nested entry from L1 to L2. - * Each of these control msrs has a low and high 32-bit half: A low bit is on - * if the corresponding bit in the (32-bit) control field *must* be on, and a - * bit in the high half is on if the corresponding bit in the control field - * may be on. See also vmx_control_verify(). - */ -void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) +static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) { - struct nested_vmx_msrs *msrs = &vmcs_conf->nested; - - /* - * Note that as a general rule, the high half of the MSRs (bits in - * the control fields which may be 1) should be initialized by the - * intersection of the underlying hardware's MSR (i.e., features which - * can be supported) and the list of features we want to expose - - * because they are known to be properly supported in our code. - * Also, usually, the low half of the MSRs (bits which must be 1) can - * be set to 0, meaning that L1 may turn off any of these bits. The - * reason is that if one of these bits is necessary, it will appear - * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control - * fields of vmcs01 and vmcs02, will turn these bits off - and - * nested_vmx_l1_wants_exit() will not pass related exits to L1. - * These rules have exceptions below. - */ - - /* pin-based controls */ msrs->pinbased_ctls_low = PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; @@ -6720,8 +7122,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) msrs->pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; +} - /* exit controls */ +static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; @@ -6731,17 +7136,24 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | - VM_EXIT_CLEAR_BNDCFGS; + VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE; msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE; + /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; +} - /* entry controls */ +static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; @@ -6750,15 +7162,23 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; + VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | + VM_ENTRY_LOAD_CET_STATE; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE; + /* We support free control of debug control loading. */ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; +} - /* cpu-based controls */ +static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; @@ -6790,12 +7210,12 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) /* We support free control of CR3 access interception. */ msrs->procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); +} - /* - * secondary cpu-based controls. Do not include those that - * depend on CPUID bits, they are added later by - * vmx_vcpu_after_set_cpuid. - */ +static void nested_vmx_setup_secondary_ctls(u32 ept_caps, + struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; @@ -6808,9 +7228,11 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_RDRAND_EXITING | SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_ENABLE_VMFUNC | SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_TSC_SCALING; + SECONDARY_EXEC_ENABLE_XSAVES | + SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; /* * We can emulate "VMCS shadowing," even if the hardware @@ -6839,18 +7261,13 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) SECONDARY_EXEC_ENABLE_PML; msrs->ept_caps |= VMX_EPT_AD_BIT; } - } - if (cpu_has_vmx_vmfunc()) { - msrs->secondary_ctls_high |= - SECONDARY_EXEC_ENABLE_VMFUNC; /* - * Advertise EPTP switching unconditionally - * since we emulate it + * Advertise EPTP switching irrespective of hardware support, + * KVM emulates it in software so long as VMFUNC is supported. */ - if (enable_ept) - msrs->vmfunc_controls = - VMX_VMFUNC_EPTP_SWITCHING; + if (cpu_has_vmx_vmfunc()) + msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; } /* @@ -6876,31 +7293,40 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) if (enable_sgx) msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; +} - /* miscellaneous data */ +static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, + struct nested_vmx_msrs *msrs) +{ msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; msrs->misc_low |= - MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | + VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT | VMX_MISC_ACTIVITY_WAIT_SIPI; msrs->misc_high = 0; +} +static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) +{ /* * This MSR reports some information about VMX support. We * should return information about the VMX we emulate for the * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ - msrs->basic = - VMCS12_REVISION | - VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, + X86_MEMTYPE_WB); + msrs->basic |= VMX_BASIC_TRUE_CTLS; if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; + if (cpu_has_vmx_basic_no_hw_errcode_cc()) + msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC; +} +static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) +{ /* * These MSRs specify bits which the guest must keep fixed on * while L1 is in VMXON mode (in L1's root mode, or running an L2). @@ -6912,11 +7338,56 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; /* These MSRs specify bits which the guest must keep fixed off. */ - rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); - rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); + rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); + rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); if (vmx_umip_emulated()) msrs->cr4_fixed1 |= X86_CR4_UMIP; +} + +/* + * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be + * returned for the various VMX controls MSRs when nested VMX is enabled. + * The same values should also be used to verify that vmcs12 control fields are + * valid during nested entry from L1 to L2. + * Each of these control msrs has a low and high 32-bit half: A low bit is on + * if the corresponding bit in the (32-bit) control field *must* be on, and a + * bit in the high half is on if the corresponding bit in the control field + * may be on. See also vmx_control_verify(). + */ +void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) +{ + struct nested_vmx_msrs *msrs = &vmcs_conf->nested; + + /* + * Note that as a general rule, the high half of the MSRs (bits in + * the control fields which may be 1) should be initialized by the + * intersection of the underlying hardware's MSR (i.e., features which + * can be supported) and the list of features we want to expose - + * because they are known to be properly supported in our code. + * Also, usually, the low half of the MSRs (bits which must be 1) can + * be set to 0, meaning that L1 may turn off any of these bits. The + * reason is that if one of these bits is necessary, it will appear + * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control + * fields of vmcs01 and vmcs02, will turn these bits off - and + * nested_vmx_l1_wants_exit() will not pass related exits to L1. + * These rules have exceptions below. + */ + nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); + + nested_vmx_setup_exit_ctls(vmcs_conf, msrs); + + nested_vmx_setup_entry_ctls(vmcs_conf, msrs); + + nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); + + nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); + + nested_vmx_setup_misc_data(vmcs_conf, msrs); + + nested_vmx_setup_basic(msrs); + + nested_vmx_setup_cr_fixed(msrs); msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); } @@ -6980,6 +7451,9 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .set_state = vmx_set_nested_state, .get_nested_state_pages = vmx_get_nested_state_pages, .write_log_dirty = nested_vmx_write_pml_buffer, +#ifdef CONFIG_KVM_HYPERV .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, + .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, +#endif }; |
