diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-09 09:39:55 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-09 09:39:55 -0700 |
commit | ef688f8b8cd3eb20547a6543f03e3d8952b87769 (patch) | |
tree | d44f43e19e69bc7828a16a259dd7f42afca02e99 /arch/x86/kvm/vmx | |
parent | 0e470763d84dcad27284067647dfb4b1a94dfce0 (diff) | |
parent | c59fb127583869350256656b7ed848c398bef879 (diff) |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
"The first batch of KVM patches, mostly covering x86.
ARM:
- Account stage2 page table allocations in memory stats
x86:
- Account EPT/NPT arm64 page table allocations in memory stats
- Tracepoint cleanups/fixes for nested VM-Enter and emulated MSR
accesses
- Drop eVMCS controls filtering for KVM on Hyper-V, all known
versions of Hyper-V now support eVMCS fields associated with
features that are enumerated to the guest
- Use KVM's sanitized VMCS config as the basis for the values of
nested VMX capabilities MSRs
- A myriad event/exception fixes and cleanups. Most notably, pending
exceptions morph into VM-Exits earlier, as soon as the exception is
queued, instead of waiting until the next vmentry. This fixed a
longstanding issue where the exceptions would incorrecly become
double-faults instead of triggering a vmexit; the common case of
page-fault vmexits had a special workaround, but now it's fixed for
good
- A handful of fixes for memory leaks in error paths
- Cleanups for VMREAD trampoline and VMX's VM-Exit assembly flow
- Never write to memory from non-sleepable kvm_vcpu_check_block()
- Selftests refinements and cleanups
- Misc typo cleanups
Generic:
- remove KVM_REQ_UNHALT"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (94 commits)
KVM: remove KVM_REQ_UNHALT
KVM: mips, x86: do not rely on KVM_REQ_UNHALT
KVM: x86: never write to memory from kvm_vcpu_check_block()
KVM: x86: Don't snapshot pending INIT/SIPI prior to checking nested events
KVM: nVMX: Make event request on VMXOFF iff INIT/SIPI is pending
KVM: nVMX: Make an event request if INIT or SIPI is pending on VM-Enter
KVM: SVM: Make an event request if INIT or SIPI is pending when GIF is set
KVM: x86: lapic does not have to process INIT if it is blocked
KVM: x86: Rename kvm_apic_has_events() to make it INIT/SIPI specific
KVM: x86: Rename and expose helper to detect if INIT/SIPI are allowed
KVM: nVMX: Make an event request when pending an MTF nested VM-Exit
KVM: x86: make vendor code check for all nested events
mailmap: Update Oliver's email address
KVM: x86: Allow force_emulation_prefix to be written without a reload
KVM: selftests: Add an x86-only test to verify nested exception queueing
KVM: selftests: Use uapi header to get VMX and SVM exit reasons/codes
KVM: x86: Rename inject_pending_events() to kvm_check_and_inject_events()
KVM: VMX: Update MTF and ICEBP comments to document KVM's subtle behavior
KVM: x86: Treat pending TRIPLE_FAULT requests as pending exceptions
KVM: x86: Morph pending exceptions to pending VM-Exits at queue time
...
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r-- | arch/x86/kvm/vmx/capabilities.h | 14 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/evmcs.c | 192 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/evmcs.h | 10 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 468 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/sgx.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmenter.S | 24 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 321 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 172 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx_ops.h | 2 |
10 files changed, 767 insertions, 440 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index c5e5dfef69c7..87c4e46daf37 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -65,6 +65,7 @@ struct vmcs_config { u64 cpu_based_3rd_exec_ctrl; u32 vmexit_ctrl; u32 vmentry_ctrl; + u64 misc; struct nested_vmx_msrs nested; }; extern struct vmcs_config vmcs_config; @@ -82,7 +83,8 @@ static inline bool cpu_has_vmx_basic_inout(void) static inline bool cpu_has_virtual_nmis(void) { - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS && + vmcs_config.cpu_based_exec_ctrl & CPU_BASED_NMI_WINDOW_EXITING; } static inline bool cpu_has_vmx_preemption_timer(void) @@ -224,11 +226,8 @@ static inline bool cpu_has_vmx_vmfunc(void) static inline bool cpu_has_vmx_shadow_vmcs(void) { - u64 vmx_msr; - /* check if the cpu supports writing r/o exit information fields */ - rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); - if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) + if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS)) return false; return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -370,10 +369,7 @@ static inline bool cpu_has_vmx_invvpid_global(void) static inline bool cpu_has_vmx_intel_pt(void) { - u64 vmx_msr; - - rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); - return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) && + return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) && (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 6a61b1ae7942..d8b23c96d627 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -10,6 +10,8 @@ #include "vmx.h" #include "trace.h" +#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK + DEFINE_STATIC_KEY_FALSE(enable_evmcs); #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) @@ -28,6 +30,8 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer, HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), EVMCS1_FIELD(HOST_CR0, host_cr0, HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), EVMCS1_FIELD(HOST_CR3, host_cr3, @@ -78,6 +82,8 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer, HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0, HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1, @@ -126,6 +132,28 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap, HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier, + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2), + /* + * Not used by KVM: + * + * EVMCS1_FIELD(0x00006828, guest_ia32_s_cet, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x0000682A, guest_ssp, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC), + * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1), + * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + * EVMCS1_FIELD(0x00006C1A, host_ssp, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr, + * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1), + */ /* 64 bit read only */ EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address, @@ -294,19 +322,6 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = { }; const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1); -#if IS_ENABLED(CONFIG_HYPERV) -__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) -{ - vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL; - vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL; - vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC; - vmcs_conf->cpu_based_3rd_exec_ctrl = 0; - - vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; - vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; -} -#endif - bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa) { struct hv_vp_assist_page assist_page; @@ -334,6 +349,9 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) * versions: lower 8 bits is the minimal version, higher 8 bits is the * maximum supported version. KVM supports versions from 1 to * KVM_EVMCS_VERSION. + * + * Note, do not check the Hyper-V is fully enabled in guest CPUID, this + * helper is used to _get_ the vCPU's supported CPUID. */ if (kvm_cpu_cap_get(X86_FEATURE_VMX) && (!vcpu || to_vmx(vcpu)->nested.enlightened_vmcs_enabled)) @@ -342,10 +360,67 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) return 0; } -void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) +enum evmcs_revision { + EVMCSv1_LEGACY, + NR_EVMCS_REVISIONS, +}; + +enum evmcs_ctrl_type { + EVMCS_EXIT_CTRLS, + EVMCS_ENTRY_CTRLS, + EVMCS_2NDEXEC, + EVMCS_PINCTRL, + EVMCS_VMFUNC, + NR_EVMCS_CTRLS, +}; + +static const u32 evmcs_unsupported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = { + [EVMCS_EXIT_CTRLS] = { + [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMEXIT_CTRL, + }, + [EVMCS_ENTRY_CTRLS] = { + [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMENTRY_CTRL, + }, + [EVMCS_2NDEXEC] = { + [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_2NDEXEC, + }, + [EVMCS_PINCTRL] = { + [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_PINCTRL, + }, + [EVMCS_VMFUNC] = { + [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMFUNC, + }, +}; + +static u32 evmcs_get_unsupported_ctls(enum evmcs_ctrl_type ctrl_type) +{ + enum evmcs_revision evmcs_rev = EVMCSv1_LEGACY; + + return evmcs_unsupported_ctrls[ctrl_type][evmcs_rev]; +} + +static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + + /* + * PERF_GLOBAL_CTRL has a quirk where some Windows guests may fail to + * boot if a PV CPUID feature flag is not also set. Treat the fields + * as unsupported if the flag is not set in guest CPUID. This should + * be called only for guest accesses, and all guest accesses should be + * gated on Hyper-V being enabled and initialized. + */ + if (WARN_ON_ONCE(!hv_vcpu)) + return false; + + return hv_vcpu->cpuid_cache.nested_ebx & HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL; +} + +void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { u32 ctl_low = (u32)*pdata; u32 ctl_high = (u32)(*pdata >> 32); + u32 unsupported_ctrls; /* * Hyper-V 2016 and 2019 try using these features even when eVMCS @@ -354,77 +429,70 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) switch (msr_index) { case MSR_IA32_VMX_EXIT_CTLS: case MSR_IA32_VMX_TRUE_EXIT_CTLS: - ctl_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; + unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_EXIT_CTRLS); + if (!evmcs_has_perf_global_ctrl(vcpu)) + unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= ~unsupported_ctrls; break; case MSR_IA32_VMX_ENTRY_CTLS: case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - ctl_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; + unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_ENTRY_CTRLS); + if (!evmcs_has_perf_global_ctrl(vcpu)) + unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + ctl_high &= ~unsupported_ctrls; break; case MSR_IA32_VMX_PROCBASED_CTLS2: - ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; + ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_2NDEXEC); break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: - ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; + ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_PINCTRL); break; case MSR_IA32_VMX_VMFUNC: - ctl_low &= ~EVMCS1_UNSUPPORTED_VMFUNC; + ctl_low &= ~evmcs_get_unsupported_ctls(EVMCS_VMFUNC); break; } *pdata = ctl_low | ((u64)ctl_high << 32); } +static bool nested_evmcs_is_valid_controls(enum evmcs_ctrl_type ctrl_type, + u32 val) +{ + return !(val & evmcs_get_unsupported_ctls(ctrl_type)); +} + int nested_evmcs_check_controls(struct vmcs12 *vmcs12) { - int ret = 0; - u32 unsupp_ctl; - - unsupp_ctl = vmcs12->pin_based_vm_exec_control & - EVMCS1_UNSUPPORTED_PINCTRL; - if (unsupp_ctl) { - trace_kvm_nested_vmenter_failed( - "eVMCS: unsupported pin-based VM-execution controls", - unsupp_ctl); - ret = -EINVAL; - } + if (CC(!nested_evmcs_is_valid_controls(EVMCS_PINCTRL, + vmcs12->pin_based_vm_exec_control))) + return -EINVAL; - unsupp_ctl = vmcs12->secondary_vm_exec_control & - EVMCS1_UNSUPPORTED_2NDEXEC; - if (unsupp_ctl) { - trace_kvm_nested_vmenter_failed( - "eVMCS: unsupported secondary VM-execution controls", - unsupp_ctl); - ret = -EINVAL; - } + if (CC(!nested_evmcs_is_valid_controls(EVMCS_2NDEXEC, + vmcs12->secondary_vm_exec_control))) + return -EINVAL; - unsupp_ctl = vmcs12->vm_exit_controls & - EVMCS1_UNSUPPORTED_VMEXIT_CTRL; - if (unsupp_ctl) { - trace_kvm_nested_vmenter_failed( - "eVMCS: unsupported VM-exit controls", - unsupp_ctl); - ret = -EINVAL; - } + if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXIT_CTRLS, + vmcs12->vm_exit_controls))) + return -EINVAL; - unsupp_ctl = vmcs12->vm_entry_controls & - EVMCS1_UNSUPPORTED_VMENTRY_CTRL; - if (unsupp_ctl) { - trace_kvm_nested_vmenter_failed( - "eVMCS: unsupported VM-entry controls", - unsupp_ctl); - ret = -EINVAL; - } + if (CC(!nested_evmcs_is_valid_controls(EVMCS_ENTRY_CTRLS, + vmcs12->vm_entry_controls))) + return -EINVAL; - unsupp_ctl = vmcs12->vm_function_control & EVMCS1_UNSUPPORTED_VMFUNC; - if (unsupp_ctl) { - trace_kvm_nested_vmenter_failed( - "eVMCS: unsupported VM-function controls", - unsupp_ctl); - ret = -EINVAL; - } + /* + * VM-Func controls are 64-bit, but KVM currently doesn't support any + * controls in bits 63:32, i.e. dropping those bits on the consistency + * check is intentional. + */ + if (WARN_ON_ONCE(vmcs12->vm_function_control >> 32)) + return -EINVAL; - return ret; + if (CC(!nested_evmcs_is_valid_controls(EVMCS_VMFUNC, + vmcs12->vm_function_control))) + return -EINVAL; + + return 0; } int nested_enable_evmcs(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index f886a8ff0342..6f746ef3c038 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -42,8 +42,6 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); * PLE_GAP = 0x00004020, * PLE_WINDOW = 0x00004022, * VMX_PREEMPTION_TIMER_VALUE = 0x0000482E, - * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, - * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, * * Currently unsupported in KVM: * GUEST_IA32_RTIT_CTL = 0x00002814, @@ -61,9 +59,8 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs); SECONDARY_EXEC_TSC_SCALING | \ SECONDARY_EXEC_PAUSE_LOOP_EXITING) #define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \ - (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) -#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + (VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) +#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (0) #define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING) struct evmcs_field { @@ -212,7 +209,6 @@ static inline void evmcs_load(u64 phys_addr) vp_ap->enlighten_vmentry = 1; } -__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} @@ -243,7 +239,7 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa); uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); -void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata); +void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); int nested_evmcs_check_controls(struct vmcs12 *vmcs12); #endif /* __KVM_X86_VMX_EVMCS_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7eaf96064cb0..0c62352dda6a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -439,61 +439,22 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, return inequality ^ bit; } - -/* - * KVM wants to inject page-faults which it got to the guest. This function - * checks whether in a nested guest, we need to inject them to L1 or L2. - */ -static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) +static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, + u32 error_code) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned int nr = vcpu->arch.exception.nr; - bool has_payload = vcpu->arch.exception.has_payload; - unsigned long payload = vcpu->arch.exception.payload; - - if (nr == PF_VECTOR) { - if (vcpu->arch.exception.nested_apf) { - *exit_qual = vcpu->arch.apf.nested_apf_token; - return 1; - } - if (nested_vmx_is_page_fault_vmexit(vmcs12, - vcpu->arch.exception.error_code)) { - *exit_qual = has_payload ? payload : vcpu->arch.cr2; - return 1; - } - } else if (vmcs12->exception_bitmap & (1u << nr)) { - if (nr == DB_VECTOR) { - if (!has_payload) { - payload = vcpu->arch.dr6; - payload &= ~DR6_BT; - payload ^= DR6_ACTIVE_LOW; - } - *exit_qual = payload; - } else - *exit_qual = 0; - return 1; - } - return 0; -} - -static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu, - struct x86_exception *fault) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - WARN_ON(!is_guest_mode(vcpu)); + /* + * Drop bits 31:16 of the error code when performing the #PF mask+match + * check. All VMCS fields involved are 32 bits, but Intel CPUs never + * set bits 31:16 and VMX disallows setting bits 31:16 in the injected + * error code. Including the to-be-dropped bits in the check might + * result in an "impossible" or missed exit from L1's perspective. + */ + if (vector == PF_VECTOR) + return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); - if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && - !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) { - vmcs12->vm_exit_intr_error_code = fault->error_code; - nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, - PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | - INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, - fault->address); - return true; - } - return false; + return (vmcs12->exception_bitmap & (1u << vector)); } static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, @@ -1607,6 +1568,10 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields vmcs12->guest_rflags = evmcs->guest_rflags; vmcs12->guest_interruptibility_info = evmcs->guest_interruptibility_info; + /* + * Not present in struct vmcs12: + * vmcs12->guest_ssp = evmcs->guest_ssp; + */ } if (unlikely(!(hv_clean_fields & @@ -1653,6 +1618,13 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields vmcs12->host_fs_selector = evmcs->host_fs_selector; vmcs12->host_gs_selector = evmcs->host_gs_selector; vmcs12->host_tr_selector = evmcs->host_tr_selector; + vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; + /* + * Not present in struct vmcs12: + * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; + * vmcs12->host_ssp = evmcs->host_ssp; + * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; + */ } if (unlikely(!(hv_clean_fields & @@ -1720,6 +1692,8 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields vmcs12->tsc_offset = evmcs->tsc_offset; vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; + vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; + vmcs12->tsc_multiplier = evmcs->tsc_multiplier; } if (unlikely(!(hv_clean_fields & @@ -1767,6 +1741,13 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; vmcs12->guest_activity_state = evmcs->guest_activity_state; vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; + vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; + /* + * Not present in struct vmcs12: + * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; + * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; + * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; + */ } /* @@ -1869,12 +1850,23 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; + * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; + * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; + * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; + * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; * * Not present in struct vmcs12: * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; + * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; + * evmcs->host_ssp = vmcs12->host_ssp; + * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; + * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; + * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; + * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; + * evmcs->guest_ssp = vmcs12->guest_ssp; */ evmcs->guest_es_selector = vmcs12->guest_es_selector; @@ -1982,7 +1974,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( bool evmcs_gpa_changed = false; u64 evmcs_gpa; - if (likely(!vmx->nested.enlightened_vmcs_enabled)) + if (likely(!guest_cpuid_has_evmcs(vcpu))) return EVMPTRLD_DISABLED; if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) { @@ -2328,9 +2320,14 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate * on the related bits (if supported by the CPU) in the hope that * we can avoid VMWrites during vmx_set_efer(). + * + * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is + * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to + * do the same for L2. */ exec_control = __vm_entry_controls_get(vmcs01); - exec_control |= vmcs12->vm_entry_controls; + exec_control |= (vmcs12->vm_entry_controls & + ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) @@ -2863,7 +2860,7 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, nested_check_vm_entry_controls(vcpu, vmcs12)) return -EINVAL; - if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled) + if (guest_cpuid_has_evmcs(vcpu)) return nested_evmcs_check_controls(vmcs12); return 0; @@ -3145,7 +3142,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) * L2 was running), map it here to make sure vmcs12 changes are * properly reflected. */ - if (vmx->nested.enlightened_vmcs_enabled && + if (guest_cpuid_has_evmcs(vcpu) && vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { enum nested_evmptrld_status evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, false); @@ -3364,12 +3361,24 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, }; u32 failed_index; + trace_kvm_nested_vmenter(kvm_rip_read(vcpu), + vmx->nested.current_vmptr, + vmcs12->guest_rip, + vmcs12->guest_intr_status, + vmcs12->vm_entry_intr_info_field, + vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, + vmcs12->ept_pointer, + vmcs12->guest_cr3, + KVM_ISA_VMX); + kvm_service_local_tlb_flush_requests(vcpu); evaluate_pending_interrupts = exec_controls_get(vmx) & (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); + if (!evaluate_pending_interrupts) + evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); if (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) @@ -3450,18 +3459,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, } /* - * If L1 had a pending IRQ/NMI until it executed - * VMLAUNCH/VMRESUME which wasn't delivered because it was - * disallowed (e.g. interrupts disabled), L0 needs to - * evaluate if this pending event should cause an exit from L2 - * to L1 or delivered directly to L2 (e.g. In case L1 don't - * intercept EXTERNAL_INTERRUPT). - * - * Usually this would be handled by the processor noticing an - * IRQ/NMI window request, or checking RVI during evaluation of - * pending virtual interrupts. However, this setting was done - * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 - * to perform pending event evaluation by requesting a KVM_REQ_EVENT. + * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI + * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can + * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit + * unconditionally. */ if (unlikely(evaluate_pending_interrupts)) kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -3718,7 +3719,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, is_double_fault(exit_intr_info))) { vmcs12->idt_vectoring_info_field = 0; } else if (vcpu->arch.exception.injected) { - nr = vcpu->arch.exception.nr; + nr = vcpu->arch.exception.vector; idt_vectoring = nr | VECTORING_INFO_VALID_MASK; if (kvm_exception_is_soft(nr)) { @@ -3819,19 +3820,40 @@ mmio_needed: return -ENXIO; } -static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, - unsigned long exit_qual) +static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) { + struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; + u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - unsigned int nr = vcpu->arch.exception.nr; - u32 intr_info = nr | INTR_INFO_VALID_MASK; + unsigned long exit_qual; + + if (ex->has_payload) { + exit_qual = ex->payload; + } else if (ex->vector == PF_VECTOR) { + exit_qual = vcpu->arch.cr2; + } else if (ex->vector == DB_VECTOR) { + exit_qual = vcpu->arch.dr6; + exit_qual &= ~DR6_BT; + exit_qual ^= DR6_ACTIVE_LOW; + } else { + exit_qual = 0; + } - if (vcpu->arch.exception.has_error_code) { - vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; + if (ex->has_error_code) { + /* + * Intel CPUs do not generate error codes with bits 31:16 set, + * and more importantly VMX disallows setting bits 31:16 in the + * injected error code for VM-Entry. Drop the bits to mimic + * hardware and avoid inducing failure on nested VM-Entry if L1 + * chooses to inject the exception back to L2. AMD CPUs _do_ + * generate "full" 32-bit error codes, so KVM allows userspace + * to inject exception error codes with bits 31:16 set. + */ + vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; intr_info |= INTR_INFO_DELIVER_CODE_MASK; } - if (kvm_exception_is_soft(nr)) + if (kvm_exception_is_soft(ex->vector)) intr_info |= INTR_TYPE_SOFT_EXCEPTION; else intr_info |= INTR_TYPE_HARD_EXCEPTION; @@ -3844,16 +3866,39 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, } /* - * Returns true if a debug trap is pending delivery. + * Returns true if a debug trap is (likely) pending delivery. Infer the class + * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). + * Using the payload is flawed because code breakpoints (fault-like) and data + * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. + * this will return false positives if a to-be-injected code breakpoint #DB is + * pending (from KVM's perspective, but not "pending" across an instruction + * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it + * too is trap-like. * - * In KVM, debug traps bear an exception payload. As such, the class of a #DB - * exception may be inferred from the presence of an exception payload. + * KVM "works" despite these flaws as ICEBP isn't currently supported by the + * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the + * #DB has already happened), and MTF isn't marked pending on code breakpoints + * from the emulator (because such #DBs are fault-like and thus don't trigger + * actions that fire on instruction retire). + */ +static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) +{ + if (!ex->pending || ex->vector != DB_VECTOR) + return 0; + + /* General Detect #DBs are always fault-like. */ + return ex->payload & ~DR6_BD; +} + +/* + * Returns true if there's a pending #DB exception that is lower priority than + * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by + * KVM, but could theoretically be injected by userspace. Note, this code is + * imperfect, see above. */ -static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) +static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) { - return vcpu->arch.exception.pending && - vcpu->arch.exception.nr == DB_VECTOR && - vcpu->arch.exception.payload; + return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; } /* @@ -3865,9 +3910,11 @@ static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) */ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) { - if (vmx_pending_dbg_trap(vcpu)) - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, - vcpu->arch.exception.payload); + unsigned long pending_dbg; + + pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); + if (pending_dbg) + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); } static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) @@ -3876,21 +3923,113 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) to_vmx(vcpu)->nested.preemption_timer_expired; } +static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) +{ + return nested_vmx_preemption_timer_pending(vcpu) || + to_vmx(vcpu)->nested.mtf_pending; +} + +/* + * Per the Intel SDM's table "Priority Among Concurrent Events", with minor + * edits to fill in missing examples, e.g. #DB due to split-lock accesses, + * and less minor edits to splice in the priority of VMX Non-Root specific + * events, e.g. MTF and NMI/INTR-window exiting. + * + * 1 Hardware Reset and Machine Checks + * - RESET + * - Machine Check + * + * 2 Trap on Task Switch + * - T flag in TSS is set (on task switch) + * + * 3 External Hardware Interventions + * - FLUSH + * - STOPCLK + * - SMI + * - INIT + * + * 3.5 Monitor Trap Flag (MTF) VM-exit[1] + * + * 4 Traps on Previous Instruction + * - Breakpoints + * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O + * breakpoint, or #DB due to a split-lock access) + * + * 4.3 VMX-preemption timer expired VM-exit + * + * 4.6 NMI-window exiting VM-exit[2] + * + * 5 Nonmaskable Interrupts (NMI) + * + * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery + * + * 6 Maskable Hardware Interrupts + * + * 7 Code Breakpoint Fault + * + * 8 Faults from Fetching Next Instruction + * - Code-Segment Limit Violation + * - Code Page Fault + * - Control protection exception (missing ENDBRANCH at target of indirect + * call or jump) + * + * 9 Faults from Decoding Next Instruction + * - Instruction length > 15 bytes + * - Invalid Opcode + * - Coprocessor Not Available + * + *10 Faults on Executing Instruction + * - Overflow + * - Bound error + * - Invalid TSS + * - Segment Not Present + * - Stack fault + * - General Protection + * - Data Page Fault + * - Alignment Check + * - x86 FPU Floating-point exception + * - SIMD floating-point exception + * - Virtualization exception + * - Control protection exception + * + * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), + * INIT signals, and higher priority events take priority over MTF VM exits. + * MTF VM exits take priority over debug-trap exceptions and lower priority + * events. + * + * [2] Debug-trap exceptions and higher priority events take priority over VM exits + * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption + * timer take priority over VM exits caused by the "NMI-window exiting" + * VM-execution control and lower priority events. + * + * [3] Debug-trap exceptions and higher priority events take priority over VM exits + * caused by "NMI-window exiting". VM exits caused by this control take + * priority over non-maskable interrupts (NMIs) and lower priority events. + * + * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to + * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, + * non-maskable interrupts (NMIs) and higher priority events take priority over + * delivery of a virtual interrupt; delivery of a virtual interrupt takes + * priority over external interrupts and lower priority events. + */ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long exit_qual; - bool block_nested_events = - vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); - bool mtf_pending = vmx->nested.mtf_pending; struct kvm_lapic *apic = vcpu->arch.apic; - + struct vcpu_vmx *vmx = to_vmx(vcpu); /* - * Clear the MTF state. If a higher priority VM-exit is delivered first, - * this state is discarded. + * Only a pending nested run blocks a pending exception. If there is a + * previously injected event, the pending exception occurred while said + * event was being delivered and thus needs to be handled. */ - if (!block_nested_events) - vmx->nested.mtf_pending = false; + bool block_nested_exceptions = vmx->nested.nested_run_pending; + /* + * New events (not exceptions) are only recognized at instruction + * boundaries. If an event needs reinjection, then KVM is handling a + * VM-Exit that occurred _during_ instruction execution; new events are + * blocked until the instruction completes. + */ + bool block_nested_events = block_nested_exceptions || + kvm_event_needs_reinjection(vcpu); if (lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &apic->pending_events)) { @@ -3900,6 +4039,9 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) clear_bit(KVM_APIC_INIT, &apic->pending_events); if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); + + /* MTF is discarded if the vCPU is in WFS. */ + vmx->nested.mtf_pending = false; return 0; } @@ -3909,31 +4051,41 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) return -EBUSY; clear_bit(KVM_APIC_SIPI, &apic->pending_events); - if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, apic->sipi_vector & 0xFFUL); - return 0; + return 0; + } + /* Fallthrough, the SIPI is completely ignored. */ } /* - * Process any exceptions that are not debug traps before MTF. + * Process exceptions that are higher priority than Monitor Trap Flag: + * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but + * could theoretically come in from userspace), and ICEBP (INT1). * - * Note that only a pending nested run can block a pending exception. - * Otherwise an injected NMI/interrupt should either be - * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO, - * while delivering the pending exception. + * TODO: SMIs have higher priority than MTF and trap-like #DBs (except + * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF + * across SMI/RSM as it should; that needs to be addressed in order to + * prioritize SMI over MTF and trap-like #DBs. */ - - if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { - if (vmx->nested.nested_run_pending) + if (vcpu->arch.exception_vmexit.pending && + !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { + if (block_nested_exceptions) return -EBUSY; - if (!nested_vmx_check_exception(vcpu, &exit_qual)) - goto no_vmexit; - nested_vmx_inject_exception_vmexit(vcpu, exit_qual); + + nested_vmx_inject_exception_vmexit(vcpu); return 0; } - if (mtf_pending) { + if (vcpu->arch.exception.pending && + !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { + if (block_nested_exceptions) + return -EBUSY; + goto no_vmexit; + } + + if (vmx->nested.mtf_pending) { if (block_nested_events) return -EBUSY; nested_vmx_update_pending_dbg(vcpu); @@ -3941,15 +4093,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) return 0; } - if (vcpu->arch.exception.pending) { - if (vmx->nested.nested_run_pending) + if (vcpu->arch.exception_vmexit.pending) { + if (block_nested_exceptions) return -EBUSY; - if (!nested_vmx_check_exception(vcpu, &exit_qual)) - goto no_vmexit; - nested_vmx_inject_exception_vmexit(vcpu, exit_qual); + + nested_vmx_inject_exception_vmexit(vcpu); return 0; } + if (vcpu->arch.exception.pending) { + if (block_nested_exceptions) + return -EBUSY; + goto no_vmexit; + } + if (nested_vmx_preemption_timer_pending(vcpu)) { if (block_nested_events) return -EBUSY; @@ -4255,14 +4412,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); } - - /* - * Drop what we picked up for L2 via vmx_complete_interrupts. It is - * preserved above and would only end up incorrectly in L1. - */ - vcpu->arch.nmi_injected = false; - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); } /* @@ -4538,6 +4687,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + /* Pending MTF traps are discarded on VM-Exit. */ + vmx->nested.mtf_pending = false; + /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); @@ -4602,6 +4754,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, WARN_ON_ONCE(nested_early_check); } + /* + * Drop events/exceptions that were queued for re-injection to L2 + * (picked up via vmx_complete_interrupts()), as well as exceptions + * that were pending for L2. Note, this must NOT be hoisted above + * prepare_vmcs12(), events/exceptions queued for re-injection need to + * be captured in vmcs12 (see vmcs12_save_pending_event()). + */ + vcpu->arch.nmi_injected = false; + kvm_clear_exception_queue(vcpu); + kvm_clear_interrupt_queue(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); /* Update any VMCS fields that might have changed while L2 ran */ @@ -5030,8 +5193,8 @@ static int handle_vmxoff(struct kvm_vcpu *vcpu) free_nested(vcpu); - /* Process a latched INIT during time CPU was in VMX operation */ - kvm_make_request(KVM_REQ_EVENT, vcpu); + if (kvm_apic_has_pending_init_or_sipi(vcpu)) + kvm_make_request(KVM_REQ_EVENT, vcpu); return nested_vmx_succeed(vcpu); } @@ -5067,7 +5230,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) * state. It is possible that the area will stay mapped as * vmx->nested.hv_evmcs but this shouldn't be a problem. */ - if (likely(!vmx->nested.enlightened_vmcs_enabled || + if (likely(!guest_cpuid_has_evmcs(vcpu) || !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vcpu); @@ -6463,6 +6626,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (ret) goto error_guest_mode; + if (vmx->nested.mtf_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; error_guest_mode: @@ -6522,8 +6688,10 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void) * bit in the high half is on if the corresponding bit in the control field * may be on. See also vmx_control_verify(). */ -void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) +void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) { + struct nested_vmx_msrs *msrs = &vmcs_conf->nested; + /* * Note that as a general rule, the high half of the MSRs (bits in * the control fields which may be 1) should be initialized by the @@ -6540,11 +6708,10 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) */ /* pin-based controls */ - rdmsr(MSR_IA32_VMX_PINBASED_CTLS, - msrs->pinbased_ctls_low, - msrs->pinbased_ctls_high); - msrs->pinbased_ctls_low |= + msrs->pinbased_ctls_low = PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + + msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; msrs->pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | @@ -6555,50 +6722,47 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) PIN_BASED_VMX_PREEMPTION_TIMER; /* exit controls */ - rdmsr(MSR_IA32_VMX_EXIT_CTLS, - msrs->exit_ctls_low, - msrs->exit_ctls_high); msrs->exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; + msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; msrs->exit_ctls_high &= #ifdef CONFIG_X86_64 VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | - VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + VM_EXIT_CLEAR_BNDCFGS; msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | - VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ - rdmsr(MSR_IA32_VMX_ENTRY_CTLS, - msrs->entry_ctls_low, - msrs->entry_ctls_high); msrs->entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; + + msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; msrs->entry_ctls_high &= #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | - VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; msrs->entry_ctls_high |= - (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); + (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); /* We support free control of debug control loading. */ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, - msrs->procbased_ctls_low, - msrs->procbased_ctls_high); msrs->procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; + + msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; msrs->procbased_ctls_high &= CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | @@ -6632,12 +6796,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) * depend on CPUID bits, they are added later by * vmx_vcpu_after_set_cpuid. */ - if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) - rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, - msrs->secondary_ctls_low, - msrs->secondary_ctls_high); - msrs->secondary_ctls_low = 0; + + msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; msrs->secondary_ctls_high &= SECONDARY_EXEC_DESC | SECONDARY_EXEC_ENABLE_RDTSCP | @@ -6717,10 +6878,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; /* miscellaneous data */ - rdmsr(MSR_IA32_VMX_MISC, - msrs->misc_low, - msrs->misc_high); - msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; + msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; msrs->misc_low |= MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | @@ -6814,9 +6972,9 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) struct kvm_x86_nested_ops vmx_nested_ops = { .leave_nested = vmx_leave_nested, + .is_exception_vmexit = nested_vmx_is_exception_vmexit, .check_events = vmx_check_nested_events, - .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround, - .hv_timer_pending = nested_vmx_preemption_timer_pending, + .has_events = vmx_has_nested_events, .triple_fault = nested_vmx_triple_fault, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 88b00a7359e4..6312c9541c3c 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -17,7 +17,7 @@ enum nvmx_vmentry_status { }; void vmx_leave_nested(struct kvm_vcpu *vcpu); -void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps); +void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps); void nested_vmx_hardware_unsetup(void); __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); void nested_vmx_set_vmcs_shadowing_bitmap(void); diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index aba8cebdc587..8f95c7c01433 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -129,7 +129,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) ex.address = gva; ex.error_code_valid = true; ex.nested_page_fault = false; - kvm_inject_page_fault(vcpu, &ex); + kvm_inject_emulated_page_fault(vcpu, &ex); } else { kvm_inject_gp(vcpu, 0); } diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 6de96b943804..8477d8bdd69c 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -189,13 +189,16 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) xor %ebx, %ebx .Lclear_regs: + /* Discard @regs. The register is irrelevant, it just can't be RBX. */ + pop %_ASM_AX + /* * Clear all general purpose registers except RSP and RBX to prevent * speculative use of the guest's values, even those that are reloaded * via the stack. In theory, an L1 cache miss when restoring registers * could lead to speculative execution with the guest's values. * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially - * free. RSP and RAX are exempt as RSP is restored by hardware during + * free. RSP and RBX are exempt as RSP is restored by hardware during * VM-Exit and RBX is explicitly loaded with 0 or 1 to hold the return * value. */ @@ -216,9 +219,6 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) xor %r15d, %r15d #endif - /* "POP" @regs. */ - add $WORD_SIZE, %_ASM_SP - /* * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before * the first unbalanced RET after vmexit! @@ -234,7 +234,6 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\ X86_FEATURE_RSB_VMEXIT_LITE - pop %_ASM_ARG2 /* @flags */ pop %_ASM_ARG1 /* @vmx */ @@ -293,22 +292,13 @@ SYM_FUNC_START(vmread_error_trampoline) push %r10 push %r11 #endif -#ifdef CONFIG_X86_64 + /* Load @field and @fault to arg1 and arg2 respectively. */ - mov 3*WORD_SIZE(%rbp), %_ASM_ARG2 - mov 2*WORD_SIZE(%rbp), %_ASM_ARG1 -#else - /* Parameters are passed on the stack for 32-bit (see asmlinkage). */ - push 3*WORD_SIZE(%ebp) - push 2*WORD_SIZE(%ebp) -#endif + mov 3*WORD_SIZE(%_ASM_BP), %_ASM_ARG2 + mov 2*WORD_SIZE(%_ASM_BP), %_ASM_ARG1 call vmread_error -#ifndef CONFIG_X86_64 - add $8, %esp -#endif - /* Zero out @fault, which will be popped into the result register. */ _ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c9b49a09e6b5..9dba04b6b019 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -439,7 +439,7 @@ do { \ pr_warn_ratelimited(fmt); \ } while (0) -asmlinkage void vmread_error(unsigned long field, bool fault) +void vmread_error(unsigned long field, bool fault) { if (fault) kvm_spurious_fault(); @@ -864,7 +864,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) return flags; } -static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, +static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit) { vm_entry_controls_clearbit(vmx, entry); @@ -922,7 +922,7 @@ skip_guest: vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); } -static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, +static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, unsigned long entry, unsigned long exit, unsigned long guest_val_vmcs, unsigned long host_val_vmcs, u64 guest_val, u64 host_val) @@ -1652,17 +1652,25 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) /* * Per the SDM, MTF takes priority over debug-trap exceptions besides - * T-bit traps. As instruction emulation is completed (i.e. at the - * instruction boundary), any #DB exception pending delivery must be a - * debug-trap. Record the pending MTF state to be delivered in + * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps + * or ICEBP (in the emulator proper), and skipping of ICEBP after an + * intercepted #DB deliberately avoids single-step #DB and MTF updates + * as ICEBP is higher priority than both. As instruction emulation is + * completed at this point (i.e. KVM is at the instruction boundary), + * any #DB exception pending delivery must be a debug-trap of lower + * priority than MTF. Record the pending MTF state to be delivered in * vmx_check_nested_events(). */ if (nested_cpu_has_mtf(vmcs12) && (!vcpu->arch.exception.pending || - vcpu->arch.exception.nr == DB_VECTOR)) + vcpu->arch.exception.vector == DB_VECTOR) && + (!vcpu->arch.exception_vmexit.pending || + vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { vmx->nested.mtf_pending = true; - else + kvm_make_request(KVM_REQ_EVENT, vcpu); + } else { vmx->nested.mtf_pending = false; + } } static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) @@ -1684,32 +1692,40 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); } -static void vmx_queue_exception(struct kvm_vcpu *vcpu) +static void vmx_inject_exception(struct kvm_vcpu *vcpu) { + struct kvm_queued_exception *ex = &vcpu->arch.exception; + u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned nr = vcpu->arch.exception.nr; - bool has_error_code = vcpu->arch.exception.has_error_code; - u32 error_code = vcpu->arch.exception.error_code; - u32 intr_info = nr | INTR_INFO_VALID_MASK; - kvm_deliver_exception_payload(vcpu); + kvm_deliver_exception_payload(vcpu, ex); - if (has_error_code) { - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + if (ex->has_error_code) { + /* + * Despite the error code being architecturally defined as 32 + * bits, and the VMCS field being 32 bits, Intel CPUs and thus + * VMX don't actually supporting setting bits 31:16. Hardware + * will (should) never provide a bogus error code, but AMD CPUs + * do generate error codes with bits 31:16 set, and so KVM's + * ABI lets userspace shove in arbitrary 32-bit values. Drop + * the upper bits to avoid VM-Fail, losing information that + * does't really exist is preferable to killing the VM. + */ + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); intr_info |= INTR_INFO_DELIVER_CODE_MASK; } if (vmx->rmode.vm86_active) { int inc_eip = 0; - if (kvm_exception_is_soft(nr)) + if (kvm_exception_is_soft(ex->vector)) inc_eip = vcpu->arch.event_exit_inst_len; - kvm_inject_realmode_interrupt(vcpu, nr, inc_eip); + kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); return; } WARN_ON_ONCE(vmx->emulation_required); - if (kvm_exception_is_soft(nr)) { + if (kvm_exception_is_soft(ex->vector)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); intr_info |= INTR_TYPE_SOFT_EXCEPTION; @@ -1930,9 +1946,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * sanity checking and refuse to boot. Filter all unsupported * features out. */ - if (!msr_info->host_initiated && - vmx->nested.enlightened_vmcs_enabled) - nested_evmcs_filter_control_msr(msr_info->index, + if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu)) + nested_evmcs_filter_control_msr(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_RTIT_CTL: @@ -2494,6 +2509,30 @@ static bool cpu_has_sgx(void) return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); } +/* + * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they + * can't be used due to errata where VM Exit may incorrectly clear + * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the + * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. + */ +static bool cpu_has_perf_global_ctrl_bug(void) +{ + if (boot_cpu_data.x86 == 0x6) { + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_NEHALEM_EP: /* AAK155 */ + case INTEL_FAM6_NEHALEM: /* AAP115 */ + case INTEL_FAM6_WESTMERE: /* AAT100 */ + case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */ + case INTEL_FAM6_NEHALEM_EX: /* BA97 */ + return true; + default: + break; + } + } + + return false; +} + static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { @@ -2526,13 +2565,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, struct vmx_capability *vmx_cap) { u32 vmx_msr_low, vmx_msr_high; - u32 min, opt, min2, opt2; u32 _pin_based_exec_control = 0; u32 _cpu_based_exec_control = 0; u32 _cpu_based_2nd_exec_control = 0; u64 _cpu_based_3rd_exec_control = 0; u32 _vmexit_control = 0; u32 _vmentry_control = 0; + u64 misc_msr; int i; /* @@ -2552,64 +2591,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); - min = CPU_BASED_HLT_EXITING | -#ifdef CONFIG_X86_64 - CPU_BASED_CR8_LOAD_EXITING | - CPU_BASED_CR8_STORE_EXITING | -#endif - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_UNCOND_IO_EXITING | - CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETTING | - CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING | - CPU_BASED_INVLPG_EXITING | - CPU_BASED_RDPMC_EXITING; - - opt = CPU_BASED_TPR_SHADOW | - CPU_BASED_USE_MSR_BITMAPS | - CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | - CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, - &_cpu_based_exec_control) < 0) + + if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL, + KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL, + MSR_IA32_VMX_PROCBASED_CTLS, + &_cpu_based_exec_control)) return -EIO; -#ifdef CONFIG_X86_64 - if (_cpu_based_exec_control & CPU_BASED_TPR_SHADOW) - _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & - ~CPU_BASED_CR8_STORE_EXITING; -#endif if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { - min2 = 0; - opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_ENABLE_VPID | - SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST | - SECONDARY_EXEC_PAUSE_LOOP_EXITING | - SECONDARY_EXEC_DESC | - SECONDARY_EXEC_ENABLE_RDTSCP | - SECONDARY_EXEC_ENABLE_INVPCID | - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_SHADOW_VMCS | - SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_RDSEED_EXITING | - SECONDARY_EXEC_RDRAND_EXITING | - SECONDARY_EXEC_ENABLE_PML | - SECONDARY_EXEC_TSC_SCALING | - SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | - SECONDARY_EXEC_PT_USE_GPA | - SECONDARY_EXEC_PT_CONCEAL_VMX | - SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_BUS_LOCK_DETECTION | - SECONDARY_EXEC_NOTIFY_VM_EXITING; - if (cpu_has_sgx()) - opt2 |= SECONDARY_EXEC_ENCLS_EXITING; - if (adjust_vmx_controls(min2, opt2, + if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL, + KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL, MSR_IA32_VMX_PROCBASED_CTLS2, - &_cpu_based_2nd_exec_control) < 0) + &_cpu_based_2nd_exec_control)) return -EIO; } #ifndef CONFIG_X86_64 @@ -2627,13 +2619,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &vmx_cap->ept, &vmx_cap->vpid); - if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { - /* CR3 accesses and invlpg don't need to cause VM Exits when EPT - enabled */ - _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_INVLPG_EXITING); - } else if (vmx_cap->ept) { + if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) && + vmx_cap->ept) { pr_warn_once("EPT CAP should not exist if not support " "1-setting enable EPT VM-execution control\n"); @@ -2653,32 +2640,24 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmx_cap->vpid = 0; } - if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { - u64 opt3 = TERTIARY_EXEC_IPI_VIRT; + if (!cpu_has_sgx()) + _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING; - _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, + if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) + _cpu_based_3rd_exec_control = + adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL, MSR_IA32_VMX_PROCBASED_CTLS3); - } - min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; -#ifdef CONFIG_X86_64 - min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; -#endif - opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | - VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_LOAD_IA32_EFER | - VM_EXIT_CLEAR_BNDCFGS | - VM_EXIT_PT_CONCEAL_PIP | - VM_EXIT_CLEAR_IA32_RTIT_CTL; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, - &_vmexit_control) < 0) + if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS, + KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS, + MSR_IA32_VMX_EXIT_CTLS, + &_vmexit_control)) return -EIO; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | - PIN_BASED_VMX_PREEMPTION_TIMER; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) + if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL, + KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL, + MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control)) return -EIO; if (cpu_has_broken_vmx_preemption_timer()) @@ -2687,15 +2666,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; - min = VM_ENTRY_LOAD_DEBUG_CONTROLS; - opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | - VM_ENTRY_LOAD_IA32_PAT | - VM_ENTRY_LOAD_IA32_EFER | - VM_ENTRY_LOAD_BNDCFGS | - VM_ENTRY_PT_CONCEAL_PIP | - VM_ENTRY_LOAD_IA32_RTIT_CTL; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, - &_vmentry_control) < 0) + if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS, + KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS, + MSR_IA32_VMX_ENTRY_CTLS, + &_vmentry_control)) return -EIO; for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { @@ -2715,30 +2689,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, _vmexit_control &= ~x_ctrl; } - /* - * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they - * can't be used due to an errata where VM Exit may incorrectly clear - * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the - * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. - */ - if (boot_cpu_data.x86 == 0x6) { - switch (boot_cpu_data.x86_model) { - case 26: /* AAK155 */ - case 30: /* AAP115 */ - case 37: /* AAT100 */ - case 44: /* BC86,AAY89,BD102 */ - case 46: /* BA97 */ - _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; - _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; - pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " - "does not work properly. Using workaround\n"); - break; - default: - break; - } - } - - rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ @@ -2755,6 +2705,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, if (((vmx_msr_high >> 18) & 15) != 6) return -EIO; + rdmsrl(MSR_IA32_VMX_MISC, misc_msr); + vmcs_conf->size = vmx_msr_high & 0x1fff; vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; @@ -2766,11 +2718,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; - -#if IS_ENABLED(CONFIG_HYPERV) - if (enlightened_vmcs) - evmcs_sanitize_exec_ctrls(vmcs_conf); -#endif + vmcs_conf->misc = misc_msr; return 0; } @@ -3037,10 +2985,15 @@ int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) return 0; vcpu->arch.efer = efer; +#ifdef CONFIG_X86_64 if (efer & EFER_LMA) vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); else vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); +#else + if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm)) + return 1; +#endif vmx_setup_uret_msrs(vmx); return 0; @@ -4327,18 +4280,37 @@ static u32 vmx_vmentry_ctrl(void) if (vmx_pt_mode_is_system()) vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmentry_ctrl & - ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); + /* + * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. + */ + vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_ENTRY_LOAD_IA32_EFER | + VM_ENTRY_IA32E_MODE); + + if (cpu_has_perf_global_ctrl_bug()) + vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + + return vmentry_ctrl; } static u32 vmx_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; + /* + * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for + * nested virtualization and thus allowed to be set in vmcs12. + */ + vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); + if (vmx_pt_mode_is_system()) vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); + + if (cpu_has_perf_global_ctrl_bug()) + vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmexit_ctrl & ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); @@ -4376,20 +4348,38 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; + /* + * Not used by KVM, but fully supported for nesting, i.e. are allowed in + * vmcs12 and propagated to vmcs02 when set in vmcs12. + */ + exec_control &= ~(CPU_BASED_RDTSC_EXITING | + CPU_BASED_USE_IO_BITMAPS | + CPU_BASED_MONITOR_TRAP_FLAG | + CPU_BASED_PAUSE_EXITING); + + /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ + exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | + CPU_BASED_NMI_WINDOW_EXITING); + if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; - if (!cpu_need_tpr_shadow(&vmx->vcpu)) { + if (!cpu_need_tpr_shadow(&vmx->vcpu)) exec_control &= ~CPU_BASED_TPR_SHADOW; + #ifdef CONFIG_X86_64 + if (exec_control & CPU_BASED_TPR_SHADOW) + exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING); + else exec_control |= CPU_BASED_CR8_STORE_EXITING | CPU_BASED_CR8_LOAD_EXITING; #endif - } - if (!enable_ept) - exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_INVLPG_EXITING; + /* No need to intercept CR3 access or INVPLG when using EPT. */ + if (enable_ept) + exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); if (kvm_mwait_in_guest(vmx->vcpu.kvm)) exec_control &= ~(CPU_BASED_MWAIT_EXITING | CPU_BASED_MONITOR_EXITING); @@ -5155,8 +5145,10 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) * instruction. ICEBP generates a trap-like #DB, but * despite its interception control being tied to #DB, * is an instruction intercept, i.e. the VM-Exit occurs - * on the ICEBP itself. Note, skipping ICEBP also - * clears STI and MOVSS blocking. + * on the ICEBP itself. Use the inner "skip" helper to + * avoid single-step #DB and MTF updates, as ICEBP is + * higher priority. Note, skipping ICEBP still clears + * STI and MOVSS blocking. * * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS * if single-step is enabled in RFLAGS and STI or MOVSS @@ -5638,7 +5630,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); - trace_kvm_page_fault(gpa, exit_qualification); + trace_kvm_page_fault(vcpu, gpa, exit_qualification); /* Is it a read fault? */ error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) @@ -5710,7 +5702,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); return vmx->emulation_required && !vmx->rmode.vm86_active && - (vcpu->arch.exception.pending || vcpu->arch.exception.injected); + (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); } static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) @@ -7430,7 +7422,7 @@ static int __init vmx_check_processor_compat(void) if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) return -EIO; if (nested) - nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept); + nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", smp_processor_id()); @@ -8070,7 +8062,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .patch_hypercall = vmx_patch_hypercall, .inject_irq = vmx_inject_irq, .inject_nmi = vmx_inject_nmi, - .queue_exception = vmx_queue_exception, + .inject_exception = vmx_inject_exception, .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, @@ -8227,6 +8219,10 @@ static __init int hardware_setup(void) if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) return -EIO; + if (cpu_has_perf_global_ctrl_bug()) + pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + "does not work properly. Using workaround\n"); + if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@ -8341,11 +8337,9 @@ static __init int hardware_setup(void) if (enable_preemption_timer) { u64 use_timer_freq = 5000ULL * 1000 * 1000; - u64 vmx_msr; - rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); cpu_preemption_timer_multi = - vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; + vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; if (tsc_khz) use_timer_freq = (u64)tsc_khz * 1000; @@ -8381,8 +8375,7 @@ static __init int hardware_setup(void) setup_default_sgx_lepubkeyhash(); if (nested) { - nested_vmx_setup_ctls_msrs(&vmcs_config.nested, - vmx_capability.ept); + nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); if (r) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 24d58c2ffaa3..a3da84f4ea45 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -477,29 +477,145 @@ static inline u8 vmx_get_rvi(void) return vmcs_read16(GUEST_INTR_STATUS) & 0xff; } -#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \ -static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \ -{ \ - if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ - vmcs_write##bits(uname, val); \ - vmx->loaded_vmcs->controls_shadow.lname = val; \ - } \ -} \ -static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \ -{ \ - return vmcs->controls_shadow.lname; \ -} \ -static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \ -{ \ - return __##lname##_controls_get(vmx->loaded_vmcs); \ -} \ -static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ -} \ -static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \ -{ \ - lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ +#define __KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS) +#ifdef CONFIG_X86_64 + #define KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \ + (__KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS | \ + VM_ENTRY_IA32E_MODE) +#else + #define KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \ + __KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS +#endif +#define KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS \ + (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_ENTRY_LOAD_IA32_PAT | \ + VM_ENTRY_LOAD_IA32_EFER | \ + VM_ENTRY_LOAD_BNDCFGS | \ + VM_ENTRY_PT_CONCEAL_PIP | \ + VM_ENTRY_LOAD_IA32_RTIT_CTL) + +#define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ + (VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_ACK_INTR_ON_EXIT) +#ifdef CONFIG_X86_64 + #define KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ + (__KVM_REQUIRED_VMX_VM_EXIT_CONTROLS | \ + VM_EXIT_HOST_ADDR_SPACE_SIZE) +#else + #define KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ + __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS +#endif +#define KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS \ + (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \ + VM_EXIT_SAVE_IA32_PAT | \ + VM_EXIT_LOAD_IA32_PAT | \ + VM_EXIT_SAVE_IA32_EFER | \ + VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | \ + VM_EXIT_LOAD_IA32_EFER | \ + VM_EXIT_CLEAR_BNDCFGS | \ + VM_EXIT_PT_CONCEAL_PIP | \ + VM_EXIT_CLEAR_IA32_RTIT_CTL) + +#define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ + (PIN_BASED_EXT_INTR_MASK | \ + PIN_BASED_NMI_EXITING) +#define KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL \ + (PIN_BASED_VIRTUAL_NMIS | \ + PIN_BASED_POSTED_INTR | \ + PIN_BASED_VMX_PREEMPTION_TIMER) + +#define __KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \ + (CPU_BASED_HLT_EXITING | \ + CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING | \ + CPU_BASED_UNCOND_IO_EXITING | \ + CPU_BASED_MOV_DR_EXITING | \ + CPU_BASED_USE_TSC_OFFSETTING | \ + CPU_BASED_MWAIT_EXITING | \ + CPU_BASED_MONITOR_EXITING | \ + CPU_BASED_INVLPG_EXITING | \ + CPU_BASED_RDPMC_EXITING | \ + CPU_BASED_INTR_WINDOW_EXITING) + +#ifdef CONFIG_X86_64 + #define KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \ + (__KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL | \ + CPU_BASED_CR8_LOAD_EXITING | \ + CPU_BASED_CR8_STORE_EXITING) +#else + #define KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \ + __KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL +#endif + +#define KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL \ + (CPU_BASED_RDTSC_EXITING | \ + CPU_BASED_TPR_SHADOW | \ + CPU_BASED_USE_IO_BITMAPS | \ + CPU_BASED_MONITOR_TRAP_FLAG | \ + CPU_BASED_USE_MSR_BITMAPS | \ + CPU_BASED_NMI_WINDOW_EXITING | \ + CPU_BASED_PAUSE_EXITING | \ + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | \ + CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) + +#define KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL 0 +#define KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL \ + (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \ + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \ + SECONDARY_EXEC_WBINVD_EXITING | \ + SECONDARY_EXEC_ENABLE_VPID | \ + SECONDARY_EXEC_ENABLE_EPT | \ + SECONDARY_EXEC_UNRESTRICTED_GUEST | \ + SECONDARY_EXEC_PAUSE_LOOP_EXITING | \ + SECONDARY_EXEC_DESC | \ + SECONDARY_EXEC_ENABLE_RDTSCP | \ + SECONDARY_EXEC_ENABLE_INVPCID | \ + SECONDARY_EXEC_APIC_REGISTER_VIRT | \ + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \ + SECONDARY_EXEC_SHADOW_VMCS | \ + SECONDARY_EXEC_XSAVES | \ + SECONDARY_EXEC_RDSEED_EXITING | \ + SECONDARY_EXEC_RDRAND_EXITING | \ + SECONDARY_EXEC_ENABLE_PML | \ + SECONDARY_EXEC_TSC_SCALING | \ + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \ + SECONDARY_EXEC_PT_USE_GPA | \ + SECONDARY_EXEC_PT_CONCEAL_VMX | \ + SECONDARY_EXEC_ENABLE_VMFUNC | \ + SECONDARY_EXEC_BUS_LOCK_DETECTION | \ + SECONDARY_EXEC_NOTIFY_VM_EXITING | \ + SECONDARY_EXEC_ENCLS_EXITING) + +#define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0 +#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \ + (TERTIARY_EXEC_IPI_VIRT) + +#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \ +static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + if (vmx->loaded_vmcs->controls_shadow.lname != val) { \ + vmcs_write##bits(uname, val); \ + vmx->loaded_vmcs->controls_shadow.lname = val; \ + } \ +} \ +static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ +static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \ +{ \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ +} \ +static __always_inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \ + lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \ +} \ +static __always_inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \ +{ \ + BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \ + lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \ } BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32) BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32) @@ -626,4 +742,14 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && enable_ipiv; } +static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu) +{ + /* + * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and + * eVMCS has been explicitly enabled by userspace. + */ + return vcpu->arch.hyperv_enabled && + to_vmx(vcpu)->nested.enlightened_vmcs_enabled; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 5cfc49ddb1b4..ec268df83ed6 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -10,7 +10,7 @@ #include "vmcs.h" #include "../x86.h" -asmlinkage void vmread_error(unsigned long field, bool fault); +void vmread_error(unsigned long field, bool fault); __attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, bool fault); void vmwrite_error(unsigned long field, unsigned long value); |