Diffstat (limited to 'arch/x86/kvm/vmx/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 681
1 file changed, 258 insertions(+), 423 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5c5766467a61..aa157fe5b7b3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -46,6 +46,7 @@ #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/mshyperv.h> +#include <asm/msr.h> #include <asm/mwait.h> #include <asm/spec-ctrl.h> #include <asm/vmx.h> @@ -53,6 +54,7 @@ #include <trace/events/ipi.h> #include "capabilities.h" +#include "common.h" #include "cpuid.h" #include "hyperv.h" #include "kvm_onhyperv.h" @@ -73,6 +75,8 @@ #include "vmx_onhyperv.h" #include "posted_intr.h" +#include "mmu/spte.h" + MODULE_AUTHOR("Qumranet"); MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions"); MODULE_LICENSE("GPL"); @@ -111,10 +115,10 @@ static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, 0444); module_param(enable_apicv, bool, 0444); - -bool __read_mostly enable_ipiv = true; module_param(enable_ipiv, bool, 0444); +module_param(enable_device_posted_irqs, bool, 0444); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -164,31 +168,6 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); RTIT_STATUS_BYTECNT)) /* - * List of MSRs that can be directly passed to the guest. - * In addition to these x2apic, PT and LBR MSRs are handled specially. - */ -static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { - MSR_IA32_SPEC_CTRL, - MSR_IA32_PRED_CMD, - MSR_IA32_FLUSH_CMD, - MSR_IA32_TSC, -#ifdef CONFIG_X86_64 - MSR_FS_BASE, - MSR_GS_BASE, - MSR_KERNEL_GS_BASE, - MSR_IA32_XFD, - MSR_IA32_XFD_ERR, -#endif - MSR_IA32_SYSENTER_CS, - MSR_IA32_SYSENTER_ESP, - MSR_IA32_SYSENTER_EIP, - MSR_CORE_C1_RES, - MSR_CORE_C3_RESIDENCY, - MSR_CORE_C6_RESIDENCY, - MSR_CORE_C7_RESIDENCY, -}; - -/* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive * executions of PAUSE in a loop. Also indicate if ple enabled. @@ -273,6 +252,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) case L1TF_MITIGATION_OFF: l1tf = VMENTER_L1D_FLUSH_NEVER; break; + case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: @@ -380,9 +360,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) if (!vmx->disable_fb_clear) return; - msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); + msr = native_rdmsrq(MSR_IA32_MCU_OPT_CTRL); msr |= FB_CLEAR_DIS; - native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */ vmx->msr_ia32_mcu_opt_ctrl = msr; } @@ -393,7 +373,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) return; vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; - native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); + native_wrmsrq(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); } static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) @@ -669,40 +649,6 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) return flexpriority_enabled && lapic_in_kernel(vcpu); } -static int vmx_get_passthrough_msr_slot(u32 msr) -{ - int i; - - switch (msr) { - case 0x800 ... 0x8ff: - /* x2APIC MSRs. 
These are handled in vmx_update_msr_bitmap_x2apic() */ - return -ENOENT; - case MSR_IA32_RTIT_STATUS: - case MSR_IA32_RTIT_OUTPUT_BASE: - case MSR_IA32_RTIT_OUTPUT_MASK: - case MSR_IA32_RTIT_CR3_MATCH: - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: - /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ - case MSR_LBR_SELECT: - case MSR_LBR_TOS: - case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: - case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: - case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: - case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: - case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: - /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */ - return -ENOENT; - } - - for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { - if (vmx_possible_passthrough_msrs[i] == msr) - return i; - } - - WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); - return -ENOENT; -} - struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -769,8 +715,11 @@ void vmx_emergency_disable_virtualization_cpu(void) return; list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), - loaded_vmcss_on_cpu_link) + loaded_vmcss_on_cpu_link) { vmcs_clear(v->vmcs); + if (v->shadow_vmcs) + vmcs_clear(v->shadow_vmcs); + } kvm_cpu_vmxoff(); } @@ -955,6 +904,10 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) flags |= VMX_RUN_SAVE_SPEC_CTRL; + if (static_branch_unlikely(&cpu_buf_vm_clear) && + kvm_vcpu_can_access_host_mmio(&vmx->vcpu)) + flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO; + return flags; } @@ -1063,7 +1016,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, * provide that period, so a CPU could write host's record into * guest's memory. 
*/ - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); + wrmsrq(MSR_IA32_PEBS_ENABLE, 0); } i = vmx_find_loadstore_msr_slot(&m->guest, msr); @@ -1192,13 +1145,13 @@ static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; - wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); - wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); - wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); - wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + wrmsrq(MSR_IA32_RTIT_STATUS, ctx->status); + wrmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + wrmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + wrmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { - wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); - wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + wrmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + wrmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } @@ -1206,13 +1159,13 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) { u32 i; - rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); - rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); - rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); - rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); + rdmsrq(MSR_IA32_RTIT_STATUS, ctx->status); + rdmsrq(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); + rdmsrq(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); + rdmsrq(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); for (i = 0; i < addr_range; i++) { - rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); - rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); + rdmsrq(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); + rdmsrq(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); } } @@ -1225,9 +1178,9 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) * GUEST_IA32_RTIT_CTL is already set in the VMCS. * Save host state before VM entry. */ - rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + rdmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { - wrmsrl(MSR_IA32_RTIT_CTL, 0); + wrmsrq(MSR_IA32_RTIT_CTL, 0); pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); } @@ -1248,7 +1201,7 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. 
*/ if (vmx->pt_desc.host.ctl) - wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + wrmsrq(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, @@ -1281,6 +1234,7 @@ void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); @@ -1309,7 +1263,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) if (vmx->nested.need_vmcs12_to_shadow_sync) nested_sync_vmcs12_to_shadow(vcpu); - if (vmx->guest_state_loaded) + if (vt->guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; @@ -1330,15 +1284,15 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) fs_sel = current->thread.fsindex; gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; - vmx->msr_host_kernel_gs_base = current->thread.gsbase; + vt->msr_host_kernel_gs_base = current->thread.gsbase; } else { savesegment(fs, fs_sel); savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); - vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else savesegment(fs, fs_sel); savesegment(gs, gs_sel); @@ -1347,14 +1301,14 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); - vmx->guest_state_loaded = true; + vt->guest_state_loaded = true; } static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { struct vmcs_host_state *host_state; - if (!vmx->guest_state_loaded) + if (!vmx->vt.guest_state_loaded) return; host_state = &vmx->loaded_vmcs->host_state; @@ -1362,7 +1316,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) ++vmx->vcpu.stat.host_state_reload; #ifdef CONFIG_X86_64 - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif if (host_state->ldt_sel || (host_state->gs_sel & 7)) { kvm_load_ldt(host_state->ldt_sel); @@ -1382,10 +1336,10 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #endif invalidate_tss_limit(); #ifdef CONFIG_X86_64 - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrq(MSR_KERNEL_GS_BASE, vmx->vt.msr_host_kernel_gs_base); #endif load_fixmap_gdt(raw_smp_processor_id()); - vmx->guest_state_loaded = false; + vmx->vt.guest_state_loaded = false; vmx->guest_uret_msrs_loaded = false; } @@ -1393,8 +1347,8 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { preempt_disable(); - if (vmx->guest_state_loaded) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + if (vmx->vt.guest_state_loaded) + rdmsrq(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); preempt_enable(); return vmx->msr_guest_kernel_gs_base; } @@ -1402,8 +1356,8 @@ static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { preempt_disable(); - if (vmx->guest_state_loaded) - wrmsrl(MSR_KERNEL_GS_BASE, data); + if (vmx->vt.guest_state_loaded) + wrmsrq(MSR_KERNEL_GS_BASE, data); preempt_enable(); vmx->msr_guest_kernel_gs_base = data; } @@ -1441,8 +1395,7 @@ static void 
shrink_ple_window(struct kvm_vcpu *vcpu) } } -void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, - struct loaded_vmcs *buddy) +void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; @@ -1469,17 +1422,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, if (prev != vmx->loaded_vmcs->vmcs) { per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; vmcs_load(vmx->loaded_vmcs->vmcs); - - /* - * No indirect branch prediction barrier needed when switching - * the active VMCS within a vCPU, unless IBRS is advertised to - * the vCPU. To minimize the number of IBPBs executed, KVM - * performs IBPB on nested VM-Exit (a single nested transition - * may switch the active VMCS multiple times). - */ - if (static_branch_likely(&switch_vcpu_ibpb) && - (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))) - indirect_branch_prediction_barrier(); } if (!already_loaded) { @@ -1518,7 +1460,7 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm)) shrink_ple_window(vcpu); - vmx_vcpu_load_vmcs(vcpu, cpu, NULL); + vmx_vcpu_load_vmcs(vcpu, cpu); vmx_vcpu_pi_load(vcpu, cpu); } @@ -1579,7 +1521,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) - vmx->emulation_required = vmx_emulation_required(vcpu); + vmx->vt.emulation_required = vmx_emulation_required(vcpu); } bool vmx_get_if_flag(struct kvm_vcpu *vcpu) @@ -1699,7 +1641,7 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, * so that guest userspace can't DoS the guest simply by triggering * emulation (enclaves are CPL3 only). */ - if (to_vmx(vcpu)->exit_reason.enclave_mode) { + if (vmx_get_exit_reason(vcpu).enclave_mode) { kvm_queue_exception(vcpu, UD_VECTOR); return X86EMUL_PROPAGATE_FAULT; } @@ -1714,7 +1656,7 @@ int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, static int skip_emulated_instruction(struct kvm_vcpu *vcpu) { - union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); unsigned long rip, orig_rip; u32 instr_len; @@ -1861,7 +1803,7 @@ void vmx_inject_exception(struct kvm_vcpu *vcpu) return; } - WARN_ON_ONCE(vmx->emulation_required); + WARN_ON_ONCE(vmx->vt.emulation_required); if (kvm_exception_is_soft(ex->vector)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, @@ -2152,7 +2094,7 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; case MSR_IA32_DEBUGCTLMSR: - msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); + msr_info->data = vmx_guest_debugctl_read(); break; default: find_uret_msr: @@ -2177,7 +2119,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, return (unsigned long)data; } -static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) +u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) { u64 debugctl = 0; @@ -2189,9 +2131,25 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (boot_cpu_has(X86_FEATURE_RTM) && + (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))) + debugctl |= DEBUGCTLMSR_RTM_DEBUG; + return debugctl; } +bool vmx_is_valid_debugctl(struct kvm_vcpu *vcpu, u64 data, bool 
host_initiated) +{ + u64 invalid; + + invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated); + if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) { + kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data); + invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR); + } + return !invalid; +} + /* * Writes msr value into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -2260,29 +2218,22 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } vmcs_writel(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_DEBUGCTLMSR: { - u64 invalid; - - invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); - if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { - kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); - data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); - invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); - } - - if (invalid) + case MSR_IA32_DEBUGCTLMSR: + if (!vmx_is_valid_debugctl(vcpu, data, msr_info->host_initiated)) return 1; + data &= vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) get_vmcs12(vcpu)->guest_ia32_debugctl = data; - vmcs_write64(GUEST_IA32_DEBUGCTL, data); + vmx_guest_debugctl_write(vcpu, data); + if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && (data & DEBUGCTLMSR_LBR)) intel_pmu_create_guest_lbr_event(vcpu); return 0; - } case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported() || (!msr_info->host_initiated && @@ -2574,7 +2525,7 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) { u64 allowed; - rdmsrl(msr, allowed); + rdmsrq(msr, allowed); return ctl_opt & allowed; } @@ -2746,7 +2697,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, break; } - rdmsrl(MSR_IA32_VMX_BASIC, basic_msr); + rdmsrq(MSR_IA32_VMX_BASIC, basic_msr); /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE) @@ -2766,7 +2717,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB) return -EIO; - rdmsrl(MSR_IA32_VMX_MISC, misc_msr); + rdmsrq(MSR_IA32_VMX_MISC, misc_msr); vmcs_conf->basic = basic_msr; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; @@ -2850,7 +2801,7 @@ static int kvm_cpu_vmxon(u64 vmxon_pointer) fault: WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", - rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 
0xdeadbeef : msr); cr4_clear_bits(X86_CR4_VMXE); return -EFAULT; @@ -3404,7 +3355,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } /* depends on vcpu->arch.cr0 to be set to a new value */ - vmx->emulation_required = vmx_emulation_required(vcpu); + vmx->vt.emulation_required = vmx_emulation_required(vcpu); } static int vmx_get_max_ept_level(void) @@ -3667,7 +3618,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { __vmx_set_segment(vcpu, var, seg); - to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); + to_vmx(vcpu)->vt.emulation_required = vmx_emulation_required(vcpu); } void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -4016,76 +3967,29 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) vmx->nested.force_msr_bitmap_recalc = true; } -void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) +void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; - int idx; if (!cpu_has_vmx_msr_bitmap()) return; vmx_msr_bitmap_l01_changed(vmx); - /* - * Mark the desired intercept state in shadow bitmap, this is needed - * for resync when the MSR filters change. - */ - idx = vmx_get_passthrough_msr_slot(msr); - if (idx >= 0) { - if (type & MSR_TYPE_R) - clear_bit(idx, vmx->shadow_msr_intercept.read); - if (type & MSR_TYPE_W) - clear_bit(idx, vmx->shadow_msr_intercept.write); - } - - if ((type & MSR_TYPE_R) && - !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { - vmx_set_msr_bitmap_read(msr_bitmap, msr); - type &= ~MSR_TYPE_R; - } - - if ((type & MSR_TYPE_W) && - !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { - vmx_set_msr_bitmap_write(msr_bitmap, msr); - type &= ~MSR_TYPE_W; + if (type & MSR_TYPE_R) { + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) + vmx_clear_msr_bitmap_read(msr_bitmap, msr); + else + vmx_set_msr_bitmap_read(msr_bitmap, msr); } - if (type & MSR_TYPE_R) - vmx_clear_msr_bitmap_read(msr_bitmap, msr); - - if (type & MSR_TYPE_W) - vmx_clear_msr_bitmap_write(msr_bitmap, msr); -} - -void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; - int idx; - - if (!cpu_has_vmx_msr_bitmap()) - return; - - vmx_msr_bitmap_l01_changed(vmx); - - /* - * Mark the desired intercept state in shadow bitmap, this is needed - * for resync when the MSR filter changes. - */ - idx = vmx_get_passthrough_msr_slot(msr); - if (idx >= 0) { - if (type & MSR_TYPE_R) - set_bit(idx, vmx->shadow_msr_intercept.read); - if (type & MSR_TYPE_W) - set_bit(idx, vmx->shadow_msr_intercept.write); + if (type & MSR_TYPE_W) { + if (!set && kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) + vmx_clear_msr_bitmap_write(msr_bitmap, msr); + else + vmx_set_msr_bitmap_write(msr_bitmap, msr); } - - if (type & MSR_TYPE_R) - vmx_set_msr_bitmap_read(msr_bitmap, msr); - - if (type & MSR_TYPE_W) - vmx_set_msr_bitmap_write(msr_bitmap, msr); } static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) @@ -4164,79 +4068,57 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) } } -void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) +void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 i; - if (!cpu_has_vmx_msr_bitmap()) return; - /* - * Redo intercept permissions for MSRs that KVM is passing through to - * the guest. 
Disabling interception will check the new MSR filter and - * ensure that KVM enables interception if userspace wants to filter - * the MSR. MSRs that KVM is already intercepting don't need to be - * refreshed since KVM is going to intercept them regardless of what - * userspace wants. - */ - for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { - u32 msr = vmx_possible_passthrough_msrs[i]; - - if (!test_bit(i, vmx->shadow_msr_intercept.read)) - vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); - - if (!test_bit(i, vmx->shadow_msr_intercept.write)) - vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); +#ifdef CONFIG_X86_64 + vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); +#endif + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); + if (kvm_cstate_in_guest(vcpu->kvm)) { + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); + } + if (kvm_aperfmperf_in_guest(vcpu->kvm)) { + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_APERF, MSR_TYPE_R); + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); } /* PT MSRs can be passed through iff PT is exposed to the guest. */ if (vmx_pt_mode_is_host_guest()) pt_update_intercept_for_msr(vcpu); -} -static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - int pi_vec) -{ -#ifdef CONFIG_SMP - if (vcpu->mode == IN_GUEST_MODE) { - /* - * The vector of the virtual interrupt has already been set in the PIR. - * Send a notification event to deliver the virtual interrupt - * unless the vCPU is the currently running vCPU, i.e. the - * event is being sent from a fastpath VM-Exit handler, in - * which case the PIR will be synced to the vIRR before - * re-entering the guest. - * - * When the target is not the running vCPU, the following - * possibilities emerge: - * - * Case 1: vCPU stays in non-root mode. Sending a notification - * event posts the interrupt to the vCPU. - * - * Case 2: vCPU exits to root mode and is still runnable. The - * PIR will be synced to the vIRR before re-entering the guest. - * Sending a notification event is ok as the host IRQ handler - * will ignore the spurious event. - * - * Case 3: vCPU exits to root mode and is blocked. vcpu_block() - * has already synced PIR to vIRR and never blocks the vCPU if - * the vIRR is not empty. Therefore, a blocked vCPU here does - * not wait for any requested interrupts in PIR, and sending a - * notification event also results in a benign, spurious event.
- */ + if (vcpu->arch.xfd_no_write_intercept) + vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, MSR_TYPE_RW); + + vmx_set_intercept_for_msr(vcpu, MSR_IA32_SPEC_CTRL, MSR_TYPE_RW, + !to_vmx(vcpu)->spec_ctrl); + + if (kvm_cpu_cap_has(X86_FEATURE_XFD)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); + + if (cpu_feature_enabled(X86_FEATURE_IBPB)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, + !guest_has_pred_cmd_msr(vcpu)); + + if (cpu_feature_enabled(X86_FEATURE_FLUSH_L1D)) + vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); - if (vcpu != kvm_get_running_vcpu()) - __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return; - } -#endif /* - * The vCPU isn't in the guest; wake the vCPU in case it is blocking, - * otherwise do nothing as KVM will grab the highest priority pending - * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + * x2APIC and LBR MSR intercepts are modified on-demand and cannot be + * filtered by userspace. */ - kvm_vcpu_wake_up(vcpu); } static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, @@ -4287,7 +4169,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, */ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); int r; r = vmx_deliver_nested_posted_interrupt(vcpu, vector); @@ -4298,20 +4180,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) if (!vcpu->arch.apic->apicv_active) return -1; - if (pi_test_and_set_pir(vector, &vmx->pi_desc)) - return 0; - - /* If a previous notification has sent the IPI, nothing to do. */ - if (pi_test_and_set_on(&vmx->pi_desc)) - return 0; - - /* - * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() - * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is - * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a - * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. - */ - kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); + __vmx_deliver_posted_interrupt(vcpu, &vt->pi_desc, vector); return 0; } @@ -4391,7 +4260,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); - rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); + rdmsrq(MSR_IA32_SYSENTER_EIP, tmpl); vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { @@ -4778,7 +4647,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write16(GUEST_INTR_STATUS, 0); vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); - vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->vt.pi_desc))); } if (vmx_can_use_ipiv(&vmx->vcpu)) { @@ -4850,7 +4719,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write32(GUEST_SYSENTER_CS, 0); vmcs_writel(GUEST_SYSENTER_ESP, 0); vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + vmx_guest_debugctl_write(&vmx->vcpu, 0); if (cpu_has_vmx_tpr_shadow()) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); @@ -4891,8 +4761,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR * or POSTED_INTR_WAKEUP_VECTOR. 
*/ - vmx->pi_desc.nv = POSTED_INTR_VECTOR; - __pi_set_sn(&vmx->pi_desc); + vmx->vt.pi_desc.nv = POSTED_INTR_VECTOR; + __pi_set_sn(&vmx->vt.pi_desc); } void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -5666,12 +5536,6 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) set_debugreg(DR6_RESERVED, 6); } -void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) -{ - lockdep_assert_irqs_disabled(); - set_debugreg(vcpu->arch.dr6, 6); -} - void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -5809,11 +5673,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) static int handle_ept_violation(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); gpa_t gpa; - u64 error_code; - - exit_qualification = vmx_get_exit_qual(vcpu); /* * EPT violation happened while executing iret from NMI, @@ -5829,23 +5690,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(vcpu, gpa, exit_qualification); - /* Is it a read fault? */ - error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) - ? PFERR_USER_MASK : 0; - /* Is it a write fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) - ? PFERR_WRITE_MASK : 0; - /* Is it a fetch fault? */ - error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) - ? PFERR_FETCH_MASK : 0; - /* ept page table entry is present? */ - error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK) - ? PFERR_PRESENT_MASK : 0; - - if (error_code & EPT_VIOLATION_GVA_IS_VALID) - error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ? - PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; - /* * Check that the GPA doesn't exceed physical memory limits, as that is * a guest page fault. We have to emulate the instruction here, because @@ -5857,7 +5701,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); - return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); + return __vmx_handle_ept_violation(vcpu, gpa, exit_qualification); } static int handle_ept_misconfig(struct kvm_vcpu *vcpu) @@ -5902,7 +5746,7 @@ static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx->emulation_required) + if (!vmx->vt.emulation_required) return false; /* @@ -5934,7 +5778,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) intr_window_requested = exec_controls_get(vmx) & CPU_BASED_INTR_WINDOW_EXITING; - while (vmx->emulation_required && count-- != 0) { + while (vmx->vt.emulation_required && count-- != 0) { if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) return handle_interrupt_window(&vmx->vcpu); @@ -6129,7 +5973,7 @@ static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) * VM-Exits. Unconditionally set the flag here and leave the handling to * vmx_handle_exit(). 
*/ - to_vmx(vcpu)->exit_reason.bus_lock_detected = true; + to_vt(vcpu)->exit_reason.bus_lock_detected = true; return 1; } @@ -6227,9 +6071,9 @@ void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); - *reason = vmx->exit_reason.full; + *reason = vmx->vt.exit_reason.full; *info1 = vmx_get_exit_qual(vcpu); - if (!(vmx->exit_reason.failed_vmentry)) { + if (!(vmx->vt.exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; *intr_info = vmx_get_intr_info(vcpu); if (is_exception_with_error_code(*intr_info)) @@ -6525,7 +6369,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); - union vmx_exit_reason exit_reason = vmx->exit_reason; + union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; u16 exit_handler_index; @@ -6581,7 +6425,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * the least awful solution for the userspace case without * risking false positives. */ - if (vmx->emulation_required) { + if (vmx->vt.emulation_required) { nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); return 1; } @@ -6591,7 +6435,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) } /* If guest state is invalid, start emulating. L2 is handled above. */ - if (vmx->emulation_required) + if (vmx->vt.emulation_required) return handle_invalid_guest_state(vcpu); if (exit_reason.failed_vmentry) { @@ -6691,7 +6535,7 @@ int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * Exit to user space when bus lock detected to inform that there is * a bus lock in guest. */ - if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { + if (vmx_get_exit_reason(vcpu).bus_lock_detected) { if (ret > 0) vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; @@ -6745,7 +6589,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) vcpu->stat.l1d_flush++; if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + native_wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); return; } @@ -6970,22 +6814,22 @@ static void vmx_set_rvi(int vector) int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vcpu_vt *vt = to_vt(vcpu); int max_irr; bool got_posted_interrupt; if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; - if (pi_test_on(&vmx->pi_desc)) { - pi_clear_on(&vmx->pi_desc); + if (pi_test_on(&vt->pi_desc)) { + pi_clear_on(&vt->pi_desc); /* * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); got_posted_interrupt = - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + kvm_apic_update_irr(vcpu, vt->pi_desc.pir, &max_irr); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); got_posted_interrupt = false; @@ -7025,14 +6869,6 @@ void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); } -void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - pi_clear_on(&vmx->pi_desc); - memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); -} - void vmx_do_interrupt_irqoff(unsigned long entry); void vmx_do_nmi_irqoff(void); @@ -7052,7 +6888,7 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) * the #NM exception. 
*/ if (is_xfd_nm_fault(vcpu)) - rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + rdmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); } static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) @@ -7089,14 +6925,12 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu, void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (vmx->emulation_required) + if (to_vt(vcpu)->emulation_required) return; - if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) + if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu)); - else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) + else if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_EXCEPTION_NMI) handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu)); } @@ -7307,7 +7141,7 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, return; if (flags & VMX_RUN_SAVE_SPEC_CTRL) - vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); + vmx->spec_ctrl = native_rdmsrq(MSR_IA32_SPEC_CTRL); /* * If the guest/host SPEC_CTRL values differ, restore the host value. @@ -7318,7 +7152,7 @@ */ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || vmx->spec_ctrl != hostval) - native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); + native_wrmsrq(MSR_IA32_SPEC_CTRL, hostval); barrier_nospec(); } @@ -7331,10 +7165,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, * the fastpath even, all other exits must use the slow path. */ if (is_guest_mode(vcpu) && - to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) + vmx_get_exit_reason(vcpu).basic != EXIT_REASON_PREEMPTION_TIMER) return EXIT_FASTPATH_NONE; - switch (to_vmx(vcpu)->exit_reason.basic) { + switch (vmx_get_exit_reason(vcpu).basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: @@ -7346,6 +7180,20 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, } } +noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu) +{ + if ((u16)vmx_get_exit_reason(vcpu).basic != EXIT_REASON_EXCEPTION_NMI || + !is_nmi(vmx_get_intr_info(vcpu))) + return; + + kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (cpu_feature_enabled(X86_FEATURE_FRED)) + fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); + else + vmx_do_nmi_irqoff(); + kvm_after_interrupt(vcpu); +} + static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, unsigned int flags) { @@ -7358,12 +7206,16 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * mitigation for MDS is done late in VMentry and is still * executed in spite of L1D Flush. This is because an extra VERW * should not matter much after the big hammer L1D Flush. + * + * cpu_buf_vm_clear is used when the system is not vulnerable to MDS/TAA, + * and is affected by MMIO Stale Data. In such cases mitigation is only + * needed against an MMIO capable guest.
*/ if (static_branch_unlikely(&vmx_l1d_should_flush)) vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mmio_stale_data_clear) && - kvm_arch_has_assigned_device(vcpu->kvm)) - mds_clear_cpu_buffers(); + else if (static_branch_unlikely(&cpu_buf_vm_clear) && + (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO)) + x86_clear_cpu_buffers(); vmx_disable_fb_clear(vmx); @@ -7381,30 +7233,23 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vmx_enable_fb_clear(vmx); if (unlikely(vmx->fail)) { - vmx->exit_reason.full = 0xdead; + vmx->vt.exit_reason.full = 0xdead; goto out; } - vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); - if (likely(!vmx->exit_reason.failed_vmentry)) + vmx->vt.exit_reason.full = vmcs_read32(VM_EXIT_REASON); + if (likely(!vmx_get_exit_reason(vcpu).failed_vmentry)) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && - is_nmi(vmx_get_intr_info(vcpu))) { - kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - if (cpu_feature_enabled(X86_FEATURE_FRED)) - fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR); - else - vmx_do_nmi_irqoff(); - kvm_after_interrupt(vcpu); - } + vmx_handle_nmi(vcpu); out: guest_state_exit_irqoff(); } -fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) +fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) { + bool force_immediate_exit = run_flags & KVM_RUN_FORCE_IMMEDIATE_EXIT; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; @@ -7418,15 +7263,15 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) * start emulation until we arrive back to a valid state. Synthesize a * consistency check VM-Exit due to invalid guest state and bail. */ - if (unlikely(vmx->emulation_required)) { + if (unlikely(vmx->vt.emulation_required)) { vmx->fail = 0; - vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; - vmx->exit_reason.failed_vmentry = 1; + vmx->vt.exit_reason.full = EXIT_REASON_INVALID_STATE; + vmx->vt.exit_reason.failed_vmentry = 1; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); - vmx->exit_qualification = ENTRY_FAIL_DEFAULT; + vmx->vt.exit_qualification = ENTRY_FAIL_DEFAULT; kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); - vmx->exit_intr_info = 0; + vmx->vt.exit_intr_info = 0; return EXIT_FASTPATH_NONE; } @@ -7449,6 +7294,12 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); vcpu->arch.regs_dirty = 0; + if (run_flags & KVM_RUN_LOAD_GUEST_DR6) + set_debugreg(vcpu->arch.dr6, 6); + + if (run_flags & KVM_RUN_LOAD_DEBUGCTL) + vmx_reload_guest_debugctl(vcpu); + /* * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time @@ -7529,7 +7380,7 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) * checking. 
*/ if (vmx->nested.nested_run_pending && - !vmx->exit_reason.failed_vmentry) + !vmx_get_exit_reason(vcpu).failed_vmentry) ++vcpu->stat.nested_run; vmx->nested.nested_run_pending = 0; @@ -7538,12 +7389,12 @@ if (unlikely(vmx->fail)) return EXIT_FASTPATH_NONE; - if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) + if (unlikely((u16)vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY)) kvm_machine_check(); trace_kvm_exit(vcpu, KVM_ISA_VMX); - if (unlikely(vmx->exit_reason.failed_vmentry)) + if (unlikely(vmx_get_exit_reason(vcpu).failed_vmentry)) return EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; @@ -7575,7 +7426,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); - INIT_LIST_HEAD(&vmx->pi_wakeup_list); + INIT_LIST_HEAD(&vmx->vt.pi_wakeup_list); err = -ENOMEM; @@ -7623,26 +7474,6 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu) evmcs->hv_enlightenments_control.msr_bitmap = 1; } - /* The MSR bitmap starts with all ones */ - bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); - bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); - - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); -#ifdef CONFIG_X86_64 - vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); -#endif - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); - vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); - if (kvm_cstate_in_guest(vcpu->kvm)) { - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); - vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); - } - vmx->loaded_vmcs = &vmx->vmcs01; if (cpu_need_virtualize_apic_accesses(vcpu)) { @@ -7673,7 +7504,7 @@ int vmx_vcpu_create(struct kvm_vcpu *vcpu) if (vmx_can_use_ipiv(vcpu)) WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], - __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); + __pa(&vmx->vt.pi_desc) | PID_TABLE_ENTRY_VALID); return 0; @@ -7692,7 +7523,7 @@ free_vpid: int vmx_vm_init(struct kvm *kvm) { if (!ple_gap) - kvm->arch.pause_in_guest = true; + kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE); if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { switch (l1tf_mitigation) { @@ -7700,6 +7531,7 @@ int vmx_vm_init(struct kvm *kvm) case L1TF_MITIGATION_FLUSH_NOWARN: /* 'I explicitly don't care' is set */ break; + case L1TF_MITIGATION_AUTO: case L1TF_MITIGATION_FLUSH: case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: @@ -7717,9 +7549,23 @@ int vmx_vm_init(struct kvm *kvm) break; } } + + if (enable_pml) + kvm->arch.cpu_dirty_log_size = PML_LOG_NR_ENTRIES; return 0; } +static inline bool vmx_ignore_guest_pat(struct kvm *kvm) +{ + /* + * Non-coherent DMA devices need the guest to flush CPU caches properly. + * In that case it is not possible to map all guest RAM as WB, so + * always trust guest PAT.
+ */ + return !kvm_arch_has_noncoherent_dma(kvm) && + kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT); +} + u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { /* @@ -7729,13 +7575,8 @@ u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) if (is_mmio) return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - /* - * Force WB and ignore guest PAT if the VM does NOT have a non-coherent - * device attached. Letting the guest control memory types on Intel - * CPUs may result in unexpected behavior, and so KVM's ABI is to trust - * the guest to behave only as a last resort. - */ - if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) + /* Force WB if ignoring guest PAT */ + if (vmx_ignore_guest_pat(vcpu->kvm)) return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT); @@ -7919,18 +7760,6 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) } } - if (kvm_cpu_cap_has(X86_FEATURE_XFD)) - vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, - !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)); - - if (boot_cpu_has(X86_FEATURE_IBPB)) - vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, - !guest_has_pred_cmd_msr(vcpu)); - - if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) - vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, - !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); - set_cr4_guest_host_mask(vmx); vmx_write_encls_bitmap(vcpu, NULL); @@ -7946,6 +7775,9 @@ void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_LC_ENABLED; + /* Recalc MSR interception to account for feature changes. */ + vmx_recalc_msr_intercepts(vcpu); + /* Refresh #PF interception to account for MAXPHYADDR changes. */ vmx_update_exception_bitmap(vcpu); } @@ -7959,7 +7791,7 @@ static __init u64 vmx_get_perf_capabilities(void) return 0; if (boot_cpu_has(X86_FEATURE_PDCM)) - rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { x86_perf_get_lbr(&vmx_lbr_caps); @@ -8508,7 +8340,7 @@ __init int vmx_hardware_setup(void) kvm_enable_efer_bits(EFER_NX); if (boot_cpu_has(X86_FEATURE_MPX)) { - rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + rdmsrq(MSR_IA32_BNDCFGS, host_bndcfgs); WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); } @@ -8597,6 +8429,8 @@ __init int vmx_hardware_setup(void) if (enable_ept) kvm_mmu_set_ept_masks(enable_ept_ad_bits, cpu_has_vmx_ept_execute_only()); + else + vt_x86_ops.get_mt_mask = NULL; /* * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID @@ -8614,9 +8448,6 @@ __init int vmx_hardware_setup(void) if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) enable_pml = 0; - if (!enable_pml) - vt_x86_ops.cpu_dirty_log_size = 0; - if (!cpu_has_vmx_preemption_timer()) enable_preemption_timer = false; @@ -8674,6 +8505,27 @@ __init int vmx_hardware_setup(void) kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); + /* + * On Intel CPUs that lack self-snoop feature, letting the guest control + * memory types may result in unexpected behavior. So always ignore guest + * PAT on those CPUs and map VM as writeback, not allowing userspace to + * disable the quirk. + * + * On certain Intel CPUs (e.g. SPR, ICX), though self-snoop feature is + * supported, UC is slow enough to cause issues with some older guests (e.g. 
an old version of the bochs driver uses ioremap() instead of ioremap_wc() to + * map the video RAM, causing the Wayland desktop to fail to start + * correctly). To avoid breaking those older guests that rely on KVM to force + * memory type to WB, provide KVM_X86_QUIRK_IGNORE_GUEST_PAT to preserve the + * safer (for performance) default behavior. + * + * On top of this, non-coherent DMA devices need the guest to flush CPU + * caches properly. This also requires honoring guest PAT, and is forced + * independent of the quirk in vmx_ignore_guest_pat(). + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; return r; } @@ -8687,26 +8539,21 @@ static void vmx_cleanup_l1d_flush(void) l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; } -static void __vmx_exit(void) +void vmx_exit(void) { allow_smaller_maxphyaddr = false; vmx_cleanup_l1d_flush(); -} -static void __exit vmx_exit(void) -{ - kvm_exit(); - __vmx_exit(); kvm_x86_vendor_exit(); - } -module_exit(vmx_exit); -static int __init vmx_init(void) +int __init vmx_init(void) { int r, cpu; + KVM_SANITY_CHECK_VM_STRUCT_SIZE(kvm_vmx); + if (!kvm_is_vmx_supported()) return -EOPNOTSUPP; @@ -8747,21 +8594,9 @@ static int __init vmx_init(void) if (!enable_ept) allow_smaller_maxphyaddr = true; - /* - * Common KVM initialization _must_ come last, after this, /dev/kvm is - * exposed to userspace! - */ - r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), - THIS_MODULE); - if (r) - goto err_kvm_init; - return 0; -err_kvm_init: - __vmx_exit(); err_l1d_flush: kvm_x86_vendor_exit(); return r; } -module_init(vmx_init);
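
The largest functional change above is the removal of the vmx_possible_passthrough_msrs table and its shadow bitmaps: vmx_enable_intercept_for_msr() and vmx_disable_intercept_for_msr() collapse into vmx_set_intercept_for_msr(), which consults the userspace MSR filter on every update, and vmx_msr_filter_changed() becomes vmx_recalc_msr_intercepts(), which re-derives every intercept from scratch. Below is a standalone sketch of the core decision, with plain booleans standing in for the kernel's bitmap helpers and kvm_msr_allowed(); the filter_allows_* helpers are hypothetical stand-ins, not kernel API.

#include <stdbool.h>
#include <stdio.h>

#define MSR_TYPE_R (1 << 0)
#define MSR_TYPE_W (1 << 1)

struct msr_intercepts {
    bool read;
    bool write;
};

/* Hypothetical stand-ins for kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_*). */
static bool filter_allows_read(unsigned int msr)  { return msr != 0x48; }
static bool filter_allows_write(unsigned int msr) { return true; }

/* Mirrors the shape of the new vmx_set_intercept_for_msr(). */
static void set_intercept_for_msr(struct msr_intercepts *i, unsigned int msr,
                                  int type, bool set)
{
    if (type & MSR_TYPE_R)
        /* Pass through only if KVM *and* the userspace filter agree. */
        i->read = set || !filter_allows_read(msr);
    if (type & MSR_TYPE_W)
        i->write = set || !filter_allows_write(msr);
}

int main(void)
{
    struct msr_intercepts i = { true, true };

    /* KVM wants MSR 0x48 passed through, but the filter denies reads. */
    set_intercept_for_msr(&i, 0x48, MSR_TYPE_R | MSR_TYPE_W, false);
    printf("read intercepted: %d, write intercepted: %d\n", i.read, i.write);
    return 0;
}

The key property: interception can only ever be cleared when both KVM and the userspace filter agree, so a filter change is handled by rerunning the same recalculation rather than by consulting a shadow copy of past decisions.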
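handle_ept_violation() also slims down: the open-coded translation of the EPT exit qualification into a page-fault error code moves to __vmx_handle_ept_violation() in the new common.h, where TDX can share it. The removed hunk documents the mapping; the sketch below reproduces it standalone. Bit positions follow the architectural definitions, and the PROT mask here is an assumption covering the readable/writable/executable permission bits.

#include <stdint.h>
#include <stdio.h>

/* EPT violation exit-qualification bits (values per the SDM). */
#define EPT_VIOLATION_ACC_READ       (1ULL << 0)
#define EPT_VIOLATION_ACC_WRITE      (1ULL << 1)
#define EPT_VIOLATION_ACC_INSTR      (1ULL << 2)
#define EPT_VIOLATION_PROT_MASK      (7ULL << 3)  /* assumed: R/W/X bits */
#define EPT_VIOLATION_GVA_IS_VALID   (1ULL << 7)
#define EPT_VIOLATION_GVA_TRANSLATED (1ULL << 8)

/* Page-fault error-code bits consumed by KVM's MMU. */
#define PFERR_PRESENT_MASK     (1ULL << 0)
#define PFERR_WRITE_MASK       (1ULL << 1)
#define PFERR_USER_MASK        (1ULL << 2)
#define PFERR_FETCH_MASK       (1ULL << 4)
#define PFERR_GUEST_FINAL_MASK (1ULL << 32)
#define PFERR_GUEST_PAGE_MASK  (1ULL << 33)

static uint64_t ept_qual_to_error_code(uint64_t qual)
{
    uint64_t error_code;

    error_code  = (qual & EPT_VIOLATION_ACC_READ)  ? PFERR_USER_MASK : 0;
    error_code |= (qual & EPT_VIOLATION_ACC_WRITE) ? PFERR_WRITE_MASK : 0;
    error_code |= (qual & EPT_VIOLATION_ACC_INSTR) ? PFERR_FETCH_MASK : 0;
    /* Any permission bit set means the EPT entry was present. */
    error_code |= (qual & EPT_VIOLATION_PROT_MASK) ? PFERR_PRESENT_MASK : 0;

    /* Distinguish faults on the final access from faults during the walk. */
    if (qual & EPT_VIOLATION_GVA_IS_VALID)
        error_code |= (qual & EPT_VIOLATION_GVA_TRANSLATED) ?
                      PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;

    return error_code;
}

int main(void)
{
    uint64_t qual = EPT_VIOLATION_ACC_WRITE | EPT_VIOLATION_PROT_MASK |
                    EPT_VIOLATION_GVA_IS_VALID | EPT_VIOLATION_GVA_TRANSLATED;

    printf("error_code = %#llx\n",
           (unsigned long long)ept_qual_to_error_code(qual));
    return 0;
}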
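Posted-interrupt delivery likewise moves to common code: vmx_deliver_posted_interrupt() now calls __vmx_deliver_posted_interrupt() instead of open-coding the PIR update, and the long kvm_vcpu_trigger_posted_interrupt() comment goes with it. The ordering the deleted comment relies on is the interesting part: the vector is recorded in the PIR first, and only the sender that transitions ON from 0 to 1 needs to notify. A minimal model using C11 atomics in place of pi_test_and_set_pir()/pi_test_and_set_on() (layout and names simplified, 64-bit longs assumed; the shared helper's real body lives in the posted-interrupt code, not in this diff):

#include <stdatomic.h>
#include <stdio.h>

struct pi_desc {
    _Atomic unsigned long pir[4];   /* 256 request bits */
    _Atomic unsigned long control;  /* bit 0 = ON: notification outstanding */
};

static void send_notification_ipi(void)
{
    puts("notification IPI sent");
}

static void deliver_posted_interrupt(struct pi_desc *pi, unsigned int vector)
{
    unsigned long bit = 1UL << (vector & 63);

    /* pi_test_and_set_pir(): record the vector before touching ON. */
    if (atomic_fetch_or(&pi->pir[vector >> 6], bit) & bit)
        return;                 /* vector already pending */

    /* pi_test_and_set_on(): only the 0->1 transition needs to notify. */
    if (atomic_fetch_or(&pi->control, 1UL) & 1UL)
        return;                 /* an earlier notification still covers us */

    send_notification_ipi();
}

int main(void)
{
    struct pi_desc pi = {0};

    deliver_posted_interrupt(&pi, 0x30);    /* sends the IPI */
    deliver_posted_interrupt(&pi, 0x31);    /* ON already set: no second IPI */
    return 0;
}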
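The MSR_IA32_DEBUGCTLMSR write path is refactored so nested VMX can reuse it: vmx_get_supported_debugctl() loses its static, and the validity check is hoisted into the new vmx_is_valid_debugctl(). The policy is unchanged in substance: unsupported BTF/LBR bits draw a kvm_pr_unimpl_wrmsr() warning and are dropped by the subsequent masking, while any other unsupported bit makes the write fail. A compact model of that policy follows; the LBR and BTF bit positions match the architecture, while the BUS_LOCK bit is just a stand-in for some supported bit.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEBUGCTLMSR_LBR      (1ULL << 0)
#define DEBUGCTLMSR_BTF      (1ULL << 1)
#define DEBUGCTLMSR_BUS_LOCK (1ULL << 2)    /* stand-in for a supported bit */

static uint64_t supported_debugctl(void)
{
    return DEBUGCTLMSR_BUS_LOCK;            /* assume no LBR/BTF support */
}

/* Mirrors vmx_is_valid_debugctl(): BTF/LBR are warned about, not fatal. */
static bool debugctl_is_valid(uint64_t data)
{
    uint64_t invalid = data & ~supported_debugctl();

    if (invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)) {
        fprintf(stderr, "unimplemented DEBUGCTL bits: %#llx\n",
                (unsigned long long)(invalid & (DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR)));
        invalid &= ~(DEBUGCTLMSR_BTF | DEBUGCTLMSR_LBR);
    }
    return !invalid;
}

static int write_debugctl(uint64_t data)
{
    if (!debugctl_is_valid(data))
        return 1;                           /* reject: caller injects #GP */

    data &= supported_debugctl();           /* silently drop tolerated bits */
    printf("GUEST_IA32_DEBUGCTL <- %#llx\n", (unsigned long long)data);
    return 0;
}

int main(void)
{
    write_debugctl(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BUS_LOCK); /* warns, succeeds */
    write_debugctl(1ULL << 63);                             /* rejected */
    return 0;
}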
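The MMIO Stale Data mitigation changes its trigger condition: instead of keying the VERW off kvm_arch_has_assigned_device(), __vmx_vcpu_run_flags() sets VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO when the cpu_buf_vm_clear key is enabled and kvm_vcpu_can_access_host_mmio() says the vCPU can actually reach host MMIO, and vmx_vcpu_enter_exit() clears buffers only in that case (the L1D flush, when active, already includes a VERW). A sketch of the precedence, with booleans standing in for the static keys and an illustrative flag value:

#include <stdbool.h>
#include <stdio.h>

#define VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO (1u << 2)   /* illustrative value */

static bool l1d_should_flush;          /* vmx_l1d_should_flush static key */
static bool cpu_buf_vm_clear = true;   /* MMIO Stale Data affected, MDS not */

static unsigned int compute_run_flags(bool can_access_host_mmio)
{
    unsigned int flags = 0;

    /* Mirrors the new check in __vmx_vcpu_run_flags(). */
    if (cpu_buf_vm_clear && can_access_host_mmio)
        flags |= VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO;
    return flags;
}

static void pre_entry_mitigations(unsigned int flags)
{
    if (l1d_should_flush)
        puts("L1D flush (its VERW also clears CPU buffers)");
    else if (cpu_buf_vm_clear && (flags & VMX_RUN_CLEAR_CPU_BUFFERS_FOR_MMIO))
        puts("x86_clear_cpu_buffers(): VERW only");
    else
        puts("no buffer clearing needed for this vCPU");
}

int main(void)
{
    pre_entry_mitigations(compute_run_flags(true));   /* VERW */
    pre_entry_mitigations(compute_run_flags(false));  /* nothing */
    return 0;
}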
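Finally, the EPT memory-type policy is centralized in the new vmx_ignore_guest_pat(): guest PAT is honored whenever the VM has non-coherent DMA attached (forced, regardless of quirks) or when userspace has disabled KVM_X86_QUIRK_IGNORE_GUEST_PAT, and on CPUs without self-snoop the quirk is forced on by clearing it from the supported set. A sketch of the resulting vmx_get_mt_mask() decision, using the architectural EPT memtype encodings:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VMX_EPT_MT_EPTE_SHIFT 3
#define VMX_EPT_IPAT_BIT      (1ULL << 6)   /* "ignore guest PAT" */
#define MTRR_TYPE_UNCACHABLE  0
#define MTRR_TYPE_WRBACK      6

struct vm {
    bool has_noncoherent_dma;
    bool quirk_ignore_guest_pat;    /* KVM_X86_QUIRK_IGNORE_GUEST_PAT on */
};

static bool ignore_guest_pat(const struct vm *vm)
{
    /* Non-coherent DMA always forces honoring guest PAT. */
    return !vm->has_noncoherent_dma && vm->quirk_ignore_guest_pat;
}

static uint64_t get_mt_mask(const struct vm *vm, bool is_mmio)
{
    if (is_mmio)
        return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;

    if (ignore_guest_pat(vm))   /* force WB, guest PAT has no effect */
        return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;

    return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;  /* guest PAT applies */
}

int main(void)
{
    struct vm vm = { .has_noncoherent_dma = true, .quirk_ignore_guest_pat = true };

    /* DMA wins: WB without IPAT, so the guest's PAT still takes effect. */
    printf("EPT memtype bits: %#llx\n",
           (unsigned long long)get_mt_mask(&vm, false));
    return 0;
}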