diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 22 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/svm/nested.c | 7 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/svm/svm.c | 44 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 10 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 42 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 27 |
10 files changed, 117 insertions, 49 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3fd6eec202d7..7456f9ad424b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -371,7 +371,7 @@ void kvm_set_cpu_caps(void) F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | - F(SERIALIZE) + F(SERIALIZE) | F(TSXLDTRK) ); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5299ef5ff18d..2f6510de6b0c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2505,9 +2505,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); val = GET_SMSTATE(u32, smstate, 0x7fcc); - ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + + if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1)) + return X86EMUL_UNHANDLEABLE; + val = GET_SMSTATE(u32, smstate, 0x7fc8); - ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1)) + return X86EMUL_UNHANDLEABLE; selector = GET_SMSTATE(u32, smstate, 0x7fc4); set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64)); @@ -2560,16 +2565,23 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; val = GET_SMSTATE(u32, smstate, 0x7f68); - ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + + if (ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1)) + return X86EMUL_UNHANDLEABLE; + val = GET_SMSTATE(u32, smstate, 0x7f60); - ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + if (ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1)) + return X86EMUL_UNHANDLEABLE; cr0 = GET_SMSTATE(u64, smstate, 0x7f58); cr3 = GET_SMSTATE(u64, smstate, 0x7f50); cr4 = GET_SMSTATE(u64, smstate, 0x7f48); ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00)); val = GET_SMSTATE(u64, smstate, 0x7ed0); - ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA); + + if (ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA)) + return X86EMUL_UNHANDLEABLE; selector = GET_SMSTATE(u32, smstate, 0x7e90); rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 43fdb0c12a5d..71aa3da2a0b7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2469,7 +2469,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, } if (sp->unsync_children) - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); __clear_sp_write_flooding_count(sp); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index fb68467e6049..e90bc436f584 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -586,7 +586,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE; /* Give the current vmcb to the guest */ - svm_set_gif(svm, false); nested_vmcb->save.es = vmcb->save.es; nested_vmcb->save.cs = vmcb->save.cs; @@ -632,6 +631,9 @@ int nested_svm_vmexit(struct vcpu_svm *svm) /* Restore the original control entries */ copy_vmcb_control_area(&vmcb->control, &hsave->control); + /* On vmexit the GIF is set to false */ + svm_set_gif(svm, false); + svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset; @@ -1132,6 +1134,9 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, load_nested_vmcb_control(svm, &ctl); nested_prepare_vmcb_control(svm); + if (!nested_svm_vmrun_msrpm(svm)) + return -EINVAL; + out_set_gif: svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); return 0; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 402dc4234e39..3c9a45efdd4d 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -384,7 +384,8 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) uint8_t *page_virtual; unsigned long i; - if (npages == 0 || pages == NULL) + if (this_cpu_has(X86_FEATURE_SME_COHERENT) || npages == 0 || + pages == NULL) return; for (i = 0; i < npages; i++) { @@ -1106,6 +1107,7 @@ void sev_vm_destroy(struct kvm *kvm) list_for_each_safe(pos, q, head) { __unregister_enc_region_locked(kvm, list_entry(pos, struct enc_region, list)); + cond_resched(); } } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0194336b64a4..91ea74ae71b8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2183,6 +2183,12 @@ static int iret_interception(struct vcpu_svm *svm) return 1; } +static int invd_interception(struct vcpu_svm *svm) +{ + /* Treat an INVD instruction as a NOP and just skip it. */ + return kvm_skip_emulated_instruction(&svm->vcpu); +} + static int invlpg_interception(struct vcpu_svm *svm) { if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) @@ -2774,7 +2780,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_RDPMC] = rdpmc_interception, [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_IRET] = iret_interception, - [SVM_EXIT_INVD] = emulate_on_interception, + [SVM_EXIT_INVD] = invd_interception, [SVM_EXIT_PAUSE] = pause_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, @@ -2938,8 +2944,6 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (npt_enabled) vcpu->arch.cr3 = svm->vmcb->save.cr3; - svm_complete_interrupts(svm); - if (is_guest_mode(vcpu)) { int vmexit; @@ -3504,7 +3508,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ - exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(&svm->vcpu); @@ -3518,6 +3521,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) } svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + vmcb_mark_all_clean(svm->vmcb); /* if exit due to PF check for async PF */ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) @@ -3537,7 +3541,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(svm); - vmcb_mark_all_clean(svm->vmcb); + svm_complete_interrupts(svm); + exit_fastpath = svm_exit_handlers_fastpath(vcpu); return exit_fastpath; } @@ -3900,21 +3905,28 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb *nested_vmcb; struct kvm_host_map map; - u64 guest; - u64 vmcb; int ret = 0; - guest = GET_SMSTATE(u64, smstate, 0x7ed8); - vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); + if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) { + u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0); + u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8); + u64 vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); - if (guest) { - if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL) - return 1; - nested_vmcb = map.hva; - ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb); - kvm_vcpu_unmap(&svm->vcpu, &map, true); + if (guest) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + return 1; + + if (!(saved_efer & EFER_SVME)) + return 1; + + if (kvm_vcpu_map(&svm->vcpu, + gpa_to_gfn(vmcb), &map) == -EINVAL) + return 1; + + ret = enter_svm_guest_mode(svm, vmcb, map.hva); + kvm_vcpu_unmap(&svm->vcpu, &map, true); + } } return ret; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 23b58c28a1c9..1bb6b31eb646 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4404,6 +4404,14 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); + /* + * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between + * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are + * up-to-date before switching to L1. + */ + if (enable_ept && is_pae_paging(vcpu)) + vmx_ept_load_pdptrs(vcpu); + leave_guest_mode(vcpu); if (nested_cpu_has_preemption_timer(vmcs12)) @@ -4668,7 +4676,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; vmx->nested.msrs.exit_ctls_high &= - ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; } } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 819c185adf09..96979c09ebd1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -129,6 +129,9 @@ static bool __read_mostly enable_preemption_timer = 1; module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #endif +extern bool __read_mostly allow_smaller_maxphyaddr; +module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); + #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ @@ -791,6 +794,18 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu) */ if (is_guest_mode(vcpu)) eb |= get_vmcs12(vcpu)->exception_bitmap; + else { + /* + * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched + * between guest and host. In that case we only care about present + * faults. For vmcs02, however, PFEC_MASK and PFEC_MATCH are set in + * prepare_vmcs02_rare. + */ + bool selective_pf_trap = enable_ept && (eb & (1u << PF_VECTOR)); + int mask = selective_pf_trap ? PFERR_PRESENT_MASK : 0; + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, mask); + } vmcs_write32(EXCEPTION_BITMAP, eb); } @@ -2971,7 +2986,7 @@ static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) vpid_sync_context(to_vmx(vcpu)->vpid); } -static void ept_load_pdptrs(struct kvm_vcpu *vcpu) +void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -3114,7 +3129,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, guest_cr3 = vcpu->arch.cr3; else /* vmcs01.GUEST_CR3 is already up-to-date. */ update_guest_cr3 = false; - ept_load_pdptrs(vcpu); + vmx_ept_load_pdptrs(vcpu); } else { guest_cr3 = pgd; } @@ -4352,16 +4367,6 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } - - /* - * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched - * between guest and host. In that case we only care about present - * faults. - */ - if (enable_ept) { - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK); - } } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -4803,6 +4808,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) * EPT will cause page fault only if we need to * detect illegal GPAs. */ + WARN_ON_ONCE(!allow_smaller_maxphyaddr); kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); return 1; } else @@ -5331,7 +5337,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * would also use advanced VM-exit information for EPT violations to * reconstruct the page fault error code. */ - if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa))) + if (unlikely(allow_smaller_maxphyaddr && kvm_mmu_is_illegal_gpa(vcpu, gpa))) return kvm_emulate_instruction(vcpu, 0); return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); @@ -6054,6 +6060,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && exit_reason != EXIT_REASON_PML_FULL && + exit_reason != EXIT_REASON_APIC_ACCESS && exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; @@ -8304,11 +8311,12 @@ static int __init vmx_init(void) vmx_check_vmcs12_offsets(); /* - * Intel processors don't have problems with - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable - * it for VMX by default + * Shadow paging doesn't have a (further) performance penalty + * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it + * by default */ - allow_smaller_maxphyaddr = true; + if (!enable_ept) + allow_smaller_maxphyaddr = true; return 0; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 26175a4759fa..a0e47720f60c 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -356,6 +356,7 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); int vmx_find_msr_index(struct vmx_msrs *m, u32 msr); int vmx_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e); +void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 @@ -551,7 +552,10 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) { - return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; + if (!enable_ept) + return true; + + return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; } void dump_vmcs(void); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d39d6cf1d473..ce856e0ece84 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -188,7 +188,7 @@ static struct kvm_shared_msrs __percpu *shared_msrs; u64 __read_mostly host_efer; EXPORT_SYMBOL_GPL(host_efer); -bool __read_mostly allow_smaller_maxphyaddr; +bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); static u64 __read_mostly host_xss; @@ -976,6 +976,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_SMEP; + unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE; if (kvm_valid_cr4(vcpu, cr4)) return 1; @@ -1003,7 +1004,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (kvm_x86_ops.set_cr4(vcpu, cr4)) return 1; - if (((cr4 ^ old_cr4) & pdptr_bits) || + if (((cr4 ^ old_cr4) & mmu_role_bits) || (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); @@ -2731,7 +2732,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 1; if (!lapic_in_kernel(vcpu)) - return 1; + return data ? 1 : 0; vcpu->arch.apf.msr_en_val = data; @@ -3221,9 +3222,22 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_POWER_CTL: msr_info->data = vcpu->arch.msr_ia32_power_ctl; break; - case MSR_IA32_TSC: - msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset; + case MSR_IA32_TSC: { + /* + * Intel SDM states that MSR_IA32_TSC read adds the TSC offset + * even when not intercepted. AMD manual doesn't explicitly + * state this but appears to behave the same. + * + * On userspace reads and writes, however, we unconditionally + * operate L1's TSC value to ensure backwards-compatible + * behavior for migration. + */ + u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset : + vcpu->arch.tsc_offset; + + msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset; break; + } case MSR_MTRRcap: case 0x200 ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -3578,6 +3592,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SMALLER_MAXPHYADDR: r = (int) allow_smaller_maxphyaddr; break; + case KVM_CAP_STEAL_TIME: + r = sched_info_on(); + break; default: break; } |