diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/lapic.c | 29 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/mmu.c | 27 | ||||
-rw-r--r-- | arch/x86/kvm/svm/nested.c | 6 | ||||
-rw-r--r-- | arch/x86/kvm/svm/sev.c | 54 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 30 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 12 |
6 files changed, 94 insertions, 64 deletions
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2098dc689088..95c6beb8ce27 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2629,19 +2629,26 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic->apicv_active) { - /* irr_pending is always true when apicv is activated. */ - apic->irr_pending = true; + /* + * When APICv is enabled, KVM must always search the IRR for a pending + * IRQ, as other vCPUs and devices can set IRR bits even if the vCPU + * isn't running. If APICv is disabled, KVM _should_ search the IRR + * for a pending IRQ. But KVM currently doesn't ensure *all* hardware, + * e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching + * the IRR at this time could race with IRQ delivery from hardware that + * still sees APICv as being enabled. + * + * FIXME: Ensure other vCPUs and devices observe the change in APICv + * state prior to updating KVM's metadata caches, so that KVM + * can safely search the IRR and set irr_pending accordingly. + */ + apic->irr_pending = true; + + if (apic->apicv_active) apic->isr_count = 1; - } else { - /* - * Don't clear irr_pending, searching the IRR can race with - * updates from the CPU as APICv is still active from hardware's - * perspective. The flag will be cleared as appropriate when - * KVM injects the interrupt. - */ + else apic->isr_count = count_vectors(apic->regs + APIC_ISR); - } + apic->highest_isr_cache = -1; } diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a9a23e058555..8e853a5fc867 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1556,6 +1556,17 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { bool flush = false; + /* + * To prevent races with vCPUs faulting in a gfn using stale data, + * zapping a gfn range must be protected by mmu_invalidate_in_progress + * (and mmu_invalidate_seq). The only exception is memslot deletion; + * in that case, SRCU synchronization ensures that SPTEs are zapped + * after all vCPUs have unlocked SRCU, guaranteeing that vCPUs see the + * invalid slot. + */ + lockdep_assert_once(kvm->mmu_invalidate_in_progress || + lockdep_is_held(&kvm->slots_lock)); + if (kvm_memslots_have_rmaps(kvm)) flush = __kvm_rmap_zap_gfn_range(kvm, range->slot, range->start, range->end, @@ -1884,14 +1895,10 @@ static bool sp_has_gptes(struct kvm_mmu_page *sp) if (is_obsolete_sp((_kvm), (_sp))) { \ } else -#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ +#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ for_each_valid_sp(_kvm, _sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ - if ((_sp)->gfn != (_gfn)) {} else - -#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \ - for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ - if (!sp_has_gptes(_sp)) {} else + if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -7063,15 +7070,15 @@ static void kvm_mmu_zap_memslot_pages_and_flush(struct kvm *kvm, /* * Since accounting information is stored in struct kvm_arch_memory_slot, - * shadow pages deletion (e.g. unaccount_shadowed()) requires that all - * gfns with a shadow page have a corresponding memslot. Do so before - * the memslot goes away. + * all MMU pages that are shadowing guest PTEs must be zapped before the + * memslot is deleted, as freeing such pages after the memslot is freed + * will result in use-after-free, e.g. in unaccount_shadowed(). */ for (i = 0; i < slot->npages; i++) { struct kvm_mmu_page *sp; gfn_t gfn = slot->base_gfn + i; - for_each_gfn_valid_sp(kvm, sp, gfn) + for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index d5314cb7dff4..cf84103ce38b 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -63,8 +63,12 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; + /* + * Note, nCR3 is "assumed" to be 32-byte aligned, i.e. the CPU ignores + * nCR3[4:0] when loading PDPTEs from memory. + */ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, - offset_in_page(cr3) + index * 8, 8); + (cr3 & GENMASK(11, 5)) + index * 8, 8); if (ret) return 0; return pdpte; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0b851ef937f2..92d4711fd1e4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -450,8 +450,11 @@ static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp, goto e_free; /* This needs to happen after SEV/SNP firmware initialization. */ - if (vm_type == KVM_X86_SNP_VM && snp_guest_req_init(kvm)) - goto e_free; + if (vm_type == KVM_X86_SNP_VM) { + ret = snp_guest_req_init(kvm); + if (ret) + goto e_free; + } INIT_LIST_HEAD(&sev->regions_list); INIT_LIST_HEAD(&sev->mirror_vms); @@ -530,17 +533,12 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) static int __sev_issue_cmd(int fd, int id, void *data, int *error) { - struct fd f; - int ret; + CLASS(fd, f)(fd); - f = fdget(fd); - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - ret = sev_issue_cmd_external_user(fd_file(f), id, data, error); - - fdput(f); - return ret; + return sev_issue_cmd_external_user(fd_file(f), id, data, error); } static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) @@ -2073,23 +2071,21 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) { struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; struct kvm_sev_info *src_sev, *cg_cleanup_sev; - struct fd f = fdget(source_fd); + CLASS(fd, f)(source_fd); struct kvm *source_kvm; bool charged = false; int ret; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - if (!file_is_kvm(fd_file(f))) { - ret = -EBADF; - goto out_fput; - } + if (!file_is_kvm(fd_file(f))) + return -EBADF; source_kvm = fd_file(f)->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) - goto out_fput; + return ret; if (kvm->arch.vm_type != source_kvm->arch.vm_type || sev_guest(kvm) || !sev_guest(source_kvm)) { @@ -2136,8 +2132,6 @@ out_dst_cgroup: cg_cleanup_sev->misc_cg = NULL; out_unlock: sev_unlock_two_vms(kvm, source_kvm); -out_fput: - fdput(f); return ret; } @@ -2212,10 +2206,6 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (sev->snp_context) return -EINVAL; - sev->snp_context = snp_context_create(kvm, argp); - if (!sev->snp_context) - return -ENOTTY; - if (params.flags) return -EINVAL; @@ -2230,6 +2220,10 @@ static int snp_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) if (params.policy & SNP_POLICY_MASK_SINGLE_SOCKET) return -EINVAL; + sev->snp_context = snp_context_create(kvm, argp); + if (!sev->snp_context) + return -ENOTTY; + start.gctx_paddr = __psp_pa(sev->snp_context); start.policy = params.policy; memcpy(start.gosvw, params.gosvw, sizeof(params.gosvw)); @@ -2798,23 +2792,21 @@ failed: int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) { - struct fd f = fdget(source_fd); + CLASS(fd, f)(source_fd); struct kvm *source_kvm; struct kvm_sev_info *source_sev, *mirror_sev; int ret; - if (!fd_file(f)) + if (fd_empty(f)) return -EBADF; - if (!file_is_kvm(fd_file(f))) { - ret = -EBADF; - goto e_source_fput; - } + if (!file_is_kvm(fd_file(f))) + return -EBADF; source_kvm = fd_file(f)->private_data; ret = sev_lock_two_vms(kvm, source_kvm); if (ret) - goto e_source_fput; + return ret; /* * Mirrors of mirrors should work, but let's not get silly. Also @@ -2857,8 +2849,6 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd) e_unlock: sev_unlock_two_vms(kvm, source_kvm); -e_source_fput: - fdput(f); return ret; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a8e7bc04d9bf..931a7361c30f 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1197,11 +1197,14 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); /* - * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings - * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a - * full TLB flush from the guest's perspective. This is required even - * if VPID is disabled in the host as KVM may need to synchronize the - * MMU in response to the guest TLB flush. + * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the + * same VPID as the host, and so architecturally, linear and combined + * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM + * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, + * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This + * is required if VPID is disabled in KVM, as a TLB flush (there are no + * VPIDs) still occurs from L1's perspective, and KVM may need to + * synchronize the MMU in response to the guest TLB flush. * * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. * EPT is a special snowflake, as guest-physical mappings aren't @@ -2315,6 +2318,17 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); + /* + * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the + * same VPID as the host. Emulate this behavior by using vpid01 for L2 + * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter + * and VM-Exit are architecturally required to flush VPID=0, but *only* + * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the + * required flushes), but doing so would cause KVM to over-flush. E.g. + * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, + * and then runs L2 X again, then KVM can and should retain TLB entries + * for VPID12=1. + */ if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); @@ -5950,6 +5964,12 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + /* + * Always flush the effective vpid02, i.e. never flush the current VPID + * and never explicitly flush vpid01. INVVPID targets a VPID, not a + * VMCS, and so whether or not the current vmcs12 has VPID enabled is + * irrelevant (and there may not be a loaded vmcs12). + */ vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1a4438358c5e..d28618e9277e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -217,9 +217,11 @@ module_param(ple_window_shrink, uint, 0444); static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); -/* Default is SYSTEM mode, 1 for host-guest mode */ +/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ int __read_mostly pt_mode = PT_MODE_SYSTEM; +#ifdef CONFIG_BROKEN module_param(pt_mode, int, S_IRUGO); +#endif struct x86_pmu_lbr __ro_after_init vmx_lbr_caps; @@ -3216,7 +3218,7 @@ void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) return nested_get_vpid02(vcpu); return to_vmx(vcpu)->vpid; } @@ -4888,9 +4890,6 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); - vmx_segment_cache_clear(vmx); - kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); - seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); @@ -4917,6 +4916,9 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_writel(GUEST_IDTR_BASE, 0); vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); + vmx_segment_cache_clear(vmx); + kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); |