diff options
Diffstat (limited to 'arch/x86/kvm')
37 files changed, 2301 insertions, 1740 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ac69894eab88..619186138176 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -129,4 +129,7 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows auditing of KVM MMU events at runtime. +config KVM_EXTERNAL_WRITE_TRACKING + bool + endif # VIRTUALIZATION diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 751aa85a3001..e19dabf1848b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -53,9 +53,16 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) return ret; } +/* + * This one is tied to SSB in the user API, and not + * visible in /proc/cpuinfo. + */ +#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */ + #define F feature_bit #define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0) + static inline struct kvm_cpuid_entry2 *cpuid_entry2_find( struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index) { @@ -92,11 +99,45 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) return 0; } -void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) +static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; + u32 function; + struct kvm_cpuid_entry2 *entry; + + vcpu->arch.kvm_cpuid_base = 0; + + for_each_possible_hypervisor_cpuid_base(function) { + entry = kvm_find_cpuid_entry(vcpu, function, 0); + + if (entry) { + u32 signature[3]; + + signature[0] = entry->ebx; + signature[1] = entry->ecx; + signature[2] = entry->edx; + + BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE)); + if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) { + vcpu->arch.kvm_cpuid_base = function; + break; + } + } + } +} + +struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +{ + u32 base = vcpu->arch.kvm_cpuid_base; + + if (!base) + return NULL; + + return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0); +} - best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); +void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best = kvm_find_kvm_cpuid_features(vcpu); /* * save the feature bitmap to avoid cpuid lookup for every PV @@ -135,7 +176,7 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); + best = kvm_find_kvm_cpuid_features(vcpu); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); @@ -232,6 +273,26 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) return rsvd_bits(cpuid_maxphyaddr(vcpu), 63); } +static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, + int nent) +{ + int r; + + r = kvm_check_cpuid(e2, nent); + if (r) + return r; + + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = nent; + + kvm_update_kvm_cpuid_base(vcpu); + kvm_update_cpuid_runtime(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); + + return 0; +} + /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, @@ -268,18 +329,9 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, e2[i].padding[2] = 0; } - r = kvm_check_cpuid(e2, cpuid->nent); - if (r) { + r = kvm_set_cpuid(vcpu, e2, cpuid->nent); + if (r) kvfree(e2); - goto out_free_cpuid; - } - - kvfree(vcpu->arch.cpuid_entries); - vcpu->arch.cpuid_entries = e2; - vcpu->arch.cpuid_nent = cpuid->nent; - - kvm_update_cpuid_runtime(vcpu); - kvm_vcpu_after_set_cpuid(vcpu); out_free_cpuid: kvfree(e); @@ -303,20 +355,11 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, return PTR_ERR(e2); } - r = kvm_check_cpuid(e2, cpuid->nent); - if (r) { + r = kvm_set_cpuid(vcpu, e2, cpuid->nent); + if (r) kvfree(e2); - return r; - } - - kvfree(vcpu->arch.cpuid_entries); - vcpu->arch.cpuid_entries = e2; - vcpu->arch.cpuid_nent = cpuid->nent; - - kvm_update_cpuid_runtime(vcpu); - kvm_vcpu_after_set_cpuid(vcpu); - return 0; + return r; } int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, @@ -500,7 +543,8 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_8000_0008_EBX, F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) + F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) | + __feature_bit(KVM_X86_FEATURE_PSFD) ); /* @@ -863,8 +907,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case KVM_CPUID_SIGNATURE: { - static const char signature[12] = "KVMKVMKVM\0\0"; - const u32 *sigptr = (const u32 *)signature; + const u32 *sigptr = (const u32 *)KVM_SIGNATURE; entry->eax = KVM_CPUID_FEATURES; entry->ebx = sigptr[0]; entry->ecx = sigptr[1]; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9a144ca8e146..28b1a4e57827 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4222,6 +4222,11 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx)) return X86EMUL_CONTINUE; + /* + * If CR4.PCE is set, the SDM requires CPL=0 or CR0.PE=0. The CR0.PE + * check however is unnecessary because CPL is always 0 outside + * protected mode. + */ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || ctxt->ops->check_pmc(ctxt, rcx)) return emulate_gp(ctxt, 0); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index d5124b520f76..4a555f32885a 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -112,7 +112,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, if (!!auto_eoi_old == !!auto_eoi_new) return; - mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + down_write(&vcpu->kvm->arch.apicv_update_lock); if (auto_eoi_new) hv->synic_auto_eoi_used++; @@ -123,7 +123,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, !hv->synic_auto_eoi_used, APICV_INHIBIT_REASON_HYPERV); - mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); + up_write(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, @@ -1472,7 +1472,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { hv_vcpu->hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0)) + if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) return 1; break; } @@ -1490,7 +1490,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, + if (kvm_lapic_set_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED, sizeof(struct hv_vp_assist_page))) return 1; @@ -1754,7 +1754,6 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool int i; gpa_t gpa; struct kvm *kvm = vcpu->kvm; - struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct hv_tlb_flush_ex flush_ex; struct hv_tlb_flush flush; u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS]; @@ -1836,18 +1835,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool } } - cpumask_clear(&hv_vcpu->tlb_flush); - - vcpu_mask = all_cpus ? NULL : - sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, - vp_bitmap, vcpu_bitmap); - /* * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, - NULL, vcpu_mask, &hv_vcpu->tlb_flush); + if (all_cpus) { + kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST); + } else { + vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask, + vp_bitmap, vcpu_bitmap); + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST, + vcpu_mask); + } ret_success: /* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */ diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 8c065da73f8e..816a82515dcd 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -96,7 +96,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index bbd4a5d18b5d..e66e620c3bed 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -39,13 +39,13 @@ struct kvm_vcpu; struct dest_map { /* vcpu bitmap where IRQ has been sent */ - DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1); + DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS); /* * Vector sent to a given vcpu, only valid when * the vcpu's bit in map is set */ - u8 vectors[KVM_MAX_VCPU_ID + 1]; + u8 vectors[KVM_MAX_VCPU_IDS]; }; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 76fb00921203..759952dd1222 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2321,13 +2321,14 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_apicv); void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; + u64 msr_val; int i; if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; + msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + msr_val |= MSR_IA32_APICBASE_BSP; + kvm_lapic_set_base(vcpu, msr_val); } if (!apic) @@ -2336,11 +2337,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - if (!init_event) { - apic->base_address = APIC_DEFAULT_PHYS_BASE; - + /* The xAPIC ID is set at RESET even if the APIC was already enabled. */ + if (!init_event) kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); - } kvm_apic_set_version(apic->vcpu); for (i = 0; i < KVM_APIC_LVT_NUM; i++) @@ -2481,6 +2480,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) lapic_timer_advance_dynamic = false; } + /* + * Stuff the APIC ENABLE bit in lieu of temporarily incrementing + * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset(). + */ + vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -2852,25 +2856,30 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) +int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) { u64 addr = data & ~KVM_MSR_ENABLED; struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data; unsigned long new_len; + int ret; if (!IS_ALIGNED(addr, 4)) return 1; - vcpu->arch.pv_eoi.msr_val = data; - if (!pv_eoi_enabled(vcpu)) - return 0; + if (data & KVM_MSR_ENABLED) { + if (addr == ghc->gpa && len <= ghc->len) + new_len = ghc->len; + else + new_len = len; - if (addr == ghc->gpa && len <= ghc->len) - new_len = ghc->len; - else - new_len = len; + ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); + if (ret) + return ret; + } - return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); + vcpu->arch.pv_eoi.msr_val = data; + + return 0; } int kvm_apic_accept_events(struct kvm_vcpu *vcpu) @@ -2942,5 +2951,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu) void kvm_lapic_exit(void) { static_key_deferred_flush(&apic_hw_disabled); + WARN_ON(static_branch_unlikely(&apic_hw_disabled.key)); static_key_deferred_flush(&apic_sw_disabled); + WARN_ON(static_branch_unlikely(&apic_sw_disabled.key)); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index d7c25d0c1354..2b44e533fc8d 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -127,7 +127,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); +int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_exit(void); #define VEC_POS(v) ((v) & (32 - 1)) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e9688a9f7b57..9ae6168d381e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -44,9 +44,8 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 -#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | \ - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE | \ - X86_CR4_LA57) +#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \ + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE) #define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP) @@ -80,6 +79,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu); static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { @@ -114,17 +114,91 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) vcpu->arch.mmu->shadow_root_level); } -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault); +struct kvm_page_fault { + /* arguments to kvm_mmu_do_page_fault. */ + const gpa_t addr; + const u32 error_code; + const bool prefetch; + + /* Derived from error_code. */ + const bool exec; + const bool write; + const bool present; + const bool rsvd; + const bool user; + + /* Derived from mmu and global state. */ + const bool is_tdp; + const bool nx_huge_page_workaround_enabled; + + /* + * Whether a >4KB mapping can be created or is forbidden due to NX + * hugepages. + */ + bool huge_page_disallowed; + + /* + * Maximum page size that can be created for this fault; input to + * FNAME(fetch), __direct_map and kvm_tdp_mmu_map. + */ + u8 max_level; + + /* + * Page size that can be created based on the max_level and the + * page size used by the host mapping. + */ + u8 req_level; + + /* + * Page size that will be created based on the req_level and + * huge_page_disallowed. + */ + u8 goal_level; + + /* Shifted addr, or result of guest page table walk if addr is a gva. */ + gfn_t gfn; + + /* The memslot containing gfn. May be NULL. */ + struct kvm_memory_slot *slot; + + /* Outputs of kvm_faultin_pfn. */ + kvm_pfn_t pfn; + hva_t hva; + bool map_writable; +}; + +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); + +extern int nx_huge_pages; +static inline bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); +} static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 err, bool prefault) + u32 err, bool prefetch) { + struct kvm_page_fault fault = { + .addr = cr2_or_gpa, + .error_code = err, + .exec = err & PFERR_FETCH_MASK, + .write = err & PFERR_WRITE_MASK, + .present = err & PFERR_PRESENT_MASK, + .rsvd = err & PFERR_RSVD_MASK, + .user = err & PFERR_USER_MASK, + .prefetch = prefetch, + .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault), + .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(), + + .max_level = KVM_MAX_HUGEPAGE_LEVEL, + .req_level = PG_LEVEL_4K, + .goal_level = PG_LEVEL_4K, + }; #ifdef CONFIG_RETPOLINE - if (likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault)) - return kvm_tdp_page_fault(vcpu, cr2_or_gpa, err, prefault); + if (fault.is_tdp) + return kvm_tdp_page_fault(vcpu, &fault); #endif - return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault); + return vcpu->arch.mmu->page_fault(vcpu, &fault); } /* @@ -230,14 +304,26 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); -static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) +static inline bool kvm_shadow_root_allocated(struct kvm *kvm) { /* - * Read memslot_have_rmaps before rmap pointers. Hence, threads reading - * memslots_have_rmaps in any lock context are guaranteed to see the - * pointers. Pairs with smp_store_release in alloc_all_memslots_rmaps. + * Read shadow_root_allocated before related pointers. Hence, threads + * reading shadow_root_allocated in any lock context are guaranteed to + * see the pointers. Pairs with smp_store_release in + * mmu_first_shadow_root_alloc. */ - return smp_load_acquire(&kvm->arch.memslots_have_rmaps); + return smp_load_acquire(&kvm->arch.shadow_root_allocated); +} + +#ifdef CONFIG_X86_64 +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } +#else +static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } +#endif + +static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) +{ + return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm); } static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1a64ba5b9437..33794379949e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -58,6 +58,7 @@ extern bool itlb_multihit_kvm_mitigation; int __read_mostly nx_huge_pages = -1; +static uint __read_mostly nx_huge_pages_recovery_period_ms; #ifdef CONFIG_PREEMPT_RT /* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; @@ -66,23 +67,26 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60; #endif static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); -static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); +static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, .get = param_get_bool, }; -static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { - .set = set_nx_huge_pages_recovery_ratio, +static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { + .set = set_nx_huge_pages_recovery_param, .get = param_get_uint, }; module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); __MODULE_PARM_TYPE(nx_huge_pages, "bool"); -module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, +module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops, &nx_huge_pages_recovery_ratio, 0644); __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); +module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops, + &nx_huge_pages_recovery_period_ms, 0644); +__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint"); static bool __read_mostly force_flush_and_sync_on_reuse; module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); @@ -1071,20 +1075,6 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) return kvm_mmu_memory_cache_nr_free_objects(mc); } -static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) -{ - struct kvm_memory_slot *slot; - struct kvm_mmu_page *sp; - struct kvm_rmap_head *rmap_head; - - sp = sptep_to_sp(spte); - kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); - return pte_list_add(vcpu, spte, rmap_head); -} - - static void rmap_remove(struct kvm *kvm, u64 *spte) { struct kvm_memslots *slots; @@ -1097,9 +1087,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); /* - * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the - * context of a vCPU so have to determine which memslots to use based - * on context information in sp->role. + * Unlike rmap_add, rmap_remove does not run in the context of a vCPU + * so we have to determine which memslots to use based on context + * information in sp->role. */ slots = kvm_memslots_for_spte_role(kvm, sp->role); @@ -1639,19 +1629,23 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + u64 *spte, gfn_t gfn) { - struct kvm_memory_slot *slot; - struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; + struct kvm_rmap_head *rmap_head; + int rmap_count; sp = sptep_to_sp(spte); - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); + rmap_count = pte_list_add(vcpu, spte, rmap_head); - kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); - kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, - KVM_PAGES_PER_HPAGE(sp->role.level)); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) { + kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); + kvm_flush_remote_tlbs_with_address( + vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); + } } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1795,7 +1789,7 @@ static void mark_unsync(u64 *spte) static int nonpaging_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - return 0; + return -1; } #define KVM_PAGE_ARRAY_NR 16 @@ -1909,12 +1903,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) { + int ret = vcpu->arch.mmu->sync_page(vcpu, sp); + + if (ret < 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } - return true; + return !!ret; } static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, @@ -1931,17 +1927,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, return true; } -static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, - struct list_head *invalid_list, - bool remote_flush, bool local_flush) -{ - if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush)) - return; - - if (local_flush) - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); -} - #ifdef CONFIG_KVM_MMU_AUDIT #include "mmu_audit.c" #else @@ -2044,7 +2029,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, protected |= rmap_write_protect(vcpu, sp->gfn); if (protected) { - kvm_flush_remote_tlbs(vcpu->kvm); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); flush = false; } @@ -2054,7 +2039,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, mmu_pages_clear_parents(&parents); } if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); if (!can_yield) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); return -EINTR; @@ -2065,7 +2050,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu, } } - kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); return 0; } @@ -2149,7 +2134,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, break; WARN_ON(!list_empty(&invalid_list)); - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + kvm_flush_remote_tlbs(vcpu->kvm); } __clear_sp_write_flooding_count(sp); @@ -2229,7 +2214,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, u64 spte) { - if (is_last_spte(spte, iterator->level)) { + if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) { iterator->level = 0; return; } @@ -2591,7 +2576,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must * be write-protected. */ -int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) +int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + gfn_t gfn, bool can_unsync, bool prefetch) { struct kvm_mmu_page *sp; bool locked = false; @@ -2601,7 +2587,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) * track machinery is used to write-protect upper-level shadow pages, * i.e. this guards the role.level == 4K assertion below! */ - if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(vcpu, slot, gfn, KVM_PAGE_TRACK_WRITE)) return -EPERM; /* @@ -2617,6 +2603,9 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) if (sp->unsync) continue; + if (prefetch) + return -EEXIST; + /* * TDP MMU page faults require an additional spinlock as they * run with mmu_lock held for read, not write, and the unsync @@ -2680,48 +2669,30 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) * (sp->unsync = true) * * The write barrier below ensures that 1.1 happens before 1.2 and thus - * the situation in 2.4 does not arise. The implicit barrier in 2.2 - * pairs with this write barrier. + * the situation in 2.4 does not arise. It pairs with the read barrier + * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3. */ smp_wmb(); return 0; } -static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, bool speculative, - bool can_unsync, bool host_writable) -{ - u64 spte; - struct kvm_mmu_page *sp; - int ret; - - sp = sptep_to_sp(sptep); - - ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative, - can_unsync, host_writable, sp_ad_disabled(sp), &spte); - - if (spte & PT_WRITABLE_MASK) - kvm_vcpu_mark_page_dirty(vcpu, gfn); - - if (*sptep == spte) - ret |= SET_SPTE_SPURIOUS; - else if (mmu_spte_update(sptep, spte)) - ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; - return ret; -} - -static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned int pte_access, bool write_fault, int level, - gfn_t gfn, kvm_pfn_t pfn, bool speculative, - bool host_writable) +static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + u64 *sptep, unsigned int pte_access, gfn_t gfn, + kvm_pfn_t pfn, struct kvm_page_fault *fault) { + struct kvm_mmu_page *sp = sptep_to_sp(sptep); + int level = sp->role.level; int was_rmapped = 0; - int rmap_count; - int set_spte_ret; int ret = RET_PF_FIXED; bool flush = false; + bool wrprot; + u64 spte; + + /* Prefetching always gets a writable pfn. */ + bool host_writable = !fault || fault->map_writable; + bool prefetch = !fault || fault->prefetch; + bool write_fault = fault && fault->write; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2752,52 +2723,36 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, was_rmapped = 1; } - set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn, - speculative, true, host_writable); - if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { + wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, + true, host_writable, &spte); + + if (*sptep == spte) { + ret = RET_PF_SPURIOUS; + } else { + trace_kvm_mmu_set_spte(level, gfn, sptep); + flush |= mmu_spte_update(sptep, spte); + } + + if (wrprot) { if (write_fault) ret = RET_PF_EMULATE; - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } - if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) + if (flush) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, KVM_PAGES_PER_HPAGE(level)); - /* - * The fault is fully spurious if and only if the new SPTE and old SPTE - * are identical, and emulation is not required. - */ - if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) { - WARN_ON_ONCE(!was_rmapped); - return RET_PF_SPURIOUS; - } - pgprintk("%s: setting spte %llx\n", __func__, *sptep); - trace_kvm_mmu_set_spte(level, gfn, sptep); if (!was_rmapped) { + WARN_ON_ONCE(ret == RET_PF_SPURIOUS); kvm_update_page_stats(vcpu->kvm, level, 1); - rmap_count = rmap_add(vcpu, sptep, gfn); - if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, sptep, gfn); + rmap_add(vcpu, slot, sptep, gfn); } return ret; } -static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, - bool no_dirty_log) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); - if (!slot) - return KVM_PFN_ERR_FAULT; - - return gfn_to_pfn_memslot_atomic(slot, gfn); -} - static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -2818,8 +2773,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) { - mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn, - page_to_pfn(pages[i]), true, true); + mmu_set_spte(vcpu, slot, start, access, gfn, + page_to_pfn(pages[i]), NULL); put_page(pages[i]); } @@ -2842,11 +2797,13 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, if (!start) continue; if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) - break; + return; start = NULL; } else if (!start) start = spte; } + if (start) + direct_pte_prefetch_many(vcpu, sp, start, spte); } static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) @@ -2924,52 +2881,46 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, return min(host_level, max_level); } -int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp, - bool huge_page_disallowed, int *req_level) +void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - struct kvm_memory_slot *slot; - kvm_pfn_t pfn = *pfnp; + struct kvm_memory_slot *slot = fault->slot; kvm_pfn_t mask; - int level; - *req_level = PG_LEVEL_4K; + fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled; - if (unlikely(max_level == PG_LEVEL_4K)) - return PG_LEVEL_4K; + if (unlikely(fault->max_level == PG_LEVEL_4K)) + return; - if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) - return PG_LEVEL_4K; + if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) + return; - slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); - if (!slot) - return PG_LEVEL_4K; + if (kvm_slot_dirty_track_enabled(slot)) + return; /* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ - *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); - if (level == PG_LEVEL_4K || huge_page_disallowed) - return PG_LEVEL_4K; + fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, + fault->gfn, fault->pfn, + fault->max_level); + if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) + return; /* * mmu_notifier_retry() was successful and mmu_lock is held, so * the pmd can't be split from under us. */ - mask = KVM_PAGES_PER_HPAGE(level) - 1; - VM_BUG_ON((gfn & mask) != (pfn & mask)); - *pfnp = pfn & ~mask; - - return level; + fault->goal_level = fault->req_level; + mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1; + VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask)); + fault->pfn &= ~mask; } -void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, - kvm_pfn_t *pfnp, int *goal_levelp) +void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level) { - int level = *goal_levelp; - - if (cur_level == level && level > PG_LEVEL_4K && + if (cur_level > PG_LEVEL_4K && + cur_level == fault->goal_level && is_shadow_present_pte(spte) && !is_large_pte(spte)) { /* @@ -2979,42 +2930,33 @@ void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, * patching back for them into pfn the next 9 bits of * the address. */ - u64 page_mask = KVM_PAGES_PER_HPAGE(level) - - KVM_PAGES_PER_HPAGE(level - 1); - *pfnp |= gfn & page_mask; - (*goal_levelp)--; + u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - + KVM_PAGES_PER_HPAGE(cur_level - 1); + fault->pfn |= fault->gfn & page_mask; + fault->goal_level--; } } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault, bool is_tdp) +static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; - int level, req_level, ret; - gfn_t gfn = gpa >> PAGE_SHIFT; - gfn_t base_gfn = gfn; + int ret; + gfn_t base_gfn = fault->gfn; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(gpa, level, pfn); - for_each_shadow_entry(vcpu, gpa, it) { + trace_kvm_mmu_spte_requested(fault); + for_each_shadow_entry(vcpu, fault->addr, it) { /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(*it.sptep, gfn, it.level, - &pfn, &level); + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == level) + base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == fault->goal_level) break; drop_large_spte(vcpu, it.sptep); @@ -3025,14 +2967,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); - if (is_tdp && huge_page_disallowed && - req_level >= it.level) + if (fault->is_tdp && fault->huge_page_disallowed && + fault->req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } - ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, - write, level, base_gfn, pfn, prefault, - map_writable); + if (WARN_ON_ONCE(it.level != fault->goal_level)) + return -EFAULT; + + ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL, + base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; @@ -3064,18 +3008,19 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) return -EFAULT; } -static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, - kvm_pfn_t pfn, unsigned int access, - int *ret_val) +static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + unsigned int access, int *ret_val) { /* The pfn is invalid, report the error! */ - if (unlikely(is_error_pfn(pfn))) { - *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); + if (unlikely(is_error_pfn(fault->pfn))) { + *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn); return true; } - if (unlikely(is_noslot_pfn(pfn))) { - vcpu_cache_mmio_info(vcpu, gva, gfn, + if (unlikely(!fault->slot)) { + gva_t gva = fault->is_tdp ? 0 : fault->addr; + + vcpu_cache_mmio_info(vcpu, gva, fault->gfn, access & shadow_mmio_access_mask); /* * If MMIO caching is disabled, emulate immediately without @@ -3091,18 +3036,17 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, return false; } -static bool page_fault_can_be_fast(u32 error_code) +static bool page_fault_can_be_fast(struct kvm_page_fault *fault) { /* * Do not fix the mmio spte with invalid generation number which * need to be updated by slow page fault path. */ - if (unlikely(error_code & PFERR_RSVD_MASK)) + if (fault->rsvd) return false; /* See if the page fault is due to an NX violation */ - if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) - == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) + if (unlikely(fault->exec && fault->present)) return false; /* @@ -3119,9 +3063,7 @@ static bool page_fault_can_be_fast(u32 error_code) * accesses to a present page. */ - return shadow_acc_track_mask != 0 || - ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) - == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); + return shadow_acc_track_mask != 0 || (fault->write && fault->present); } /* @@ -3129,13 +3071,9 @@ static bool page_fault_can_be_fast(u32 error_code) * someone else modified the SPTE from its original value. */ static bool -fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, u64 *sptep, u64 old_spte, u64 new_spte) { - gfn_t gfn; - - WARN_ON(!sp->role.direct); - /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in @@ -3151,24 +3089,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { - /* - * The gfn of direct spte is stable since it is - * calculated by sp->gfn. - */ - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - kvm_vcpu_mark_page_dirty(vcpu, gfn); - } + if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) + mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn); return true; } -static bool is_access_allowed(u32 fault_err_code, u64 spte) +static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte) { - if (fault_err_code & PFERR_FETCH_MASK) + if (fault->exec) return is_executable_pte(spte); - if (fault_err_code & PFERR_WRITE_MASK) + if (fault->write) return is_writable_pte(spte); /* Fault was on Read access */ @@ -3193,9 +3125,6 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { sptep = iterator.sptep; *spte = old_spte; - - if (!is_shadow_present_pte(old_spte)) - break; } return sptep; @@ -3204,7 +3133,7 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) /* * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. */ -static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) +static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; @@ -3212,7 +3141,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) u64 *sptep = NULL; uint retry_count = 0; - if (!page_fault_can_be_fast(error_code)) + if (!page_fault_can_be_fast(fault)) return ret; walk_shadow_page_lockless_begin(vcpu); @@ -3221,9 +3150,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) u64 new_spte; if (is_tdp_mmu(vcpu->arch.mmu)) - sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte); + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); else - sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte); + sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); if (!is_shadow_present_pte(spte)) break; @@ -3242,7 +3171,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * Need not check the access of upper level table entries since * they are always ACC_ALL. */ - if (is_access_allowed(error_code, spte)) { + if (is_access_allowed(fault, spte)) { ret = RET_PF_SPURIOUS; break; } @@ -3257,28 +3186,28 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * be removed in the fast path only if the SPTE was * write-protected for dirty-logging or access tracking. */ - if ((error_code & PFERR_WRITE_MASK) && + if (fault->write && spte_can_locklessly_be_made_writable(spte)) { new_spte |= PT_WRITABLE_MASK; /* - * Do not fix write-permission on the large spte. Since - * we only dirty the first page into the dirty-bitmap in + * Do not fix write-permission on the large spte when + * dirty logging is enabled. Since we only dirty the + * first page into the dirty-bitmap in * fast_pf_fix_direct_spte(), other pages are missed * if its slot has dirty logging enabled. * * Instead, we let the slow page fault path create a * normal spte to fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). */ - if (sp->role.level > PG_LEVEL_4K) + if (sp->role.level > PG_LEVEL_4K && + kvm_slot_dirty_track_enabled(fault->slot)) break; } /* Verify that the fault can be handled in the fast path */ if (new_spte == spte || - !is_access_allowed(error_code, new_spte)) + !is_access_allowed(fault, new_spte)) break; /* @@ -3286,7 +3215,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) { + if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } @@ -3299,7 +3228,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) } while (true); - trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret); + trace_fast_page_fault(vcpu, fault, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; @@ -3472,6 +3401,67 @@ out_unlock: return r; } +static int mmu_first_shadow_root_alloc(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + int r = 0, i; + + /* + * Check if this is the first shadow root being allocated before + * taking the lock. + */ + if (kvm_shadow_root_allocated(kvm)) + return 0; + + mutex_lock(&kvm->slots_arch_lock); + + /* Recheck, under the lock, whether this is the first shadow root. */ + if (kvm_shadow_root_allocated(kvm)) + goto out_unlock; + + /* + * Check if anything actually needs to be allocated, e.g. all metadata + * will be allocated upfront if TDP is disabled. + */ + if (kvm_memslots_have_rmaps(kvm) && + kvm_page_track_write_tracking_enabled(kvm)) + goto out_success; + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, slots) { + /* + * Both of these functions are no-ops if the target is + * already allocated, so unconditionally calling both + * is safe. Intentionally do NOT free allocations on + * failure to avoid having to track which allocations + * were made now versus when the memslot was created. + * The metadata is guaranteed to be freed when the slot + * is freed, and will be kept/used if userspace retries + * KVM_RUN instead of killing the VM. + */ + r = memslot_rmap_alloc(slot, slot->npages); + if (r) + goto out_unlock; + r = kvm_page_track_write_tracking_alloc(slot); + if (r) + goto out_unlock; + } + } + + /* + * Ensure that shadow_root_allocated becomes true strictly after + * all the related pointers are set. + */ +out_success: + smp_store_release(&kvm->arch.shadow_root_allocated, true); + +out_unlock: + mutex_unlock(&kvm->slots_arch_lock); + return r; +} + static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -3502,7 +3492,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) } } - r = alloc_all_memslots_rmaps(vcpu->kvm); + r = mmu_first_shadow_root_alloc(vcpu->kvm); if (r) return r; @@ -3653,6 +3643,33 @@ err_pml4: #endif } +static bool is_unsync_root(hpa_t root) +{ + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(root)) + return false; + + /* + * The read barrier orders the CPU's read of SPTE.W during the page table + * walk before the reads of sp->unsync/sp->unsync_children here. + * + * Even if another CPU was marking the SP as unsync-ed simultaneously, + * any guest page table changes are not guaranteed to be visible anyway + * until this VCPU issues a TLB flush strictly after those changes are + * made. We only need to ensure that the other CPU sets these flags + * before any actual changes to the page tables are made. The comments + * in mmu_try_to_unsync_pages() describe what could go wrong if this + * requirement isn't satisfied. + */ + smp_rmb(); + sp = to_shadow_page(root); + if (sp->unsync || sp->unsync_children) + return true; + + return false; +} + void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; @@ -3670,18 +3687,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu->root_hpa; sp = to_shadow_page(root); - /* - * Even if another CPU was marking the SP as unsync-ed - * simultaneously, any guest page table changes are not - * guaranteed to be visible anyway until this VCPU issues a TLB - * flush strictly after those changes are made. We only need to - * ensure that the other CPU sets these flags before any actual - * changes to the page tables are made. The comments in - * mmu_try_to_unsync_pages() describe what could go wrong if - * this requirement isn't satisfied. - */ - if (!smp_load_acquire(&sp->unsync) && - !smp_load_acquire(&sp->unsync_children)) + if (!is_unsync_root(root)) return; write_lock(&vcpu->kvm->mmu_lock); @@ -3711,6 +3717,19 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) write_unlock(&vcpu->kvm->mmu_lock); } +void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) +{ + unsigned long roots_to_free = 0; + int i; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa)) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + /* sync prev_roots by simply freeing them */ + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); +} + static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr, u32 access, struct x86_exception *exception) { @@ -3763,9 +3782,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level spte = mmu_spte_get_lockless(iterator.sptep); sptes[leaf] = spte; - - if (!is_shadow_present_pte(spte)) - break; } return leaf; @@ -3856,20 +3872,19 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) } static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, - u32 error_code, gfn_t gfn) + struct kvm_page_fault *fault) { - if (unlikely(error_code & PFERR_RSVD_MASK)) + if (unlikely(fault->rsvd)) return false; - if (!(error_code & PFERR_PRESENT_MASK) || - !(error_code & PFERR_WRITE_MASK)) + if (!fault->present || !fault->write) return false; /* * guest is writing the page which is write tracked which can * not be fixed by page fault handler. */ - if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + if (kvm_slot_page_track_is_active(vcpu, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) return true; return false; @@ -3881,11 +3896,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) u64 spte; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) clear_sp_write_flooding_count(iterator.sptep); - if (!is_shadow_present_pte(spte)) - break; - } walk_shadow_page_lockless_end(vcpu); } @@ -3903,11 +3915,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, - gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, - bool write, bool *writable, int *r) +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r) { - struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + struct kvm_memory_slot *slot = fault->slot; bool async; /* @@ -3921,8 +3931,9 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (!kvm_is_visible_memslot(slot)) { /* Don't expose private memslots to L2. */ if (is_guest_mode(vcpu)) { - *pfn = KVM_PFN_NOSLOT; - *writable = false; + fault->slot = NULL; + fault->pfn = KVM_PFN_NOSLOT; + fault->map_writable = false; return false; } /* @@ -3939,46 +3950,46 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, } async = false; - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, - write, writable, hva); + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, + fault->write, &fault->map_writable, + &fault->hva); if (!async) return false; /* *pfn has correct page already */ - if (!prefault && kvm_can_do_async_pf(vcpu)) { - trace_kvm_try_async_get_page(cr2_or_gpa, gfn); - if (kvm_find_async_pf_gfn(vcpu, gfn)) { - trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); + if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { + trace_kvm_try_async_get_page(fault->addr, fault->gfn); + if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { + trace_kvm_async_pf_doublefault(fault->addr, fault->gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); goto out_retry; - } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) + } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) goto out_retry; } - *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, - write, writable, hva); + fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL, + fault->write, &fault->map_writable, + &fault->hva); + return false; out_retry: *r = RET_PF_RETRY; return true; } -static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault, int max_level, bool is_tdp) +static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); - bool write = error_code & PFERR_WRITE_MASK; - bool map_writable; - gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; - kvm_pfn_t pfn; - hva_t hva; int r; - if (page_fault_handle_page_track(vcpu, error_code, gfn)) + fault->gfn = fault->addr >> PAGE_SHIFT; + fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); + + if (page_fault_handle_page_track(vcpu, fault)) return RET_PF_EMULATE; - r = fast_page_fault(vcpu, gpa, error_code); + r = fast_page_fault(vcpu, fault); if (r != RET_PF_INVALID) return r; @@ -3989,11 +4000,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, - write, &map_writable, &r)) + if (kvm_faultin_pfn(vcpu, fault, &r)) return r; - if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) + if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r)) return r; r = RET_PF_RETRY; @@ -4003,36 +4013,34 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, else write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) + if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; if (is_tdp_mmu_fault) - r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level, - pfn, prefault); + r = kvm_tdp_mmu_map(vcpu, fault); else - r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn, - prefault, is_tdp); + r = __direct_map(vcpu, fault); out_unlock: if (is_tdp_mmu_fault) read_unlock(&vcpu->kvm->mmu_lock); else write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(fault->pfn); return r; } -static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, - u32 error_code, bool prefault) +static int nonpaging_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { - pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code); + pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code); /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ - return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault, - PG_LEVEL_2M, false); + fault->max_level = PG_LEVEL_2M; + return direct_page_fault(vcpu, fault); } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, @@ -4068,23 +4076,19 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); -int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - bool prefault) +int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - int max_level; - - for (max_level = KVM_MAX_HUGEPAGE_LEVEL; - max_level > PG_LEVEL_4K; - max_level--) { - int page_num = KVM_PAGES_PER_HPAGE(max_level); - gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); + while (fault->max_level > PG_LEVEL_4K) { + int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); + gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) break; + + --fault->max_level; } - return direct_page_fault(vcpu, gpa, error_code, prefault, - max_level, true); + return direct_page_fault(vcpu, fault); } static void nonpaging_init_context(struct kvm_mmu *context) @@ -4205,7 +4209,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu) } static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - unsigned int access, int *nr_present) + unsigned int access) { if (unlikely(is_mmio_spte(*sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { @@ -4213,7 +4217,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, return true; } - (*nr_present)++; mark_mmio_spte(vcpu, sptep, gfn, access); return true; } @@ -4596,10 +4599,10 @@ static void update_pkru_bitmask(struct kvm_mmu *mmu) unsigned bit; bool wp; - if (!is_cr4_pke(mmu)) { - mmu->pkru_mask = 0; + mmu->pkru_mask = 0; + + if (!is_cr4_pke(mmu)) return; - } wp = is_cr0_wp(mmu); @@ -5212,7 +5215,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; - bool remote_flush, local_flush; + bool flush = false; /* * If we don't have indirect shadow pages, it means no page is @@ -5221,8 +5224,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; - remote_flush = local_flush = false; - pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); /* @@ -5251,18 +5252,17 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!spte) continue; - local_flush = true; while (npte--) { entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && sp->role.level != PG_LEVEL_4K) ++vcpu->kvm->stat.mmu_pde_zapped; if (need_remote_flush(entry, *spte)) - remote_flush = true; + flush = true; ++spte; } } - kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); + kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); write_unlock(&vcpu->kvm->mmu_lock); } @@ -5473,8 +5473,8 @@ slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot, - slot_level_handler fn, bool flush_on_yield) +slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot, + slot_level_handler fn, bool flush_on_yield) { return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield); @@ -5694,13 +5694,7 @@ void kvm_mmu_init_vm(struct kvm *kvm) spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); - if (!kvm_mmu_init_tdp_mmu(kvm)) - /* - * No smp_load/store wrappers needed here as we are in - * VM init and there cannot be any memslots / other threads - * accessing this struct kvm yet. - */ - kvm->arch.memslots_have_rmaps = true; + kvm_mmu_init_tdp_mmu(kvm); node->track_write = kvm_mmu_pte_write; node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; @@ -5716,55 +5710,58 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_mmu_uninit_tdp_mmu(kvm); } +static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +{ + const struct kvm_memory_slot *memslot; + struct kvm_memslots *slots; + bool flush = false; + gfn_t start, end; + int i; + + if (!kvm_memslots_have_rmaps(kvm)) + return flush; + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; + + flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, + start, end - 1, true, flush); + } + } + + return flush; +} + /* * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end * (not including it) */ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; + bool flush; int i; - bool flush = false; write_lock(&kvm->mmu_lock); kvm_inc_notifier_count(kvm, gfn_start, gfn_end); - if (kvm_memslots_have_rmaps(kvm)) { - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) { - gfn_t start, end; - - start = max(gfn_start, memslot->base_gfn); - end = min(gfn_end, memslot->base_gfn + memslot->npages); - if (start >= end) - continue; - - flush = slot_handle_level_range(kvm, - (const struct kvm_memory_slot *) memslot, - kvm_zap_rmapp, PG_LEVEL_4K, - KVM_MAX_HUGEPAGE_LEVEL, start, - end - 1, true, flush); - } - } - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start); - } + flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end); if (is_tdp_mmu_enabled(kvm)) { for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, gfn_end, flush); - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end - gfn_start); } if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end - gfn_start); kvm_dec_notifier_count(kvm, gfn_start, gfn_end); @@ -5860,7 +5857,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true); + /* + * Zap only 4k SPTEs since the legacy MMU only supports dirty + * logging at a 4k granularity and never creates collapsible + * 2m SPTEs during dirty logging. + */ + flush = slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true); if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, slot); write_unlock(&kvm->mmu_lock); @@ -5897,8 +5899,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, - false); + /* + * Clear dirty bits only on 4k SPTEs since the legacy MMU only + * support dirty logging at a 4k granularity. + */ + flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); write_unlock(&kvm->mmu_lock); } @@ -6176,18 +6181,24 @@ void kvm_mmu_module_exit(void) mmu_audit_disable(); } -static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) +static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp) { - unsigned int old_val; + bool was_recovery_enabled, is_recovery_enabled; + uint old_period, new_period; int err; - old_val = nx_huge_pages_recovery_ratio; + was_recovery_enabled = nx_huge_pages_recovery_ratio; + old_period = nx_huge_pages_recovery_period_ms; + err = param_set_uint(val, kp); if (err) return err; - if (READ_ONCE(nx_huge_pages) && - !old_val && nx_huge_pages_recovery_ratio) { + is_recovery_enabled = nx_huge_pages_recovery_ratio; + new_period = nx_huge_pages_recovery_period_ms; + + if (READ_ONCE(nx_huge_pages) && is_recovery_enabled && + (!was_recovery_enabled || old_period > new_period)) { struct kvm *kvm; mutex_lock(&kvm_lock); @@ -6250,8 +6261,17 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) static long get_nx_lpage_recovery_timeout(u64 start_time) { - return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) - ? start_time + 60 * HZ - get_jiffies_64() + uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + uint period = READ_ONCE(nx_huge_pages_recovery_period_ms); + + if (!period && ratio) { + /* Make sure the period is not less than one second. */ + ratio = min(ratio, 3600u); + period = 60 * 60 * 1000 / ratio; + } + + return READ_ONCE(nx_huge_pages) && ratio + ? start_time + msecs_to_jiffies(period) - get_jiffies_64() : MAX_SCHEDULE_TIMEOUT; } diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index bf2bdbf333c2..52c6527b1a06 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -118,13 +118,8 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) kvm_x86_ops.cpu_dirty_log_size; } -extern int nx_huge_pages; -static inline bool is_nx_huge_page_enabled(void) -{ - return READ_ONCE(nx_huge_pages); -} - -int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); +int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, + gfn_t gfn, bool can_unsync, bool prefetch); void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); @@ -155,19 +150,11 @@ enum { RET_PF_SPURIOUS, }; -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) -#define SET_SPTE_SPURIOUS BIT(2) - int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t pfn, int max_level); -int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, - int max_level, kvm_pfn_t *pfnp, - bool huge_page_disallowed, int *req_level); -void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level, - kvm_pfn_t *pfnp, int *goal_levelp); +void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); +void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level); void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index 2924a4081a19..b8151bbca36a 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -252,9 +252,9 @@ TRACE_EVENT( TRACE_EVENT( fast_page_fault, - TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code, + TP_PROTO(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, u64 *sptep, u64 old_spte, int ret), - TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret), + TP_ARGS(vcpu, fault, sptep, old_spte, ret), TP_STRUCT__entry( __field(int, vcpu_id) @@ -268,8 +268,8 @@ TRACE_EVENT( TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->cr2_or_gpa = cr2_or_gpa; - __entry->error_code = error_code; + __entry->cr2_or_gpa = fault->addr; + __entry->error_code = fault->error_code; __entry->sptep = sptep; __entry->old_spte = old_spte; __entry->new_spte = *sptep; @@ -367,8 +367,8 @@ TRACE_EVENT( TRACE_EVENT( kvm_mmu_spte_requested, - TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn), - TP_ARGS(addr, level, pfn), + TP_PROTO(struct kvm_page_fault *fault), + TP_ARGS(fault), TP_STRUCT__entry( __field(u64, gfn) @@ -377,9 +377,9 @@ TRACE_EVENT( ), TP_fast_assign( - __entry->gfn = addr >> PAGE_SHIFT; - __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1)); - __entry->level = level; + __entry->gfn = fault->gfn; + __entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1)); + __entry->level = fault->goal_level; ), TP_printk("gfn %llx pfn %llx level %d", diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 21427e84a82e..cc4eb5b7fb76 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -19,6 +19,12 @@ #include "mmu.h" #include "mmu_internal.h" +bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) +{ + return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) || + !tdp_enabled || kvm_shadow_root_allocated(kvm); +} + void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { int i; @@ -29,12 +35,17 @@ void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) } } -int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, +int kvm_page_track_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages) { - int i; + int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { + if (i == KVM_PAGE_TRACK_WRITE && + !kvm_page_track_write_tracking_enabled(kvm)) + continue; + slot->arch.gfn_track[i] = kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL_ACCOUNT); @@ -57,6 +68,21 @@ static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode) return true; } +int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot) +{ + unsigned short *gfn_track; + + if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE]) + return 0; + + gfn_track = kvcalloc(slot->npages, sizeof(*gfn_track), GFP_KERNEL_ACCOUNT); + if (gfn_track == NULL) + return -ENOMEM; + + slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE] = gfn_track; + return 0; +} + static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode, short count) { @@ -92,6 +118,10 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, if (WARN_ON(!page_track_mode_is_valid(mode))) return; + if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && + !kvm_page_track_write_tracking_enabled(kvm))) + return; + update_gfn_track(slot, gfn, mode, 1); /* @@ -126,6 +156,10 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, if (WARN_ON(!page_track_mode_is_valid(mode))) return; + if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE && + !kvm_page_track_write_tracking_enabled(kvm))) + return; + update_gfn_track(slot, gfn, mode, -1); /* @@ -139,19 +173,22 @@ EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. */ -bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, - enum kvm_page_track_mode mode) +bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode) { - struct kvm_memory_slot *slot; int index; if (WARN_ON(!page_track_mode_is_valid(mode))) return false; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot) return false; + if (mode == KVM_PAGE_TRACK_WRITE && + !kvm_page_track_write_tracking_enabled(vcpu->kvm)) + return false; + index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); return !!READ_ONCE(slot->arch.gfn_track[mode][index]); } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 913d52a7923e..f87d36898c44 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -561,6 +561,7 @@ static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte, bool no_dirty_log) { + struct kvm_memory_slot *slot; unsigned pte_access; gfn_t gfn; kvm_pfn_t pfn; @@ -573,30 +574,21 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); - if (is_error_pfn(pfn)) + if (!slot) return false; - /* - * we call mmu_set_spte() with host_writable = true because - * pte_prefetch_gfn_to_pfn always gets a writable pfn. - */ - mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn, - true, true); + pfn = gfn_to_pfn_memslot_atomic(slot, gfn); + if (is_error_pfn(pfn)) + return false; + mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL); kvm_release_pfn_clean(pfn); return true; } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) -{ - pt_element_t gpte = *(const pt_element_t *)pte; - - FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); -} - static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, struct guest_walker *gw, int level) { @@ -663,21 +655,16 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, * If the guest tries to write a write-protected page, we need to * emulate this operation, return 1 to indicate this case. */ -static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, - struct guest_walker *gw, u32 error_code, - int max_level, kvm_pfn_t pfn, bool map_writable, - bool prefault) +static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + struct guest_walker *gw) { - bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - bool write_fault = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned int direct_access, access; - int top_level, level, req_level, ret; - gfn_t base_gfn = gw->gfn; + int top_level, ret; + gfn_t base_gfn = fault->gfn; + WARN_ON_ONCE(gw->gfn != base_gfn); direct_access = gw->pte_access; top_level = vcpu->arch.mmu->root_level; @@ -695,7 +682,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) goto out_gpte_changed; - for (shadow_walk_init(&it, vcpu, addr); + for (shadow_walk_init(&it, vcpu, fault->addr); shadow_walk_okay(&it) && it.level > gw->level; shadow_walk_next(&it)) { gfn_t table_gfn; @@ -707,7 +694,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, if (!is_shadow_present_pte(*it.sptep)) { table_gfn = gw->table_gfn[it.level - 2]; access = gw->pt_access[it.level - 2]; - sp = kvm_mmu_get_page(vcpu, table_gfn, addr, + sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr, it.level-1, false, access); /* * We must synchronize the pagetable before linking it @@ -741,10 +728,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, link_shadow_page(vcpu, it.sptep, sp); } - level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(addr, gw->level, pfn); + trace_kvm_mmu_spte_requested(fault); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); @@ -753,12 +739,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ - if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level, - &pfn, &level); + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, *it.sptep, it.level); - base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); - if (it.level == level) + base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + if (it.level == fault->goal_level) break; validate_direct_spte(vcpu, it.sptep, direct_access); @@ -766,16 +751,20 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr, drop_large_spte(vcpu, it.sptep); if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, addr, + sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); - if (huge_page_disallowed && req_level >= it.level) + if (fault->huge_page_disallowed && + fault->req_level >= it.level) account_huge_nx_page(vcpu->kvm, sp); } } - ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, - it.level, base_gfn, pfn, prefault, map_writable); + if (WARN_ON_ONCE(it.level != fault->goal_level)) + return -EFAULT; + + ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access, + base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; @@ -841,45 +830,40 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, * Returns: 1 if we need to emulate the instruction, 0 otherwise, or * a negative value on error. */ -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, - bool prefault) +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool write_fault = error_code & PFERR_WRITE_MASK; - bool user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; int r; - kvm_pfn_t pfn; - hva_t hva; unsigned long mmu_seq; - bool map_writable, is_self_change_mapping; - int max_level; + bool is_self_change_mapping; - pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); + pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); + WARN_ON_ONCE(fault->is_tdp); /* + * Look up the guest pte for the faulting address. * If PFEC.RSVD is set, this is a shadow page fault. * The bit needs to be cleared before walking guest page tables. */ - error_code &= ~PFERR_RSVD_MASK; - - /* - * Look up the guest pte for the faulting address. - */ - r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); + r = FNAME(walk_addr)(&walker, vcpu, fault->addr, + fault->error_code & ~PFERR_RSVD_MASK); /* * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { pgprintk("%s: guest page fault\n", __func__); - if (!prefault) + if (!fault->prefetch) kvm_inject_emulated_page_fault(vcpu, &walker.fault); return RET_PF_RETRY; } - if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { - shadow_page_table_clear_flood(vcpu, addr); + fault->gfn = walker.gfn; + fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); + + if (page_fault_handle_page_track(vcpu, fault)) { + shadow_page_table_clear_flood(vcpu, fault->addr); return RET_PF_EMULATE; } @@ -890,29 +874,28 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, vcpu->arch.write_fault_to_shadow_pgtable = false; is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, - &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); + &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable); if (is_self_change_mapping) - max_level = PG_LEVEL_4K; + fault->max_level = PG_LEVEL_4K; else - max_level = walker.level; + fault->max_level = walker.level; mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva, - write_fault, &map_writable, &r)) + if (kvm_faultin_pfn(vcpu, fault, &r)) return r; - if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) + if (handle_abnormal_pfn(vcpu, fault, walker.pte_access, &r)) return r; /* * Do not change pte_access if the pfn is a mmio page, otherwise * we will cache the incorrect access into mmio spte. */ - if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && - !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) { + if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) && + !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) { walker.pte_access |= ACC_WRITE_MASK; walker.pte_access &= ~ACC_USER_MASK; @@ -928,20 +911,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva)) + if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; - r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn, - map_writable, prefault); + r = FNAME(fetch)(vcpu, fault, &walker); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(fault->pfn); return r; } @@ -1007,10 +989,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) sizeof(pt_element_t))) break; - FNAME(update_pte)(vcpu, sp, sptep, &gpte); + FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false); } - if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) + if (!sp->unsync_children) break; } write_unlock(&vcpu->kvm->mmu_lock); @@ -1066,14 +1048,19 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. + * + * Returns + * < 0: the sp should be zapped + * 0: the sp is synced and no tlb flushing is required + * > 0: the sp is synced and tlb flushing is required */ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base; - int i, nr_present = 0; + int i; bool host_writable; gpa_t first_pte_gpa; - int set_spte_ret = 0; + bool flush = false; /* * Ignore various flags when verifying that it's safe to sync a shadow @@ -1098,11 +1085,13 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) */ if (WARN_ON_ONCE(sp->role.direct || (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word)) - return 0; + return -1; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + u64 *sptep, spte; + struct kvm_memory_slot *slot; unsigned pte_access; pt_element_t gpte; gpa_t pte_gpa; @@ -1115,10 +1104,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, sizeof(pt_element_t))) - return 0; + return -1; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { - set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; + flush = true; continue; } @@ -1127,30 +1116,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_access &= FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); - if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, - &nr_present)) + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) continue; if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); - set_spte_ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; + flush = true; continue; } - nr_present++; - - host_writable = sp->spt[i] & shadow_host_writable_mask; + sptep = &sp->spt[i]; + spte = *sptep; + host_writable = spte & shadow_host_writable_mask; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + make_spte(vcpu, sp, slot, pte_access, gfn, + spte_to_pfn(spte), spte, true, false, + host_writable, &spte); - set_spte_ret |= set_spte(vcpu, &sp->spt[i], - pte_access, PG_LEVEL_4K, - gfn, spte_to_pfn(sp->spt[i]), - true, false, host_writable); + flush |= mmu_spte_update(sptep, spte); } - if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) - kvm_flush_remote_tlbs(vcpu->kvm); - - return nr_present; + return flush; } #undef pt_element_t diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 3e97cdb13eb7..0c76c45fdb68 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -89,15 +89,17 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) E820_TYPE_RAM); } -int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, - bool can_unsync, bool host_writable, bool ad_disabled, - u64 *new_spte) +bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct kvm_memory_slot *slot, + unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, + u64 old_spte, bool prefetch, bool can_unsync, + bool host_writable, u64 *new_spte) { + int level = sp->role.level; u64 spte = SPTE_MMU_PRESENT_MASK; - int ret = 0; + bool wrprot = false; - if (ad_disabled) + if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK; @@ -109,7 +111,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, * read access. See FNAME(gpte_access) in paging_tmpl.h. */ spte |= shadow_present_mask; - if (!speculative) + if (!prefetch) spte |= spte_shadow_accessed_mask(spte); if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && @@ -150,7 +152,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots. * Same reasoning can be applied to dirty page accounting. */ - if (!can_unsync && is_writable_pte(old_spte)) + if (is_writable_pte(old_spte)) goto out; /* @@ -159,10 +161,10 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. */ - if (mmu_try_to_unsync_pages(vcpu, gfn, can_unsync)) { + if (mmu_try_to_unsync_pages(vcpu, slot, gfn, can_unsync, prefetch)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); - ret |= SET_SPTE_WRITE_PROTECTED_PT; + wrprot = true; pte_access &= ~ACC_WRITE_MASK; spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); } @@ -171,16 +173,22 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, if (pte_access & ACC_WRITE_MASK) spte |= spte_shadow_dirty_mask(spte); - if (speculative) +out: + if (prefetch) spte = mark_spte_for_access_track(spte); -out: WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); + if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { + /* Enforced by kvm_mmu_hugepage_adjust. */ + WARN_ON(level > PG_LEVEL_4K); + mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); + } + *new_spte = spte; - return ret; + return wrprot; } u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index eb7b227fc6cf..cc432f9a966b 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -310,12 +310,7 @@ static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, u64 spte, int level) { - /* - * Use a bitwise-OR instead of a logical-OR to aggregate the reserved - * bits and EPT's invalid memtype/XWR checks to avoid an extra Jcc - * (this is extremely unlikely to be short-circuited as true). - */ - return __is_bad_mt_xwr(rsvd_check, spte) | + return __is_bad_mt_xwr(rsvd_check, spte) || __is_rsvd_bits_set(rsvd_check, spte, level); } @@ -334,15 +329,11 @@ static inline u64 get_mmio_spte_generation(u64 spte) return gen; } -/* Bits which may be returned by set_spte() */ -#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) -#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) -#define SET_SPTE_SPURIOUS BIT(2) - -int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level, - gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative, - bool can_unsync, bool host_writable, bool ad_disabled, - u64 *new_spte); +bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct kvm_memory_slot *slot, + unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, + u64 old_spte, bool prefetch, bool can_unsync, + bool host_writable, u64 *new_spte); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access); u64 mark_spte_for_access_track(u64 spte); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 64ccfc1fa553..a54c3491af42 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -167,6 +167,7 @@ static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu, role.direct = true; role.gpte_is_8_bytes = true; role.access = ACC_ALL; + role.ad_disabled = !shadow_accessed_mask; return role; } @@ -489,8 +490,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, } /* - * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically - * and handle the associated bookkeeping, but do not mark the page dirty + * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically + * and handle the associated bookkeeping. Do not mark the page dirty * in KVM's dirty bitmaps. * * @kvm: kvm instance @@ -499,9 +500,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, * Returns: true if the SPTE was set, false if it was not. If false is returned, * this function will have no side-effects. */ -static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, + struct tdp_iter *iter, + u64 new_spte) { lockdep_assert_held_read(&kvm->mmu_lock); @@ -527,43 +528,6 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, return true; } -/* - * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a - * TDP page fault. - * - * @vcpu: The vcpu instance that took the TDP page fault. - * @iter: a tdp_iter instance currently on the SPTE that should be set - * @new_spte: The value the SPTE should be set to - * - * Returns: true if the SPTE was set, false if it was not. If false is returned, - * this function will have no side-effects. - */ -static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu, - struct tdp_iter *iter, - u64 new_spte) -{ - struct kvm *kvm = vcpu->kvm; - - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) - return false; - - /* - * Use kvm_vcpu_gfn_to_memslot() instead of going through - * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot. - */ - if (is_writable_pte(new_spte)) { - struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn); - - if (slot && kvm_slot_dirty_track_enabled(slot)) { - /* Enforced by kvm_mmu_hugepage_adjust. */ - WARN_ON_ONCE(iter->level > PG_LEVEL_4K); - mark_page_dirty_in_slot(kvm, slot, iter->gfn); - } - } - - return true; -} - static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, struct tdp_iter *iter) { @@ -573,7 +537,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE)) + if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) return false; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, @@ -929,26 +893,26 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) * Installs a last-level SPTE to handle a TDP page fault. * (NPT/EPT violation/misconfiguration) */ -static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, - int map_writable, - struct tdp_iter *iter, - kvm_pfn_t pfn, bool prefault) +static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + struct tdp_iter *iter) { + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); u64 new_spte; int ret = RET_PF_FIXED; - int make_spte_ret = 0; + bool wrprot = false; - if (unlikely(is_noslot_pfn(pfn))) + WARN_ON(sp->role.level != fault->goal_level); + if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else - make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn, - pfn, iter->old_spte, prefault, true, - map_writable, !shadow_accessed_mask, - &new_spte); + wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, + fault->pfn, iter->old_spte, fault->prefetch, true, + fault->map_writable, &new_spte); if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte)) + else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) return RET_PF_RETRY; /* @@ -956,10 +920,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, * protected, emulation is needed. If the emulation was skipped, * the vCPU would have the same fault again. */ - if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { - if (write) + if (wrprot) { + if (fault->write) ret = RET_PF_EMULATE; - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */ @@ -986,37 +949,26 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing * page tables and SPTEs to translate the faulting guest physical address. */ -int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault) +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(); - bool write = error_code & PFERR_WRITE_MASK; - bool exec = error_code & PFERR_FETCH_MASK; - bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled; struct kvm_mmu *mmu = vcpu->arch.mmu; struct tdp_iter iter; struct kvm_mmu_page *sp; u64 *child_pt; u64 new_spte; int ret; - gfn_t gfn = gpa >> PAGE_SHIFT; - int level; - int req_level; - level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn, - huge_page_disallowed, &req_level); + kvm_mmu_hugepage_adjust(vcpu, fault); - trace_kvm_mmu_spte_requested(gpa, level, pfn); + trace_kvm_mmu_spte_requested(fault); rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { - if (nx_huge_page_workaround_enabled) - disallowed_hugepage_adjust(iter.old_spte, gfn, - iter.level, &pfn, &level); + tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { + if (fault->nx_huge_page_workaround_enabled) + disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); - if (iter.level == level) + if (iter.level == fault->goal_level) break; /* @@ -1052,10 +1004,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) { + if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) { tdp_mmu_link_page(vcpu->kvm, sp, - huge_page_disallowed && - req_level >= iter.level); + fault->huge_page_disallowed && + fault->req_level >= iter.level); trace_kvm_mmu_get_page(sp, true); } else { @@ -1065,13 +1017,12 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, } } - if (iter.level != level) { + if (iter.level != fault->goal_level) { rcu_read_unlock(); return RET_PF_RETRY; } - ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter, - pfn, prefault); + ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); rcu_read_unlock(); return ret; @@ -1241,8 +1192,7 @@ retry: new_spte = iter.old_spte & ~PT_WRITABLE_MASK; - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, - new_spte)) { + if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { /* * The iter must explicitly re-read the SPTE because * the atomic cmpxchg failed. @@ -1310,8 +1260,7 @@ retry: continue; } - if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter, - new_spte)) { + if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) { /* * The iter must explicitly re-read the SPTE because * the atomic cmpxchg failed. diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 358f447d4012..476b133544dd 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -48,9 +48,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm); void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm); void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm); -int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, - int map_writable, int max_level, kvm_pfn_t pfn, - bool prefault); +int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, bool flush); @@ -92,7 +90,6 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, #ifdef CONFIG_X86_64 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) @@ -114,7 +111,6 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu) #else static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; } static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {} -static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; } static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; } static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; } #endif diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 0772bad9165c..09873f6488f7 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -319,7 +319,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) } /* check if idx is a valid index to access PMU */ -int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 0e4f2b1fa9fb..59d6b76203d5 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -32,7 +32,7 @@ struct kvm_pmu_ops { struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); - int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); + bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); @@ -149,7 +149,7 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); -int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); +bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 8052d92069e0..affc0ea98d30 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -904,7 +904,8 @@ bool svm_check_apicv_inhibit_reasons(ulong bit) BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_X2APIC); + BIT(APICV_INHIBIT_REASON_X2APIC) | + BIT(APICV_INHIBIT_REASON_BLOCKIRQ); return supported & BIT(bit); } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 510b833cbd39..f8b7bc04b3e7 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -238,6 +238,18 @@ static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size) kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1); } +static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl) +{ + /* Nested FLUSHBYASID is not supported yet. */ + switch(tlb_ctl) { + case TLB_CONTROL_DO_NOTHING: + case TLB_CONTROL_FLUSH_ALL_ASID: + return true; + default: + return false; + } +} + static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, struct vmcb_control_area *control) { @@ -257,6 +269,9 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, IOPM_SIZE))) return false; + if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl))) + return false; + return true; } @@ -538,8 +553,17 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(vcpu); - svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset = - vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + vcpu->arch.l1_tsc_offset, + svm->nested.ctl.tsc_offset, + svm->tsc_ratio_msr); + + svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset; + + if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + WARN_ON(!svm->tsc_scaling_enabled); + nested_svm_update_tsc_ratio_msr(vcpu); + } svm->vmcb->control.int_ctl = (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | @@ -550,9 +574,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) svm->vmcb->control.event_inj = svm->nested.ctl.event_inj; svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err; - svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count; - svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh; - nested_svm_transition_tlb_flush(vcpu); /* Enter Guest-Mode */ @@ -810,11 +831,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.event_inj = svm->nested.ctl.event_inj; vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; - vmcb12->control.pause_filter_count = - svm->vmcb->control.pause_filter_count; - vmcb12->control.pause_filter_thresh = - svm->vmcb->control.pause_filter_thresh; - nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); svm_switch_vmcb(svm, &svm->vmcb01); @@ -832,6 +848,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } + if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) { + WARN_ON(!svm->tsc_scaling_enabled); + vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; + svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); + } + svm->nested.ctl.nested_cr3 = 0; /* @@ -1219,6 +1241,16 @@ int nested_svm_exit_special(struct vcpu_svm *svm) return NESTED_EXIT_CONTINUE; } +void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + vcpu->arch.tsc_scaling_ratio = + kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio, + svm->tsc_ratio_msr); + svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio); +} + static int svm_get_nested_state(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, u32 user_data_size) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index fdf587f19c5f..871c426ec389 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -181,14 +181,13 @@ static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER); } -/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); idx &= ~(3u << 30); - return (idx >= pmu->nr_arch_gp_counters); + return idx < pmu->nr_arch_gp_counters; } /* idx is the ECX register of RDPMC instruction */ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index c36b5fe4c27c..902c52a8dd0c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -17,10 +17,10 @@ #include <linux/misc_cgroup.h> #include <linux/processor.h> #include <linux/trace_events.h> -#include <asm/fpu/internal.h> #include <asm/pkru.h> #include <asm/trapnr.h> +#include <asm/fpu/xcr.h> #include "x86.h" #include "svm.h" @@ -120,16 +120,26 @@ static bool __sev_recycle_asids(int min_asid, int max_asid) return true; } +static int sev_misc_cg_try_charge(struct kvm_sev_info *sev) +{ + enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; + return misc_cg_try_charge(type, sev->misc_cg, 1); +} + +static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) +{ + enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; + misc_cg_uncharge(type, sev->misc_cg, 1); +} + static int sev_asid_new(struct kvm_sev_info *sev) { int asid, min_asid, max_asid, ret; bool retry = true; - enum misc_res_type type; - type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; WARN_ON(sev->misc_cg); sev->misc_cg = get_current_misc_cg(); - ret = misc_cg_try_charge(type, sev->misc_cg, 1); + ret = sev_misc_cg_try_charge(sev); if (ret) { put_misc_cg(sev->misc_cg); sev->misc_cg = NULL; @@ -162,7 +172,7 @@ again: return asid; e_uncharge: - misc_cg_uncharge(type, sev->misc_cg, 1); + sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); sev->misc_cg = NULL; return ret; @@ -179,7 +189,6 @@ static void sev_asid_free(struct kvm_sev_info *sev) { struct svm_cpu_data *sd; int cpu; - enum misc_res_type type; mutex_lock(&sev_bitmap_lock); @@ -192,8 +201,7 @@ static void sev_asid_free(struct kvm_sev_info *sev) mutex_unlock(&sev_bitmap_lock); - type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; - misc_cg_uncharge(type, sev->misc_cg, 1); + sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); sev->misc_cg = NULL; } @@ -590,7 +598,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) * traditional VMSA as it has been built so far (in prep * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. */ - memcpy(svm->vmsa, save, sizeof(*save)); + memcpy(svm->sev_es.vmsa, save, sizeof(*save)); return 0; } @@ -612,13 +620,18 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, * the VMSA memory content (i.e it will write the same memory region * with the guest's key), so invalidate it first. */ - clflush_cache_range(svm->vmsa, PAGE_SIZE); + clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE); vmsa.reserved = 0; vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; - vmsa.address = __sme_pa(svm->vmsa); + vmsa.address = __sme_pa(svm->sev_es.vmsa); vmsa.len = PAGE_SIZE; - return sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); + if (ret) + return ret; + + vcpu->arch.guest_state_protected = true; + return 0; } static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp) @@ -1479,6 +1492,13 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) goto e_free_trans; } + /* + * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP + * encrypts the written data with the guest's key, and the cache may + * contain dirty, unencrypted data. + */ + sev_clflush_pages(guest_page, n); + /* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */ data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset; data.guest_address |= sev_me_mask; @@ -1524,6 +1544,201 @@ static bool cmd_allowed_from_miror(u32 cmd_id) return false; } +static int sev_lock_for_migration(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + /* + * Bail if this VM is already involved in a migration to avoid deadlock + * between two VMs trying to migrate to/from each other. + */ + if (atomic_cmpxchg_acquire(&sev->migration_in_progress, 0, 1)) + return -EBUSY; + + mutex_lock(&kvm->lock); + + return 0; +} + +static void sev_unlock_after_migration(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + mutex_unlock(&kvm->lock); + atomic_set_release(&sev->migration_in_progress, 0); +} + + +static int sev_lock_vcpus_for_migration(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i, j; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (mutex_lock_killable(&vcpu->mutex)) + goto out_unlock; + } + + return 0; + +out_unlock: + kvm_for_each_vcpu(j, vcpu, kvm) { + if (i == j) + break; + + mutex_unlock(&vcpu->mutex); + } + return -EINTR; +} + +static void sev_unlock_vcpus_for_migration(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_unlock(&vcpu->mutex); + } +} + +static void sev_migrate_from(struct kvm_sev_info *dst, + struct kvm_sev_info *src) +{ + dst->active = true; + dst->asid = src->asid; + dst->handle = src->handle; + dst->pages_locked = src->pages_locked; + + src->asid = 0; + src->active = false; + src->handle = 0; + src->pages_locked = 0; + + INIT_LIST_HEAD(&dst->regions_list); + list_replace_init(&src->regions_list, &dst->regions_list); +} + +static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) +{ + int i; + struct kvm_vcpu *dst_vcpu, *src_vcpu; + struct vcpu_svm *dst_svm, *src_svm; + + if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus)) + return -EINVAL; + + kvm_for_each_vcpu(i, src_vcpu, src) { + if (!src_vcpu->arch.guest_state_protected) + return -EINVAL; + } + + kvm_for_each_vcpu(i, src_vcpu, src) { + src_svm = to_svm(src_vcpu); + dst_vcpu = kvm_get_vcpu(dst, i); + dst_svm = to_svm(dst_vcpu); + + /* + * Transfer VMSA and GHCB state to the destination. Nullify and + * clear source fields as appropriate, the state now belongs to + * the destination. + */ + memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es)); + dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa; + dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa; + dst_vcpu->arch.guest_state_protected = true; + + memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es)); + src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE; + src_svm->vmcb->control.vmsa_pa = INVALID_PAGE; + src_vcpu->arch.guest_state_protected = false; + } + to_kvm_svm(src)->sev_info.es_active = false; + to_kvm_svm(dst)->sev_info.es_active = true; + + return 0; +} + +int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) +{ + struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *src_sev, *cg_cleanup_sev; + struct file *source_kvm_file; + struct kvm *source_kvm; + bool charged = false; + int ret; + + ret = sev_lock_for_migration(kvm); + if (ret) + return ret; + + if (sev_guest(kvm)) { + ret = -EINVAL; + goto out_unlock; + } + + source_kvm_file = fget(source_fd); + if (!file_is_kvm(source_kvm_file)) { + ret = -EBADF; + goto out_fput; + } + + source_kvm = source_kvm_file->private_data; + ret = sev_lock_for_migration(source_kvm); + if (ret) + goto out_fput; + + if (!sev_guest(source_kvm)) { + ret = -EINVAL; + goto out_source; + } + + src_sev = &to_kvm_svm(source_kvm)->sev_info; + dst_sev->misc_cg = get_current_misc_cg(); + cg_cleanup_sev = dst_sev; + if (dst_sev->misc_cg != src_sev->misc_cg) { + ret = sev_misc_cg_try_charge(dst_sev); + if (ret) + goto out_dst_cgroup; + charged = true; + } + + ret = sev_lock_vcpus_for_migration(kvm); + if (ret) + goto out_dst_cgroup; + ret = sev_lock_vcpus_for_migration(source_kvm); + if (ret) + goto out_dst_vcpu; + + if (sev_es_guest(source_kvm)) { + ret = sev_es_migrate_from(kvm, source_kvm); + if (ret) + goto out_source_vcpu; + } + sev_migrate_from(dst_sev, src_sev); + kvm_vm_dead(source_kvm); + cg_cleanup_sev = src_sev; + ret = 0; + +out_source_vcpu: + sev_unlock_vcpus_for_migration(source_kvm); +out_dst_vcpu: + sev_unlock_vcpus_for_migration(kvm); +out_dst_cgroup: + /* Operates on the source on success, on the destination on failure. */ + if (charged) + sev_misc_cg_uncharge(cg_cleanup_sev); + put_misc_cg(cg_cleanup_sev->misc_cg); + cg_cleanup_sev->misc_cg = NULL; +out_source: + sev_unlock_after_migration(source_kvm); +out_fput: + if (source_kvm_file) + fput(source_kvm_file); +out_unlock: + sev_unlock_after_migration(kvm); + return ret; +} + int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -2026,16 +2241,16 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); if (vcpu->arch.guest_state_protected) - sev_flush_guest_memory(svm, svm->vmsa, PAGE_SIZE); - __free_page(virt_to_page(svm->vmsa)); + sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE); + __free_page(virt_to_page(svm->sev_es.vmsa)); - if (svm->ghcb_sa_free) - kfree(svm->ghcb_sa); + if (svm->sev_es.ghcb_sa_free) + kfree(svm->sev_es.ghcb_sa); } static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -2061,7 +2276,7 @@ static void dump_ghcb(struct vcpu_svm *svm) static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; /* * The GHCB protocol so far allows for the following data @@ -2081,7 +2296,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; u64 exit_code; /* @@ -2128,7 +2343,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) struct ghcb *ghcb; u64 exit_code = 0; - ghcb = svm->ghcb; + ghcb = svm->sev_es.ghcb; /* Only GHCB Usage code 0 is supported */ if (ghcb->ghcb_usage) @@ -2246,33 +2461,34 @@ vmgexit_err: void sev_es_unmap_ghcb(struct vcpu_svm *svm) { - if (!svm->ghcb) + if (!svm->sev_es.ghcb) return; - if (svm->ghcb_sa_free) { + if (svm->sev_es.ghcb_sa_free) { /* * The scratch area lives outside the GHCB, so there is a * buffer that, depending on the operation performed, may * need to be synced, then freed. */ - if (svm->ghcb_sa_sync) { + if (svm->sev_es.ghcb_sa_sync) { kvm_write_guest(svm->vcpu.kvm, - ghcb_get_sw_scratch(svm->ghcb), - svm->ghcb_sa, svm->ghcb_sa_len); - svm->ghcb_sa_sync = false; + ghcb_get_sw_scratch(svm->sev_es.ghcb), + svm->sev_es.ghcb_sa, + svm->sev_es.ghcb_sa_len); + svm->sev_es.ghcb_sa_sync = false; } - kfree(svm->ghcb_sa); - svm->ghcb_sa = NULL; - svm->ghcb_sa_free = false; + kfree(svm->sev_es.ghcb_sa); + svm->sev_es.ghcb_sa = NULL; + svm->sev_es.ghcb_sa_free = false; } - trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->ghcb); + trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb); sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->ghcb_map, true); - svm->ghcb = NULL; + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true); + svm->sev_es.ghcb = NULL; } void pre_sev_run(struct vcpu_svm *svm, int cpu) @@ -2302,7 +2518,7 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) { struct vmcb_control_area *control = &svm->vmcb->control; - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; u64 ghcb_scratch_beg, ghcb_scratch_end; u64 scratch_gpa_beg, scratch_gpa_end; void *scratch_va; @@ -2338,7 +2554,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) return false; } - scratch_va = (void *)svm->ghcb; + scratch_va = (void *)svm->sev_es.ghcb; scratch_va += (scratch_gpa_beg - control->ghcb_gpa); } else { /* @@ -2368,12 +2584,12 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) * the vCPU next time (i.e. a read was requested so the data * must be written back to the guest memory). */ - svm->ghcb_sa_sync = sync; - svm->ghcb_sa_free = true; + svm->sev_es.ghcb_sa_sync = sync; + svm->sev_es.ghcb_sa_free = true; } - svm->ghcb_sa = scratch_va; - svm->ghcb_sa_len = len; + svm->sev_es.ghcb_sa = scratch_va; + svm->sev_es.ghcb_sa_len = len; return true; } @@ -2492,15 +2708,15 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) return -EINVAL; } - if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) { + if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) { /* Unable to map GHCB from guest */ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", ghcb_gpa); return -EINVAL; } - svm->ghcb = svm->ghcb_map.hva; - ghcb = svm->ghcb_map.hva; + svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; + ghcb = svm->sev_es.ghcb_map.hva; trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb); @@ -2523,7 +2739,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = kvm_sev_es_mmio_read(vcpu, control->exit_info_1, control->exit_info_2, - svm->ghcb_sa); + svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_MMIO_WRITE: if (!setup_vmgexit_scratch(svm, false, control->exit_info_2)) @@ -2532,7 +2748,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = kvm_sev_es_mmio_write(vcpu, control->exit_info_1, control->exit_info_2, - svm->ghcb_sa); + svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_NMI_COMPLETE: ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET); @@ -2579,11 +2795,21 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) { - if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2)) + int count; + int bytes; + + if (svm->vmcb->control.exit_info_2 > INT_MAX) + return -EINVAL; + + count = svm->vmcb->control.exit_info_2; + if (unlikely(check_mul_overflow(count, size, &bytes))) + return -EINVAL; + + if (!setup_vmgexit_scratch(svm, in, bytes)) return -EINVAL; - return kvm_sev_es_string_io(&svm->vcpu, size, port, - svm->ghcb_sa, svm->ghcb_sa_len, in); + return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa, + count, in); } void sev_es_init_vmcb(struct vcpu_svm *svm) @@ -2598,7 +2824,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm) * VMCB page. Do not include the encryption mask on the VMSA physical * address since hardware will access it using the guest key. */ - svm->vmcb->control.vmsa_pa = __pa(svm->vmsa); + svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); @@ -2631,11 +2857,11 @@ void sev_es_init_vmcb(struct vcpu_svm *svm) set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1); } -void sev_es_create_vcpu(struct vcpu_svm *svm) +void sev_es_vcpu_reset(struct vcpu_svm *svm) { /* - * Set the GHCB MSR value as per the GHCB specification when creating - * a vCPU for an SEV-ES guest. + * Set the GHCB MSR value as per the GHCB specification when emulating + * vCPU RESET for an SEV-ES guest. */ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX, GHCB_VERSION_MIN, @@ -2670,8 +2896,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) struct vcpu_svm *svm = to_svm(vcpu); /* First SIPI: Use the values as initially set by the VMM */ - if (!svm->received_first_sipi) { - svm->received_first_sipi = true; + if (!svm->sev_es.received_first_sipi) { + svm->sev_es.received_first_sipi = true; return; } @@ -2680,8 +2906,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a * non-zero value. */ - if (!svm->ghcb) + if (!svm->sev_es.ghcb) return; - ghcb_set_sw_exit_info_2(svm->ghcb, 1); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 989685098b3e..5630c241d5f6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -25,6 +25,7 @@ #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/rwsem.h> +#include <linux/cc_platform.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -36,6 +37,7 @@ #include <asm/spec-ctrl.h> #include <asm/cpu_device_id.h> #include <asm/traps.h> +#include <asm/fpu/api.h> #include <asm/virtext.h> #include "trace.h" @@ -186,6 +188,13 @@ module_param(vls, int, 0444); static int vgif = true; module_param(vgif, int, 0444); +/* enable/disable LBR virtualization */ +static int lbrv = true; +module_param(lbrv, int, 0444); + +static int tsc_scaling = true; +module_param(tsc_scaling, int, 0444); + /* * enable / disable AVIC. Because the defaults differ for APICv * support between VMX and SVM we cannot use module_param_named. @@ -455,7 +464,7 @@ static int has_svm(void) return 0; } - if (sev_active()) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { pr_info("KVM is unsupported when running as an SEV guest\n"); return 0; } @@ -466,7 +475,7 @@ static int has_svm(void) static void svm_hardware_disable(void) { /* Make sure we clean up behind us */ - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) + if (tsc_scaling) wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); cpu_svm_disable(); @@ -509,6 +518,10 @@ static int svm_hardware_enable(void) wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area)); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + /* + * Set the default value, even if we don't use TSC scaling + * to avoid having stale value in the msr + */ wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT); } @@ -929,6 +942,9 @@ static __init void svm_set_cpu_caps(void) if (npt_enabled) kvm_cpu_cap_set(X86_FEATURE_NPT); + if (tsc_scaling) + kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); + /* Nested VM can receive #VMEXIT instead of triggering #GP */ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); } @@ -976,10 +992,15 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); - if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; + if (tsc_scaling) { + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + tsc_scaling = false; + } else { + pr_info("TSC scaling supported\n"); + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; + } } tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); @@ -1059,6 +1080,13 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } + svm_set_cpu_caps(); /* @@ -1109,7 +1137,9 @@ static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu) static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) { - return kvm_default_tsc_scaling_ratio; + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->tsc_ratio_msr; } static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -1121,7 +1151,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier) { wrmsrl(MSR_AMD64_TSC_RATIO, multiplier); } @@ -1150,6 +1180,38 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu, } } +static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (guest_cpuid_is_intel(vcpu)) { + /* + * We must intercept SYSENTER_EIP and SYSENTER_ESP + * accesses because the processor only stores 32 bits. + * For the same reason we cannot use virtual VMLOAD/VMSAVE. + */ + svm_set_intercept(svm, INTERCEPT_VMLOAD); + svm_set_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); + } else { + /* + * If hardware supports Virtual VMLOAD VMSAVE then enable it + * in VMCB and clear intercepts to avoid #VMEXIT. + */ + if (vls) { + svm_clr_intercept(svm, INTERCEPT_VMLOAD); + svm_clr_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + } + /* No need to intercept these MSRs */ + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); + set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); + } +} + static void init_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1296,11 +1358,25 @@ static void init_vmcb(struct kvm_vcpu *vcpu) } svm_hv_init_vmcb(svm->vmcb); + init_vmcb_after_set_cpuid(vcpu); vmcb_mark_all_dirty(svm->vmcb); enable_gif(svm); +} +static void __svm_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm_vcpu_init_msrpm(vcpu, svm->msrpm); + + svm_init_osvw(vcpu); + vcpu->arch.microcode_version = 0x01000065; + svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio; + + if (sev_es_guest(vcpu->kvm)) + sev_es_vcpu_reset(svm); } static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -1311,6 +1387,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) svm->virt_spec_ctrl = 0; init_vmcb(vcpu); + + if (!init_event) + __svm_vcpu_reset(vcpu); } void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) @@ -1346,10 +1425,10 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) /* * SEV-ES guests maintain an encrypted version of their FPU * state which is restored and saved on VMRUN and VMEXIT. - * Free the fpu structure to prevent KVM from attempting to - * access the FPU state. + * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't + * do xsave/xrstor on it. */ - kvm_free_guest_fpu(vcpu); + fpstate_set_confidential(&vcpu->arch.guest_fpu); } err = avic_init_vcpu(svm); @@ -1370,24 +1449,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->vmcb01.ptr = page_address(vmcb01_page); svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT); + svm_switch_vmcb(svm, &svm->vmcb01); if (vmsa_page) - svm->vmsa = page_address(vmsa_page); + svm->sev_es.vmsa = page_address(vmsa_page); svm->guest_state_loaded = false; - svm_switch_vmcb(svm, &svm->vmcb01); - init_vmcb(vcpu); - - svm_vcpu_init_msrpm(vcpu, svm->msrpm); - - svm_init_osvw(vcpu); - vcpu->arch.microcode_version = 0x01000065; - - if (sev_es_guest(vcpu->kvm)) - /* Perform SEV-ES specific VMCB creation updates */ - sev_es_create_vcpu(svm); - return 0; error_free_vmsa_page: @@ -1447,7 +1515,7 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) vmsave(__sme_page_pa(sd->save_area)); } - if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + if (tsc_scaling) { u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { __this_cpu_write(current_tsc_ratio, tsc_ratio); @@ -2657,6 +2725,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) struct vcpu_svm *svm = to_svm(vcpu); switch (msr_info->index) { + case MSR_AMD64_TSC_RATIO: + if (!msr_info->host_initiated && !svm->tsc_scaling_enabled) + return 1; + msr_info->data = svm->tsc_ratio_msr; + break; case MSR_STAR: msr_info->data = svm->vmcb01.ptr->save.star; break; @@ -2762,11 +2835,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { struct vcpu_svm *svm = to_svm(vcpu); - if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->ghcb)) + if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) return kvm_complete_insn_gp(vcpu, err); - ghcb_set_sw_exit_info_1(svm->ghcb, 1); - ghcb_set_sw_exit_info_2(svm->ghcb, + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, X86_TRAP_GP | SVM_EVTINJ_TYPE_EXEPT | SVM_EVTINJ_VALID); @@ -2806,6 +2879,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) u32 ecx = msr->index; u64 data = msr->data; switch (ecx) { + case MSR_AMD64_TSC_RATIO: + if (!msr->host_initiated && !svm->tsc_scaling_enabled) + return 1; + + if (data & TSC_RATIO_RSVD) + return 1; + + svm->tsc_ratio_msr = data; + + if (svm->tsc_scaling_enabled && is_guest_mode(vcpu)) + nested_svm_update_tsc_ratio_msr(vcpu); + + break; case MSR_IA32_CR_PAT: if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) return 1; @@ -2918,7 +3004,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->tsc_aux = data; break; case MSR_IA32_DEBUGCTLMSR: - if (!boot_cpu_has(X86_FEATURE_LBRV)) { + if (!lbrv) { vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", __func__, data); break; @@ -3035,11 +3121,6 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) type = svm->vmcb->control.exit_info_2; gva = svm->vmcb->control.exit_info_1; - if (type > 3) { - kvm_inject_gp(vcpu, 0); - return 1; - } - return kvm_handle_invpcid(vcpu, type, gva); } @@ -3278,11 +3359,13 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) return svm_exit_handlers[exit_code](vcpu); } -static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, +static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; + *reason = control->exit_code; *info1 = control->exit_info_1; *info2 = control->exit_info_2; *intr_info = control->exit_int_info; @@ -3299,7 +3382,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + trace_kvm_exit(vcpu, KVM_ISA_SVM); /* SEV-ES guests must use the CR write traps to track CR registers. */ if (!sev_es_guest(vcpu->kvm)) { @@ -3312,7 +3395,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) if (is_guest_mode(vcpu)) { int vmexit; - trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM); + trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM); vmexit = nested_svm_exit_special(svm); @@ -3780,8 +3863,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) pre_svm_run(vcpu); - WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); - sync_lapic_to_cr8(vcpu); if (unlikely(svm->asid != svm->vmcb->control.asid)) { @@ -4001,6 +4082,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); + svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); + svm_recalc_instruction_intercepts(vcpu, svm); /* For sev guests, the memory encryption bit is not reserved in CR3. */ @@ -4027,33 +4110,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_NESTED); } - - if (guest_cpuid_is_intel(vcpu)) { - /* - * We must intercept SYSENTER_EIP and SYSENTER_ESP - * accesses because the processor only stores 32 bits. - * For the same reason we cannot use virtual VMLOAD/VMSAVE. - */ - svm_set_intercept(svm, INTERCEPT_VMLOAD); - svm_set_intercept(svm, INTERCEPT_VMSAVE); - svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0); - } else { - /* - * If hardware supports Virtual VMLOAD VMSAVE then enable it - * in VMCB and clear intercepts to avoid #VMEXIT. - */ - if (vls) { - svm_clr_intercept(svm, INTERCEPT_VMLOAD); - svm_clr_intercept(svm, INTERCEPT_VMSAVE); - svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; - } - /* No need to intercept these MSRs */ - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); - set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); - } + init_vmcb_after_set_cpuid(vcpu); } static bool svm_has_wbinvd_exit(void) @@ -4520,6 +4577,8 @@ static int svm_vm_init(struct kvm *kvm) } static struct kvm_x86_ops svm_x86_ops __initdata = { + .name = "kvm_amd", + .hardware_unsetup = svm_hardware_teardown, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, @@ -4637,6 +4696,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_unreg_region = svm_unregister_enc_region, .vm_copy_enc_context_from = svm_vm_copy_asid_from, + .vm_move_enc_context_from = svm_vm_migrate_from, .can_emulate_instruction = svm_can_emulate_instruction, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 128a54b1fbf1..437e68504e66 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -80,6 +80,7 @@ struct kvm_sev_info { u64 ap_jump_table; /* SEV-ES AP Jump Table address */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ + atomic_t migration_in_progress; }; struct kvm_svm { @@ -123,6 +124,20 @@ struct svm_nested_state { bool initialized; }; +struct vcpu_sev_es_state { + /* SEV-ES support */ + struct vmcb_save_area *vmsa; + struct ghcb *ghcb; + struct kvm_host_map ghcb_map; + bool received_first_sipi; + + /* SEV-ES scratch area support */ + void *ghcb_sa; + u32 ghcb_sa_len; + bool ghcb_sa_sync; + bool ghcb_sa_free; +}; + struct vcpu_svm { struct kvm_vcpu vcpu; /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */ @@ -140,6 +155,8 @@ struct vcpu_svm { u64 next_rip; u64 spec_ctrl; + + u64 tsc_ratio_msr; /* * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be * translated into the appropriate L2_CFG bits on the host to @@ -160,7 +177,8 @@ struct vcpu_svm { unsigned long int3_rip; /* cached guest cpuid flags for faster access */ - bool nrips_enabled : 1; + bool nrips_enabled : 1; + bool tsc_scaling_enabled : 1; u32 ldr_reg; u32 dfr_reg; @@ -183,17 +201,7 @@ struct vcpu_svm { DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); } shadow_msr_intercept; - /* SEV-ES support */ - struct vmcb_save_area *vmsa; - struct ghcb *ghcb; - struct kvm_host_map ghcb_map; - bool received_first_sipi; - - /* SEV-ES scratch area support */ - void *ghcb_sa; - u64 ghcb_sa_len; - bool ghcb_sa_sync; - bool ghcb_sa_free; + struct vcpu_sev_es_state sev_es; bool guest_state_loaded; }; @@ -218,12 +226,12 @@ DECLARE_PER_CPU(struct svm_cpu_data *, svm_data); void recalc_intercepts(struct vcpu_svm *svm); -static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) +static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) { return container_of(kvm, struct kvm_svm, kvm); } -static inline bool sev_guest(struct kvm *kvm) +static __always_inline bool sev_guest(struct kvm *kvm) { #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -234,7 +242,7 @@ static inline bool sev_guest(struct kvm *kvm) #endif } -static inline bool sev_es_guest(struct kvm *kvm) +static __always_inline bool sev_es_guest(struct kvm *kvm) { #ifdef CONFIG_KVM_AMD_SEV struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -271,7 +279,7 @@ static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit) return !test_bit(bit, (unsigned long *)&vmcb->control.clean); } -static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) +static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_svm, vcpu); } @@ -483,6 +491,8 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu); int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); int nested_svm_exit_special(struct vcpu_svm *svm); +void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu); +void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier); void nested_load_control_from_vmcb12(struct vcpu_svm *svm, struct vmcb_control_area *control); void nested_sync_control_from_vmcb02(struct vcpu_svm *svm); @@ -553,6 +563,7 @@ int svm_register_enc_region(struct kvm *kvm, int svm_unregister_enc_region(struct kvm *kvm, struct kvm_enc_region *range); int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd); +int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd); void pre_sev_run(struct vcpu_svm *svm, int cpu); void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); @@ -562,7 +573,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu); int sev_handle_vmgexit(struct kvm_vcpu *vcpu); int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_init_vmcb(struct vcpu_svm *svm); -void sev_es_create_vcpu(struct vcpu_svm *svm); +void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu); void sev_es_unmap_ghcb(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 22e2b019de37..9430d6437c9f 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -56,12 +56,12 @@ static inline void invlpga(unsigned long addr, u32 asid) * VMSAVE, VMLOAD, etc... is still controlled by the effective address size, * hence 'unsigned long' instead of 'hpa_t'. */ -static inline void vmsave(unsigned long pa) +static __always_inline void vmsave(unsigned long pa) { svm_asm1(vmsave, "a" (pa), "memory"); } -static inline void vmload(unsigned long pa) +static __always_inline void vmload(unsigned long pa) { svm_asm1(vmload, "a" (pa), "memory"); } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 03ebe368333e..953b0fcb21ee 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -288,8 +288,8 @@ TRACE_EVENT(kvm_apic, #define TRACE_EVENT_KVM_EXIT(name) \ TRACE_EVENT(name, \ - TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \ - TP_ARGS(exit_reason, vcpu, isa), \ + TP_PROTO(struct kvm_vcpu *vcpu, u32 isa), \ + TP_ARGS(vcpu, isa), \ \ TP_STRUCT__entry( \ __field( unsigned int, exit_reason ) \ @@ -303,11 +303,12 @@ TRACE_EVENT(name, \ ), \ \ TP_fast_assign( \ - __entry->exit_reason = exit_reason; \ __entry->guest_rip = kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ - static_call(kvm_x86_get_exit_info)(vcpu, &__entry->info1, \ + static_call(kvm_x86_get_exit_info)(vcpu, \ + &__entry->exit_reason, \ + &__entry->info1, \ &__entry->info2, \ &__entry->intr_info, \ &__entry->error_code); \ diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 152ab0aa82cf..16731d2cf231 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -93,7 +93,7 @@ static __always_inline int get_evmcs_offset(unsigned long field, return evmcs_field->offset; } -static inline void evmcs_write64(unsigned long field, u64 value) +static __always_inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; int offset = get_evmcs_offset(field, &clean_field); @@ -183,7 +183,7 @@ static inline void evmcs_load(u64 phys_addr) __init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf); #else /* !IS_ENABLED(CONFIG_HYPERV) */ -static inline void evmcs_write64(unsigned long field, u64 value) {} +static __always_inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} static inline void evmcs_write16(unsigned long field, u16 value) {} static inline u64 evmcs_read64(unsigned long field) { return 0; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index eedcebf58004..b213ca966d41 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -191,7 +191,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) * failValid writes the error number to the current VMCS, which * can't be done if there isn't a current VMCS. */ - if (vmx->nested.current_vmptr == -1ull && + if (vmx->nested.current_vmptr == INVALID_GPA && !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) return nested_vmx_failInvalid(vcpu); @@ -218,7 +218,7 @@ static inline u64 vmx_control_msr(u32 low, u32 high) static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); vmx->nested.need_vmcs12_to_shadow_sync = false; } @@ -290,9 +290,10 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; + vmx->nested.vmxon_ptr = INVALID_GPA; free_vpid(vmx->nested.vpid02); vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmptr = INVALID_GPA; if (enable_shadow_vmcs) { vmx_disable_shadow_vmcs(vmx); vmcs_clear(vmx->vmcs01.shadow_vmcs); @@ -524,67 +525,19 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, } /* - * Check if MSR is intercepted for L01 MSR bitmap. + * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 + * itself utilizing x2APIC. All MSRs were previously set to be intercepted, + * only the "disable intercept" case needs to be handled. */ -static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) +static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0, + u32 msr, int type) { - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); + if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) + vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); - if (!cpu_has_vmx_msr_bitmap()) - return true; - - msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; -} - -/* - * If a msr is allowed by L0, we should check whether it is allowed by L1. - * The corresponding bit will be cleared unless both of L0 and L1 allow it. - */ -static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, - unsigned long *msr_bitmap_nested, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) - /* read-low */ - __clear_bit(msr, msr_bitmap_nested + 0x000 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) - /* write-low */ - __clear_bit(msr, msr_bitmap_nested + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) - /* read-high */ - __clear_bit(msr, msr_bitmap_nested + 0x400 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) - /* write-high */ - __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); - - } + if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) + vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); } static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) @@ -599,6 +552,34 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) } } +#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ +static inline \ +void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ + unsigned long *msr_bitmap_l1, \ + unsigned long *msr_bitmap_l0, u32 msr) \ +{ \ + if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ + vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ + vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ + else \ + vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ +} +BUILD_NVMX_MSR_INTERCEPT_HELPER(read) +BUILD_NVMX_MSR_INTERCEPT_HELPER(write) + +static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, + unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0, + u32 msr, int types) +{ + if (types & MSR_TYPE_R) + nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, + msr_bitmap_l0, msr); + if (types & MSR_TYPE_W) + nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, + msr_bitmap_l0, msr); +} + /* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. @@ -606,10 +587,11 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + struct vcpu_vmx *vmx = to_vmx(vcpu); int msr; unsigned long *msr_bitmap_l1; - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; - struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map; + unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; + struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || @@ -624,7 +606,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, /* * To keep the control flow simple, pay eight 8-byte writes (sixteen * 4-byte writes on 32-bit systems) up front to enable intercepts for - * the x2APIC MSR range and selectively disable them below. + * the x2APIC MSR range and selectively toggle those relevant to L2. */ enable_x2apic_msr_intercepts(msr_bitmap_l0); @@ -643,61 +625,44 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, } } - nested_vmx_disable_intercept_for_msr( + nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_R | MSR_TYPE_W); if (nested_cpu_has_vid(vmcs12)) { - nested_vmx_disable_intercept_for_msr( + nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - nested_vmx_disable_intercept_for_msr( + nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } - /* KVM unconditionally exposes the FS/GS base MSRs to L1. */ + /* + * Always check vmcs01's bitmap to honor userspace MSR filters and any + * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. + */ #ifdef CONFIG_X86_64 - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, - MSR_FS_BASE, MSR_TYPE_RW); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_FS_BASE, MSR_TYPE_RW); - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, - MSR_GS_BASE, MSR_TYPE_RW); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_GS_BASE, MSR_TYPE_RW); - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, - MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_KERNEL_GS_BASE, MSR_TYPE_RW); #endif + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); - /* - * Checking the L0->L1 bitmap is trying to verify two things: - * - * 1. L0 gave a permission to L1 to actually passthrough the MSR. This - * ensures that we do not accidentally generate an L02 MSR bitmap - * from the L12 MSR bitmap that is too permissive. - * 2. That L1 or L2s have actually used the MSR. This avoids - * unnecessarily merging of the bitmap if the MSR is unused. This - * works properly because we only update the L01 MSR bitmap lazily. - * So even if L0 should pass L1 these MSRs, the L01 bitmap is only - * updated to reflect this when L1 (or its L2s) actually write to - * the MSR. - */ - if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL)) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_SPEC_CTRL, - MSR_TYPE_R | MSR_TYPE_W); - - if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD)) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_PRED_CMD, - MSR_TYPE_W); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PRED_CMD, MSR_TYPE_W); - kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false); + kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); return true; } @@ -709,7 +674,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *shadow; if (!nested_cpu_has_shadow_vmcs(vmcs12) || - vmcs12->vmcs_link_pointer == -1ull) + vmcs12->vmcs_link_pointer == INVALID_GPA) return; shadow = get_shadow_vmcs12(vcpu); @@ -727,7 +692,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); if (!nested_cpu_has_shadow_vmcs(vmcs12) || - vmcs12->vmcs_link_pointer == -1ull) + vmcs12->vmcs_link_pointer == INVALID_GPA) return; kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, @@ -1994,7 +1959,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( } if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { - vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmptr = INVALID_GPA; nested_release_evmcs(vcpu); @@ -2178,7 +2143,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) } if (cpu_has_vmx_encls_vmexit()) - vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); + vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); /* * Set the MSR load/store lists to match L0's settings. Only the @@ -2197,7 +2162,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, { prepare_vmcs02_constant_state(vmx); - vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) @@ -2949,7 +2914,7 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, struct vmcs12 *shadow; struct kvm_host_map map; - if (vmcs12->vmcs_link_pointer == -1ull) + if (vmcs12->vmcs_link_pointer == INVALID_GPA) return 0; if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) @@ -3216,7 +3181,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to * force VM-Entry to fail. */ - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); } } @@ -3527,7 +3492,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && - vmx->nested.current_vmptr == -1ull)) + vmx->nested.current_vmptr == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); vmcs12 = get_vmcs12(vcpu); @@ -4975,7 +4940,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vmx->nested.current_vmptr == -1ull) + if (vmx->nested.current_vmptr == INVALID_GPA) return; copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); @@ -4995,7 +4960,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); - vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmptr = INVALID_GPA; } /* Emulate the VMXOFF instruction */ @@ -5090,12 +5055,12 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return 1; /* - * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMREAD sets the ALU flags for VMfailInvalid. */ - if (vmx->nested.current_vmptr == -1ull || + if (vmx->nested.current_vmptr == INVALID_GPA || (is_guest_mode(vcpu) && - get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); /* Decode instruction info and find the field to read */ @@ -5182,12 +5147,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; /* - * In VMX non-root operation, when the VMCS-link pointer is -1ull, + * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, * any VMWRITE sets the ALU flags for VMfailInvalid. */ - if (vmx->nested.current_vmptr == -1ull || + if (vmx->nested.current_vmptr == INVALID_GPA || (is_guest_mode(vcpu) && - get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) + get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) return nested_vmx_failInvalid(vcpu); if (instr_info & BIT(10)) @@ -5378,7 +5343,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) struct { u64 eptp, gpa; } operand; - int i, r; + int i, r, gpr_index; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || @@ -5391,7 +5356,8 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; @@ -5458,7 +5424,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) u64 gla; } operand; u16 vpid02; - int r; + int r, gpr_index; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -5471,7 +5437,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; @@ -5630,7 +5597,7 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, gpa_t bitmap, last_bitmap; u8 b; - last_bitmap = (gpa_t)-1; + last_bitmap = INVALID_GPA; b = -1; while (size > 0) { @@ -6065,7 +6032,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) goto reflect_vmexit; } - trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX); + trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); /* If L0 (KVM) wants the exit, it trumps L1's desires. */ if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) @@ -6106,8 +6073,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, .format = KVM_STATE_NESTED_FORMAT_VMX, .size = sizeof(kvm_state), .hdr.vmx.flags = 0, - .hdr.vmx.vmxon_pa = -1ull, - .hdr.vmx.vmcs12_pa = -1ull, + .hdr.vmx.vmxon_pa = INVALID_GPA, + .hdr.vmx.vmcs12_pa = INVALID_GPA, .hdr.vmx.preemption_timer_deadline = 0, }; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = @@ -6133,7 +6100,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) + vmcs12->vmcs_link_pointer != INVALID_GPA) kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); } @@ -6209,7 +6176,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, return -EFAULT; if (nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { + vmcs12->vmcs_link_pointer != INVALID_GPA) { if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, get_shadow_vmcs12(vcpu), VMCS12_SIZE)) return -EFAULT; @@ -6244,11 +6211,11 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) return -EINVAL; - if (kvm_state->hdr.vmx.vmxon_pa == -1ull) { + if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { if (kvm_state->hdr.vmx.smm.flags) return -EINVAL; - if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) + if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) return -EINVAL; /* @@ -6302,7 +6269,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmx_leave_nested(vcpu); - if (kvm_state->hdr.vmx.vmxon_pa == -1ull) + if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) return 0; vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; @@ -6315,13 +6282,13 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, /* See vmx_has_valid_vmcs12. */ if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || - (kvm_state->hdr.vmx.vmcs12_pa != -1ull)) + (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) return -EINVAL; else return 0; } - if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { + if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) return -EINVAL; @@ -6366,7 +6333,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, ret = -EINVAL; if (nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { + vmcs12->vmcs_link_pointer != INVALID_GPA) { struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); if (kvm_state->size < diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 10cc4f65c4ef..1b7456b2177b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -118,16 +118,15 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } } -/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); idx &= ~(3u << 30); - return (!fixed && idx >= pmu->nr_arch_gp_counters) || - (fixed && idx >= pmu->nr_arch_fixed_counters); + return fixed ? idx < pmu->nr_arch_fixed_counters + : idx < pmu->nr_arch_gp_counters; } static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, @@ -365,7 +364,7 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = pmu->global_ctrl; return 0; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - msr_info->data = pmu->global_ovf_ctrl; + msr_info->data = 0; return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || @@ -423,7 +422,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & pmu->global_ovf_ctrl_mask)) { if (!msr_info->host_initiated) pmu->global_status &= ~data; - pmu->global_ovf_ctrl = data; return 0; } break; @@ -588,8 +586,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) pmc->counter = 0; } - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = - pmu->global_ovf_ctrl = 0; + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; intel_pmu_release_guest_lbr_event(vcpu); } diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 6693ebdc0770..35e7ec91ae86 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -53,11 +53,9 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr, unsigned int size) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = addr; - vcpu->run->internal.data[1] = size; + uint64_t data[2] = { addr, size }; + + __kvm_prepare_emulation_failure_exit(vcpu, data, ARRAY_SIZE(data)); } static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data, @@ -112,9 +110,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr) * but the error code isn't (yet) plumbed through the ENCLS helpers. */ if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -155,9 +151,7 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu, sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0); sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1); if (!sgx_12_0 || !sgx_12_1) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 116b08904ac3..ba66c171d951 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -35,7 +35,7 @@ #include <asm/cpu_device_id.h> #include <asm/debugreg.h> #include <asm/desc.h> -#include <asm/fpu/internal.h> +#include <asm/fpu/api.h> #include <asm/idtentry.h> #include <asm/io.h> #include <asm/irq_remapping.h> @@ -769,24 +769,13 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) /* * Check if MSR is intercepted for currently loaded MSR bitmap. */ -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) +static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) { - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) + if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) return true; - msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; + return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, + MSR_IA32_SPEC_CTRL); } static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, @@ -1059,8 +1048,8 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { wrmsrl(MSR_IA32_RTIT_CTL, 0); - pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); - pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); + pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); + pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); } } @@ -1070,12 +1059,16 @@ static void pt_guest_exit(struct vcpu_vmx *vmx) return; if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { - pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); - pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); + pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); + pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); } - /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ - wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); + /* + * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, + * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. + */ + if (vmx->pt_desc.host.ctl) + wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); } void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, @@ -1456,16 +1449,16 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) * cause a #GP fault. */ value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; - if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) + if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; - if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) + if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; - if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) + if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) return 1; value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; - if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) + if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) return 1; return 0; @@ -1886,8 +1879,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; if (!vmx_pt_mode_is_host_guest() || - (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_num_address_ranges))) + (index >= 2 * vmx->pt_desc.num_address_ranges)) return 1; if (index % 2) msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; @@ -2202,8 +2194,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!pt_can_write_msr(vmx)) return 1; index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; - if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, - PT_CAP_num_address_ranges)) + if (index >= 2 * vmx->pt_desc.num_address_ranges) return 1; if (is_noncanonical_address(data, vcpu)) return 1; @@ -3695,46 +3686,6 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __clear_bit(msr, msr_bitmap + 0x000 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); -} - -static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __clear_bit(msr, msr_bitmap + 0x800 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); -} - -static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __set_bit(msr, msr_bitmap + 0x000 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); -} - -static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __set_bit(msr, msr_bitmap + 0x800 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); -} - void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3879,7 +3830,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); - for (i = 0; i < vmx->pt_desc.addr_range; i++) { + for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); } @@ -4328,10 +4279,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) #define VMX_XSS_EXIT_BITMAP 0 -/* - * Noting that the initialization of Guest-state Area of VMCS is in - * vmx_vcpu_reset(). - */ static void init_vmcs(struct vcpu_vmx *vmx) { if (nested) @@ -4340,7 +4287,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); - vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ + vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ /* Control */ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); @@ -4436,10 +4383,40 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx_setup_uret_msrs(vmx); } +static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + init_vmcs(vmx); + + if (nested) + memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); + + vcpu_setup_sgx_lepubkeyhash(vcpu); + + vmx->nested.posted_intr_nv = -1; + vmx->nested.vmxon_ptr = INVALID_GPA; + vmx->nested.current_vmptr = INVALID_GPA; + vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; + + vcpu->arch.microcode_version = 0x100000000ULL; + vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; + + /* + * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR + * or POSTED_INTR_WAKEUP_VECTOR. + */ + vmx->pi_desc.nv = POSTED_INTR_VECTOR; + vmx->pi_desc.sn = 1; +} + static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (!init_event) + __vmx_vcpu_reset(vcpu); + vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; @@ -4449,6 +4426,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_set_cr8(vcpu, 0); vmx_segment_cache_clear(vmx); + kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); seg_setup(VCPU_SREG_CS); vmcs_write16(GUEST_CS_SELECTOR, 0xf000); @@ -5379,10 +5357,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (vmx->emulation_required && !vmx->rmode.vm86_active && vcpu->arch.exception.pending) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = - KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5468,6 +5443,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) u64 pcid; u64 gla; } operand; + int gpr_index; if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5475,12 +5451,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) } vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); - - if (type > 3) { - kvm_inject_gp(vcpu, 0); - return 1; - } + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); /* According to the Intel instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all) @@ -5562,9 +5534,13 @@ static int handle_encls(struct kvm_vcpu *vcpu) static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) { - vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; - vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; - return 0; + /* + * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK + * VM-Exits. Unconditionally set the flag here and leave the handling to + * vmx_handle_exit(). + */ + to_vmx(vcpu)->exit_reason.bus_lock_detected = true; + return 1; } /* @@ -5629,11 +5605,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { static const int kvm_vmx_max_exit_handlers = ARRAY_SIZE(kvm_vmx_exit_handlers); -static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2, +static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, + u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); + *reason = vmx->exit_reason.full; *info1 = vmx_get_exit_qual(vcpu); if (!(vmx->exit_reason.failed_vmentry)) { *info2 = vmx->idt_vectoring_info; @@ -6051,9 +6029,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) int ret = __vmx_handle_exit(vcpu, exit_fastpath); /* - * Even when current exit reason is handled by KVM internally, we - * still need to exit to user space when bus lock detected to inform - * that there is a bus lock in guest. + * Exit to user space when bus lock detected to inform that there is + * a bus lock in guest. */ if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { if (ret > 0) @@ -6302,18 +6279,13 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) /* * If we are running L2 and L1 has a new pending interrupt - * which can be injected, we should re-evaluate - * what should be done with this new L1 interrupt. - * If L1 intercepts external-interrupts, we should - * exit from L2 to L1. Otherwise, interrupt should be - * delivered directly to L2. + * which can be injected, this may cause a vmexit or it may + * be injected into L2. Either way, this interrupt will be + * processed via KVM_REQ_EVENT, not RVI, because we do not use + * virtual interrupt delivery to inject L1 interrupts into L2. */ - if (is_guest_mode(vcpu) && max_irr_updated) { - if (nested_exit_on_intr(vcpu)) - kvm_vcpu_exiting_guest_mode(vcpu); - else - kvm_make_request(KVM_REQ_EVENT, vcpu); - } + if (is_guest_mode(vcpu) && max_irr_updated) + kvm_make_request(KVM_REQ_EVENT, vcpu); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); } @@ -6408,6 +6380,7 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: return nested; case MSR_AMD64_VIRT_SPEC_CTRL: + case MSR_AMD64_TSC_RATIO: /* This is AMD only. */ return false; default: @@ -6722,7 +6695,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. */ - if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); @@ -6784,7 +6757,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) if (likely(!vmx->exit_reason.failed_vmentry)) vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX); + trace_kvm_exit(vcpu, KVM_ISA_VMX); if (unlikely(vmx->exit_reason.failed_vmentry)) return EXIT_FASTPATH_NONE; @@ -6815,7 +6788,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) { struct vmx_uret_msr *tsx_ctrl; struct vcpu_vmx *vmx; - int i, cpu, err; + int i, err; BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); @@ -6836,10 +6809,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vpid; } - for (i = 0; i < kvm_nr_uret_msrs; ++i) { - vmx->guest_uret_msrs[i].data = 0; + for (i = 0; i < kvm_nr_uret_msrs; ++i) vmx->guest_uret_msrs[i].mask = -1ull; - } if (boot_cpu_has(X86_FEATURE_RTM)) { /* * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. @@ -6876,12 +6847,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) } vmx->loaded_vmcs = &vmx->vmcs01; - cpu = get_cpu(); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - init_vmcs(vmx); - vmx_vcpu_put(vcpu); - put_cpu(); + if (cpu_need_virtualize_apic_accesses(vcpu)) { err = alloc_apic_access_page(vcpu->kvm); if (err) @@ -6894,27 +6860,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) goto free_vmcs; } - if (nested) - memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); - else - memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); - - vcpu_setup_sgx_lepubkeyhash(vcpu); - - vmx->nested.posted_intr_nv = -1; - vmx->nested.current_vmptr = -1ull; - vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; - - vcpu->arch.microcode_version = 0x100000000ULL; - vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; - - /* - * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR - * or POSTED_INTR_WAKEUP_VECTOR. - */ - vmx->pi_desc.nv = POSTED_INTR_VECTOR; - vmx->pi_desc.sn = 1; - return 0; free_vmcs: @@ -7129,12 +7074,13 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) } /* Get the number of configurable Address Ranges for filtering */ - vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps, + vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_num_address_ranges); /* Initialize and clear the no dependency bits */ vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | - RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); + RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | + RTIT_CTL_BRANCH_EN); /* * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise @@ -7152,12 +7098,11 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); /* - * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and - * MTCFreq can be set + * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | - RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); + RTIT_CTL_MTC_RANGE); /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) @@ -7177,7 +7122,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; /* unmask address range configure area */ - for (i = 0; i < vmx->pt_desc.addr_range; i++) + for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } @@ -7553,6 +7498,8 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu) static void hardware_unsetup(void) { + kvm_set_posted_intr_wakeup_handler(NULL); + if (nested) nested_vmx_hardware_unsetup(); @@ -7562,12 +7509,15 @@ static void hardware_unsetup(void) static bool vmx_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_HYPERV); + BIT(APICV_INHIBIT_REASON_HYPERV) | + BIT(APICV_INHIBIT_REASON_BLOCKIRQ); return supported & BIT(bit); } static struct kvm_x86_ops vmx_x86_ops __initdata = { + .name = "kvm_intel", + .hardware_unsetup = hardware_unsetup, .hardware_enable = hardware_enable, @@ -7881,8 +7831,6 @@ static __init int hardware_setup(void) vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } - kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); - kvm_mce_cap_supported |= MCG_LMCE_P; if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) @@ -7906,6 +7854,9 @@ static __init int hardware_setup(void) r = alloc_kvm_area(); if (r) nested_vmx_hardware_unsetup(); + + kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); + return r; } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 592217fd7d92..a4ead6023133 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -62,7 +62,7 @@ struct pt_ctx { struct pt_desc { u64 ctl_bitmask; - u32 addr_range; + u32 num_address_ranges; u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES]; struct pt_ctx host; struct pt_ctx guest; @@ -400,6 +400,34 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); +/* + * Note, early Intel manuals have the write-low and read-high bitmap offsets + * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and + * 0xc0000000-0xc0001fff. The former (low) uses bytes 0-0x3ff for reads and + * 0x800-0xbff for writes. The latter (high) uses 0x400-0x7ff for reads and + * 0xc00-0xfff for writes. MSRs not covered by either of the ranges always + * VM-Exit. + */ +#define __BUILD_VMX_MSR_BITMAP_HELPER(rtype, action, bitop, access, base) \ +static inline rtype vmx_##action##_msr_bitmap_##access(unsigned long *bitmap, \ + u32 msr) \ +{ \ + int f = sizeof(unsigned long); \ + \ + if (msr <= 0x1fff) \ + return bitop##_bit(msr, bitmap + base / f); \ + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) \ + return bitop##_bit(msr & 0x1fff, bitmap + (base + 0x400) / f); \ + return (rtype)true; \ +} +#define BUILD_VMX_MSR_BITMAP_HELPERS(ret_type, action, bitop) \ + __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0x0) \ + __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 0x800) + +BUILD_VMX_MSR_BITMAP_HELPERS(bool, test, test) +BUILD_VMX_MSR_BITMAP_HELPERS(void, clear, __clear) +BUILD_VMX_MSR_BITMAP_HELPERS(void, set, __set) + static inline u8 vmx_get_rvi(void) { return vmcs_read16(GUEST_INTR_STATUS) & 0xff; @@ -522,4 +550,9 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) void dump_vmcs(struct kvm_vcpu *vcpu); +static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) +{ + return (vmx_instr_info >> 28) & 0xf; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aabd3a2ec1bc..dc7eb5fddfd3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -68,7 +68,9 @@ #include <asm/mce.h> #include <asm/pkru.h> #include <linux/kernel_stat.h> -#include <asm/fpu/internal.h> /* Ugh! */ +#include <asm/fpu/api.h> +#include <asm/fpu/xcr.h> +#include <asm/fpu/xstate.h> #include <asm/pvclock.h> #include <asm/div64.h> #include <asm/irq_remapping.h> @@ -293,8 +295,6 @@ u64 __read_mostly host_xcr0; u64 __read_mostly supported_xcr0; EXPORT_SYMBOL_GPL(supported_xcr0); -static struct kmem_cache *x86_fpu_cache; - static struct kmem_cache *x86_emulator_cache; /* @@ -790,30 +790,6 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) } EXPORT_SYMBOL_GPL(kvm_require_dr); -/* - * This function will be used to read from the physical memory of the currently - * running guest. The difference to kvm_vcpu_read_guest_page is that this function - * can read from guest physical or from the guest's guest physical memory. - */ -int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t ngfn, void *data, int offset, int len, - u32 access) -{ - struct x86_exception exception; - gfn_t real_gfn; - gpa_t ngpa; - - ngpa = gfn_to_gpa(ngfn); - real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception); - if (real_gfn == UNMAPPED_GVA) - return -EFAULT; - - real_gfn = gpa_to_gfn(real_gfn); - - return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); -} -EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); - static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); @@ -825,34 +801,38 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; - unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; + gpa_t real_gpa; int i; int ret; u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; - ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte), - PFERR_USER_MASK|PFERR_WRITE_MASK); - if (ret < 0) { - ret = 0; - goto out; - } + /* + * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated + * to an L1 GPA. + */ + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(pdpt_gfn), + PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); + if (real_gpa == UNMAPPED_GVA) + return 0; + + /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte, + cr3 & GENMASK(11, 5), sizeof(pdpte)); + if (ret < 0) + return 0; + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & pdptr_rsvd_bits(vcpu))) { - ret = 0; - goto out; + return 0; } } - ret = 1; memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); vcpu->arch.pdptrs_from_userspace = false; -out: - - return ret; + return 1; } EXPORT_SYMBOL_GPL(load_pdptrs); @@ -993,7 +973,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) /* * Do not allow the guest to set bits that we do not support * saving. However, xcr0 bit 0 is always set, even if the - * emulated CPU does not support XSAVE (see fx_init). + * emulated CPU does not support XSAVE (see kvm_vcpu_reset()). */ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; if (xcr0 & ~valid_bits) @@ -1042,9 +1022,28 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { - if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) || - (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) + /* + * If any role bit is changed, the MMU needs to be reset. + * + * If CR4.PCIDE is changed 1 -> 0, the guest TLB must be flushed. + * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB + * according to the SDM; however, stale prev_roots could be reused + * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we + * free them all. KVM_REQ_MMU_RELOAD is fit for the both cases; it + * is slow, but changing CR4.PCIDE is a rare case. + * + * If CR4.PGE is changed, the guest TLB must be flushed. + * + * Note: resetting MMU is a superset of KVM_REQ_MMU_RELOAD and + * KVM_REQ_MMU_RELOAD is a superset of KVM_REQ_TLB_FLUSH_GUEST, hence + * the usage of "else if". + */ + if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) kvm_mmu_reset_context(vcpu); + else if ((cr4 ^ old_cr4) & X86_CR4_PCIDE) + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + else if ((cr4 ^ old_cr4) & X86_CR4_PGE) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } EXPORT_SYMBOL_GPL(kvm_post_set_cr4); @@ -1092,6 +1091,18 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) int i; /* + * MOV CR3 and INVPCID are usually not intercepted when using TDP, but + * this is reachable when running EPT=1 and unrestricted_guest=0, and + * also via the emulator. KVM's TDP page tables are not in the scope of + * the invalidation, but the guest's TLB entries need to be flushed as + * the CPU may have cached entries in its TLB for the target PCID. + */ + if (unlikely(tdp_enabled)) { + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + return; + } + + /* * If neither the current CR3 nor any of the prev_roots use the given * PCID, then nothing needs to be done here because a resync will * happen anyway before switching to any other CR3. @@ -1101,6 +1112,14 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } + /* + * If PCID is disabled, there is no need to free prev_roots even if the + * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB + * with PCIDE=0. + */ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + return; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); @@ -1381,6 +1400,7 @@ static const u32 emulated_msrs_all[] = { MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, MSR_AMD64_VIRT_SPEC_CTRL, + MSR_AMD64_TSC_RATIO, MSR_IA32_POWER_CTL, MSR_IA32_UCODE_REV, @@ -2454,13 +2474,64 @@ static inline bool kvm_check_tsc_unstable(void) return check_tsc_unstable(); } +/* + * Infers attempts to synchronize the guest's tsc from host writes. Sets the + * offset for the vcpu and tracks the TSC matching generation that the vcpu + * participates in. + */ +static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, + u64 ns, bool matched) +{ + struct kvm *kvm = vcpu->kvm; + + lockdep_assert_held(&kvm->arch.tsc_write_lock); + + /* + * We also track th most recent recorded KHZ, write and time to + * allow the matching interval to be extended at each write. + */ + kvm->arch.last_tsc_nsec = ns; + kvm->arch.last_tsc_write = tsc; + kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; + kvm->arch.last_tsc_offset = offset; + + vcpu->arch.last_guest_tsc = tsc; + + kvm_vcpu_write_tsc_offset(vcpu, offset); + + if (!matched) { + /* + * We split periods of matched TSC writes into generations. + * For each generation, we track the original measured + * nanosecond time, offset, and write, so if TSCs are in + * sync, we can match exact offset, and if not, we can match + * exact software computation in compute_guest_tsc() + * + * These values are tracked in kvm->arch.cur_xxx variables. + */ + kvm->arch.cur_tsc_generation++; + kvm->arch.cur_tsc_nsec = ns; + kvm->arch.cur_tsc_write = tsc; + kvm->arch.cur_tsc_offset = offset; + kvm->arch.nr_vcpus_matched_tsc = 0; + } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) { + kvm->arch.nr_vcpus_matched_tsc++; + } + + /* Keep track of which generation this VCPU has synchronized to */ + vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; + vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; + vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + + kvm_track_tsc_matching(vcpu); +} + static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - bool matched; - bool already_matched; + bool matched = false; bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -2506,51 +2577,10 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_compute_l1_tsc_offset(vcpu, data); } matched = true; - already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); - } else { - /* - * We split periods of matched TSC writes into generations. - * For each generation, we track the original measured - * nanosecond time, offset, and write, so if TSCs are in - * sync, we can match exact offset, and if not, we can match - * exact software computation in compute_guest_tsc() - * - * These values are tracked in kvm->arch.cur_xxx variables. - */ - kvm->arch.cur_tsc_generation++; - kvm->arch.cur_tsc_nsec = ns; - kvm->arch.cur_tsc_write = data; - kvm->arch.cur_tsc_offset = offset; - matched = false; } - /* - * We also track th most recent recorded KHZ, write and time to - * allow the matching interval to be extended at each write. - */ - kvm->arch.last_tsc_nsec = ns; - kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - - vcpu->arch.last_guest_tsc = data; - - /* Keep track of which generation this VCPU has synchronized to */ - vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; - vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; - vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - - kvm_vcpu_write_tsc_offset(vcpu, offset); + __kvm_synchronize_tsc(vcpu, offset, data, ns, matched); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - - spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); - if (!matched) { - kvm->arch.nr_vcpus_matched_tsc = 0; - } else if (!already_matched) { - kvm->arch.nr_vcpus_matched_tsc++; - } - - kvm_track_tsc_matching(vcpu); - spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); } static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, @@ -2738,6 +2768,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) int vclock_mode; bool host_tsc_clocksource, vcpus_matched; + lockdep_assert_held(&kvm->arch.tsc_write_lock); vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == atomic_read(&kvm->online_vcpus)); @@ -2762,68 +2793,101 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } -void kvm_make_mclock_inprogress_request(struct kvm *kvm) +static void kvm_make_mclock_inprogress_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } -static void kvm_gen_update_masterclock(struct kvm *kvm) +static void __kvm_start_pvclock_update(struct kvm *kvm) { -#ifdef CONFIG_X86_64 - int i; - struct kvm_vcpu *vcpu; - struct kvm_arch *ka = &kvm->arch; - unsigned long flags; - - kvm_hv_invalidate_tsc_page(kvm); + raw_spin_lock_irq(&kvm->arch.tsc_write_lock); + write_seqcount_begin(&kvm->arch.pvclock_sc); +} +static void kvm_start_pvclock_update(struct kvm *kvm) +{ kvm_make_mclock_inprogress_request(kvm); /* no guest entries from this point */ - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - pvclock_update_vm_gtod_copy(kvm); - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + __kvm_start_pvclock_update(kvm); +} +static void kvm_end_pvclock_update(struct kvm *kvm) +{ + struct kvm_arch *ka = &kvm->arch; + struct kvm_vcpu *vcpu; + int i; + + write_seqcount_end(&ka->pvclock_sc); + raw_spin_unlock_irq(&ka->tsc_write_lock); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); -#endif } -u64 get_kvmclock_ns(struct kvm *kvm) +static void kvm_update_masterclock(struct kvm *kvm) +{ + kvm_hv_invalidate_tsc_page(kvm); + kvm_start_pvclock_update(kvm); + pvclock_update_vm_gtod_copy(kvm); + kvm_end_pvclock_update(kvm); +} + +/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */ +static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - if (!ka->use_master_clock) { - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); - return get_kvmclock_base_ns() + ka->kvmclock_offset; - } - - hv_clock.tsc_timestamp = ka->master_cycle_now; - hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); - if (__this_cpu_read(cpu_tsc_khz)) { + data->flags = 0; + if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) { +#ifdef CONFIG_X86_64 + struct timespec64 ts; + + if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) { + data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; + data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC; + } else +#endif + data->host_tsc = rdtsc(); + + data->flags |= KVM_CLOCK_TSC_STABLE; + hv_clock.tsc_timestamp = ka->master_cycle_now; + hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); - ret = __pvclock_read_cycles(&hv_clock, rdtsc()); - } else - ret = get_kvmclock_base_ns() + ka->kvmclock_offset; + data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); + } else { + data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; + } put_cpu(); +} - return ret; +static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) +{ + struct kvm_arch *ka = &kvm->arch; + unsigned seq; + + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + __get_kvmclock(kvm, data); + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); +} + +u64 get_kvmclock_ns(struct kvm *kvm) +{ + struct kvm_clock_data data; + + get_kvmclock(kvm, &data); + return data.clock; } static void kvm_setup_pvclock_page(struct kvm_vcpu *v, @@ -2888,6 +2952,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v, static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags, tgt_tsc_khz; + unsigned seq; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; @@ -2902,13 +2967,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - use_master_clock = ka->use_master_clock; - if (use_master_clock) { - host_tsc = ka->master_cycle_now; - kernel_ns = ka->master_kernel_ns; - } - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + use_master_clock = ka->use_master_clock; + if (use_master_clock) { + host_tsc = ka->master_cycle_now; + kernel_ns = ka->master_kernel_ns; + } + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -3179,15 +3245,14 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.tlb_flush; if (!tdp_enabled) { - /* + /* * A TLB flush on behalf of the guest is equivalent to * INVPCID(all), toggling CR4.PGE, etc., which requires - * a forced sync of the shadow page tables. Unload the - * entire MMU here and the subsequent load will sync the - * shadow page tables, and also flush the TLB. + * a forced sync of the shadow page tables. Ensure all the + * roots are synced and the guest TLB in hardware is clean. */ - kvm_mmu_unload(vcpu); - return; + kvm_mmu_sync_roots(vcpu); + kvm_mmu_sync_prev_roots(vcpu); } static_call(kvm_x86_tlb_flush_guest)(vcpu); @@ -3195,8 +3260,11 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) static void record_steal_time(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + u64 steal; + u32 version; if (kvm_xen_msr_enabled(vcpu->kvm)) { kvm_xen_runstate_set_running(vcpu); @@ -3206,47 +3274,86 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - /* -EAGAIN is returned in atomic context so we can just return. */ - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, - &map, &vcpu->arch.st.cache, false)) + if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + slots = kvm_memslots(vcpu->kvm); + if (unlikely(slots->generation != ghc->generation || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { + gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; + + /* We rely on the fact that it fits in a single page. */ + BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) || + kvm_is_error_hva(ghc->hva) || !ghc->memslot) + return; + } + + st = (struct kvm_steal_time __user *)ghc->hva; /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { - u8 st_preempted = xchg(&st->preempted, 0); + u8 st_preempted = 0; + int err = -EFAULT; + + if (!user_access_begin(st, sizeof(*st))) + return; + + asm volatile("1: xchgb %0, %2\n" + "xor %1, %1\n" + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + : "+r" (st_preempted), + "+&r" (err) + : "m" (st->preempted)); + if (err) + goto out; + + user_access_end(); + + vcpu->arch.st.preempted = 0; trace_kvm_pv_tlb_flush(vcpu->vcpu_id, st_preempted & KVM_VCPU_FLUSH_TLB); if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); + + if (!user_access_begin(st, sizeof(*st))) + goto dirty; } else { - st->preempted = 0; - } + if (!user_access_begin(st, sizeof(*st))) + return; - vcpu->arch.st.preempted = 0; + unsafe_put_user(0, &st->preempted, out); + vcpu->arch.st.preempted = 0; + } - if (st->version & 1) - st->version += 1; /* first time write, random junk */ + unsafe_get_user(version, &st->version, out); + if (version & 1) + version += 1; /* first time write, random junk */ - st->version += 1; + version += 1; + unsafe_put_user(version, &st->version, out); smp_wmb(); - st->steal += current->sched_info.run_delay - + unsafe_get_user(steal, &st->steal, out); + steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; + unsafe_put_user(steal, &st->steal, out); - smp_wmb(); - - st->version += 1; + version += 1; + unsafe_put_user(version, &st->version, out); - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); + out: + user_access_end(); + dirty: + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -3452,7 +3559,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) return 1; - if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) + if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; @@ -4028,6 +4135,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: case KVM_CAP_SREGS2: case KVM_CAP_EXIT_ON_EMULATION_FAILURE: + case KVM_CAP_VCPU_ATTRIBUTES: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: @@ -4048,7 +4156,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_SYNC_X86_VALID_FIELDS; break; case KVM_CAP_ADJUST_CLOCK: - r = KVM_CLOCK_TSC_STABLE; + r = KVM_CLOCK_VALID_FLAGS; break; case KVM_CAP_X86_DISABLE_EXITS: r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | @@ -4071,13 +4179,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); break; case KVM_CAP_NR_VCPUS: - r = KVM_SOFT_MAX_VCPUS; + r = num_online_cpus(); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + r = KVM_MAX_VCPU_IDS; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; @@ -4285,8 +4393,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + static const u8 preempted = KVM_VCPU_PREEMPTED; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -4294,16 +4404,23 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (vcpu->arch.st.preempted) return; - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, - &vcpu->arch.st.cache, true)) + /* This happens on process exit */ + if (unlikely(current->mm != vcpu->kvm->mm)) + return; + + slots = kvm_memslots(vcpu->kvm); + + if (unlikely(slots->generation != ghc->generation || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + st = (struct kvm_steal_time __user *)ghc->hva; + BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted)); - st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted))) + vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -4700,144 +4817,27 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } -#define XSTATE_COMPACTION_ENABLED (1ULL << 63) - -static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) -{ - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = xsave->header.xfeatures; - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(dest, xsave, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV */ - xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; - *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; - - /* - * Copy each region from the possibly compacted offset to the - * non-compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u32 size, offset, ecx, edx; - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - void *src; - - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - - if (xfeature_nr == XFEATURE_PKRU) { - memcpy(dest + offset, &vcpu->arch.pkru, - sizeof(vcpu->arch.pkru)); - } else { - src = get_xsave_addr(xsave, xfeature_nr); - if (src) - memcpy(dest + offset, src, size); - } - - valid -= xfeature_mask; - } -} - -static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) -{ - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(xsave, src, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV and possibly XCOMP_BV. */ - xsave->header.xfeatures = xstate_bv; - if (boot_cpu_has(X86_FEATURE_XSAVES)) - xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Copy each region from the non-compacted offset to the - * possibly compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u32 size, offset, ecx, edx; - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - - if (xfeature_nr == XFEATURE_PKRU) { - memcpy(&vcpu->arch.pkru, src + offset, - sizeof(vcpu->arch.pkru)); - } else { - void *dest = get_xsave_addr(xsave, xfeature_nr); - - if (dest) - memcpy(dest, src + offset, size); - } - - valid -= xfeature_mask; - } -} - static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return; - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - memset(guest_xsave, 0, sizeof(struct kvm_xsave)); - fill_xsave((u8 *) guest_xsave->region, vcpu); - } else { - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu->state.fxsave, - sizeof(struct fxregs_state)); - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XFEATURE_MASK_FPSSE; - } + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + guest_xsave->region, + sizeof(guest_xsave->region), + vcpu->arch.pkru); } -#define XSAVE_MXCSR_OFFSET 24 - static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - u64 xstate_bv; - u32 mxcsr; - - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; - xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; - - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - /* - * Here we allow setting states that are not present in - * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility - * with old userspace. - */ - if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - load_xsave(vcpu, (u8 *)guest_xsave->region); - } else { - if (xstate_bv & ~XFEATURE_MASK_FPSSE || - mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - memcpy(&vcpu->arch.guest_fpu->state.fxsave, - guest_xsave->region, sizeof(struct fxregs_state)); - } - return 0; + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, + guest_xsave->region, + supported_xcr0, &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -4892,6 +4892,115 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) return 0; } +static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int r; + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: + r = 0; + break; + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr; + int r; + + if ((u64)(unsigned long)uaddr != attr->addr) + return -EFAULT; + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: + r = -EFAULT; + if (put_user(vcpu->arch.l1_tsc_offset, uaddr)) + break; + r = 0; + break; + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr; + struct kvm *kvm = vcpu->kvm; + int r; + + if ((u64)(unsigned long)uaddr != attr->addr) + return -EFAULT; + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: { + u64 offset, tsc, ns; + unsigned long flags; + bool matched; + + r = -EFAULT; + if (get_user(offset, uaddr)) + break; + + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + + matched = (vcpu->arch.virtual_tsc_khz && + kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz && + kvm->arch.last_tsc_offset == offset); + + tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; + ns = get_kvmclock_base_ns(); + + __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + r = 0; + break; + } + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu, + unsigned int ioctl, + void __user *argp) +{ + struct kvm_device_attr attr; + int r; + + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + if (attr.group != KVM_VCPU_TSC_CTRL) + return -ENXIO; + + switch (ioctl) { + case KVM_HAS_DEVICE_ATTR: + r = kvm_arch_tsc_has_attr(vcpu, &attr); + break; + case KVM_GET_DEVICE_ATTR: + r = kvm_arch_tsc_get_attr(vcpu, &attr); + break; + case KVM_SET_DEVICE_ATTR: + r = kvm_arch_tsc_set_attr(vcpu, &attr); + break; + } + + return r; +} + static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, struct kvm_enable_cap *cap) { @@ -5346,6 +5455,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = __set_sregs2(vcpu, u.sregs2); break; } + case KVM_HAS_DEVICE_ATTR: + case KVM_GET_DEVICE_ATTR: + case KVM_SET_DEVICE_ATTR: + r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); + break; default: r = -EINVAL; } @@ -5665,6 +5779,12 @@ split_irqchip_unlock: if (kvm_x86_ops.vm_copy_enc_context_from) r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]); return r; + case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: + r = -EINVAL; + if (kvm_x86_ops.vm_move_enc_context_from) + r = kvm_x86_ops.vm_move_enc_context_from( + kvm, cap->args[0]); + return r; case KVM_CAP_EXIT_HYPERCALL: if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { r = -EINVAL; @@ -5829,6 +5949,63 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) } #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ +static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp) +{ + struct kvm_clock_data data = { 0 }; + + get_kvmclock(kvm, &data); + if (copy_to_user(argp, &data, sizeof(data))) + return -EFAULT; + + return 0; +} + +static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) +{ + struct kvm_arch *ka = &kvm->arch; + struct kvm_clock_data data; + u64 now_raw_ns; + + if (copy_from_user(&data, argp, sizeof(data))) + return -EFAULT; + + /* + * Only KVM_CLOCK_REALTIME is used, but allow passing the + * result of KVM_GET_CLOCK back to KVM_SET_CLOCK. + */ + if (data.flags & ~KVM_CLOCK_VALID_FLAGS) + return -EINVAL; + + kvm_hv_invalidate_tsc_page(kvm); + kvm_start_pvclock_update(kvm); + pvclock_update_vm_gtod_copy(kvm); + + /* + * This pairs with kvm_guest_time_update(): when masterclock is + * in use, we use master_kernel_ns + kvmclock_offset to set + * unsigned 'system_time' so if we use get_kvmclock_ns() (which + * is slightly ahead) here we risk going negative on unsigned + * 'system_time' when 'data.clock' is very small. + */ + if (data.flags & KVM_CLOCK_REALTIME) { + u64 now_real_ns = ktime_get_real_ns(); + + /* + * Avoid stepping the kvmclock backwards. + */ + if (now_real_ns > data.realtime) + data.clock += now_real_ns - data.realtime; + } + + if (ka->use_master_clock) + now_raw_ns = ka->master_kernel_ns; + else + now_raw_ns = get_kvmclock_base_ns(); + ka->kvmclock_offset = data.clock - now_raw_ns; + kvm_end_pvclock_update(kvm); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -6072,60 +6249,12 @@ set_pit2_out: break; } #endif - case KVM_SET_CLOCK: { - struct kvm_arch *ka = &kvm->arch; - struct kvm_clock_data user_ns; - u64 now_ns; - - r = -EFAULT; - if (copy_from_user(&user_ns, argp, sizeof(user_ns))) - goto out; - - r = -EINVAL; - if (user_ns.flags) - goto out; - - r = 0; - /* - * TODO: userspace has to take care of races with VCPU_RUN, so - * kvm_gen_update_masterclock() can be cut down to locked - * pvclock_update_vm_gtod_copy(). - */ - kvm_gen_update_masterclock(kvm); - - /* - * This pairs with kvm_guest_time_update(): when masterclock is - * in use, we use master_kernel_ns + kvmclock_offset to set - * unsigned 'system_time' so if we use get_kvmclock_ns() (which - * is slightly ahead) here we risk going negative on unsigned - * 'system_time' when 'user_ns.clock' is very small. - */ - spin_lock_irq(&ka->pvclock_gtod_sync_lock); - if (kvm->arch.use_master_clock) - now_ns = ka->master_kernel_ns; - else - now_ns = get_kvmclock_base_ns(); - ka->kvmclock_offset = user_ns.clock - now_ns; - spin_unlock_irq(&ka->pvclock_gtod_sync_lock); - - kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); + case KVM_SET_CLOCK: + r = kvm_vm_ioctl_set_clock(kvm, argp); break; - } - case KVM_GET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; - - now_ns = get_kvmclock_ns(kvm); - user_ns.clock = now_ns; - user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0; - memset(&user_ns.pad, 0, sizeof(user_ns.pad)); - - r = -EFAULT; - if (copy_to_user(argp, &user_ns, sizeof(user_ns))) - goto out; - r = 0; + case KVM_GET_CLOCK: + r = kvm_vm_ioctl_get_clock(kvm, argp); break; - } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; if (kvm_x86_ops.mem_enc_op) @@ -6906,7 +7035,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) } static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, + unsigned short port, unsigned int count, bool in) { vcpu->arch.pio.port = port; @@ -6914,10 +7043,8 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - vcpu->arch.pio.count = 0; + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) return 1; - } vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -6929,26 +7056,39 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, return 0; } -static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) +static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, unsigned int count) { - int ret; + WARN_ON(vcpu->arch.pio.count); + memset(vcpu->arch.pio_data, 0, size * count); + return emulator_pio_in_out(vcpu, size, port, count, true); +} - if (vcpu->arch.pio.count) - goto data_avail; +static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) +{ + int size = vcpu->arch.pio.size; + unsigned count = vcpu->arch.pio.count; + memcpy(val, vcpu->arch.pio_data, size * count); + trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data); + vcpu->arch.pio.count = 0; +} - memset(vcpu->arch.pio_data, 0, size * count); +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) +{ + if (vcpu->arch.pio.count) { + /* Complete previous iteration. */ + } else { + int r = __emulator_pio_in(vcpu, size, port, count); + if (!r) + return r; - ret = emulator_pio_in_out(vcpu, size, port, val, count, true); - if (ret) { -data_avail: - memcpy(val, vcpu->arch.pio_data, size * count); - trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); - vcpu->arch.pio.count = 0; - return 1; + /* Results already available, fall through. */ } - return 0; + WARN_ON(count != vcpu->arch.pio.count); + complete_emulator_pio_in(vcpu, val); + return 1; } static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, @@ -6963,9 +7103,15 @@ static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port, const void *val, unsigned int count) { + int ret; + memcpy(vcpu->arch.pio_data, val, size * count); trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); - return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); + ret = emulator_pio_in_out(vcpu, size, port, count, false); + if (ret) + vcpu->arch.pio.count = 0; + + return ret; } static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, @@ -7239,7 +7385,9 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc); + if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc)) + return 0; + return -EINVAL; } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -7475,29 +7623,78 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) } EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); -static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) +static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, + u8 ndata, u8 *insn_bytes, u8 insn_size) { - struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; - u32 insn_size = ctxt->fetch.end - ctxt->fetch.data; struct kvm_run *run = vcpu->run; + u64 info[5]; + u8 info_start; + + /* + * Zero the whole array used to retrieve the exit info, as casting to + * u32 for select entries will leave some chunks uninitialized. + */ + memset(&info, 0, sizeof(info)); + + static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1], + &info[2], (u32 *)&info[3], + (u32 *)&info[4]); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION; - run->emulation_failure.ndata = 0; + + /* + * There's currently space for 13 entries, but 5 are used for the exit + * reason and info. Restrict to 4 to reduce the maintenance burden + * when expanding kvm_run.emulation_failure in the future. + */ + if (WARN_ON_ONCE(ndata > 4)) + ndata = 4; + + /* Always include the flags as a 'data' entry. */ + info_start = 1; run->emulation_failure.flags = 0; if (insn_size) { - run->emulation_failure.ndata = 3; + BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) + + sizeof(run->emulation_failure.insn_bytes) != 16)); + info_start += 2; run->emulation_failure.flags |= KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES; run->emulation_failure.insn_size = insn_size; memset(run->emulation_failure.insn_bytes, 0x90, sizeof(run->emulation_failure.insn_bytes)); - memcpy(run->emulation_failure.insn_bytes, - ctxt->fetch.data, insn_size); + memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size); } + + memcpy(&run->internal.data[info_start], info, sizeof(info)); + memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data, + ndata * sizeof(data[0])); + + run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata; +} + +static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + + prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data, + ctxt->fetch.end - ctxt->fetch.data); } +void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, + u8 ndata) +{ + prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0); +} +EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit); + +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) +{ + __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0); +} +EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit); + static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { struct kvm *kvm = vcpu->kvm; @@ -7512,16 +7709,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) if (kvm->arch.exit_on_emulation_error || (emulation_type & EMULTYPE_SKIP)) { - prepare_emulation_failure_exit(vcpu); + prepare_emulation_ctxt_failure_exit(vcpu); return 0; } kvm_queue_exception(vcpu, UD_VECTOR); if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + prepare_emulation_ctxt_failure_exit(vcpu); return 0; } @@ -8121,14 +8316,13 @@ static void tsc_khz_changed(void *data) static void kvm_hyperv_tsc_notifier(void) { struct kvm *kvm; - struct kvm_vcpu *vcpu; int cpu; - unsigned long flags; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ hyperv_stop_tsc_emulation(); /* TSC frequency always matches when on Hyper-V */ @@ -8137,18 +8331,11 @@ static void kvm_hyperv_tsc_notifier(void) kvm_max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { - struct kvm_arch *ka = &kvm->arch; - - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + __kvm_start_pvclock_update(kvm); pvclock_update_vm_gtod_copy(kvm); - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); - - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); + kvm_end_pvclock_update(kvm); } + mutex_unlock(&kvm_lock); } #endif @@ -8389,18 +8576,20 @@ int kvm_arch_init(void *opaque) int r; if (kvm_x86_ops.hardware_enable) { - printk(KERN_ERR "kvm: already loaded the other module\n"); + pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); r = -EEXIST; goto out; } if (!ops->cpu_has_kvm_support()) { - pr_err_ratelimited("kvm: no hardware support\n"); + pr_err_ratelimited("kvm: no hardware support for '%s'\n", + ops->runtime_ops->name); r = -EOPNOTSUPP; goto out; } if (ops->disabled_by_bios()) { - pr_err_ratelimited("kvm: disabled by bios\n"); + pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", + ops->runtime_ops->name); r = -EOPNOTSUPP; goto out; } @@ -8417,18 +8606,11 @@ int kvm_arch_init(void *opaque) } r = -ENOMEM; - x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), - __alignof__(struct fpu), SLAB_ACCOUNT, - NULL); - if (!x86_fpu_cache) { - printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n"); - goto out; - } x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { pr_err("kvm: failed to allocate cache for x86 emulator\n"); - goto out_free_x86_fpu_cache; + goto out; } user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); @@ -8466,8 +8648,6 @@ out_free_percpu: free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); -out_free_x86_fpu_cache: - kmem_cache_destroy(x86_fpu_cache); out: return r; } @@ -8494,7 +8674,6 @@ void kvm_arch_exit(void) kvm_mmu_module_exit(); free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); - kmem_cache_destroy(x86_fpu_cache); #ifdef CONFIG_KVM_XEN static_key_deferred_flush(&kvm_xen_enabled); WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); @@ -8595,7 +8774,7 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated); static void kvm_apicv_init(struct kvm *kvm) { - mutex_init(&kvm->arch.apicv_update_lock); + init_rwsem(&kvm->arch.apicv_update_lock); if (enable_apicv) clear_bit(APICV_INHIBIT_REASON_DISABLE, @@ -8783,9 +8962,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); + + /* + * The call to kvm_ready_for_interrupt_injection() may end up in + * kvm_xen_has_interrupt() which may require the srcu lock to be + * held, to protect against changes in the vcpu_info address. + */ + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_run->ready_for_interrupt_injection = pic_in_kernel(vcpu->kvm) || kvm_vcpu_ready_for_interrupt_injection(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (is_smm(vcpu)) kvm_run->flags |= KVM_RUN_X86_SMM; @@ -9242,14 +9429,7 @@ static void process_smi(struct kvm_vcpu *vcpu) void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap) { - cpumask_var_t cpus; - - zalloc_cpumask_var(&cpus, GFP_ATOMIC); - - kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, - NULL, vcpu_bitmap, cpus); - - free_cpumask_var(cpus); + kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap); } void kvm_make_scan_ioapic_request(struct kvm *kvm) @@ -9264,7 +9444,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + down_read(&vcpu->kvm->arch.apicv_update_lock); activate = kvm_apicv_activated(vcpu->kvm); if (vcpu->arch.apicv_active == activate) @@ -9284,7 +9464,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); out: - mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); + up_read(&vcpu->kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); @@ -9292,6 +9472,8 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { unsigned long old, new; + lockdep_assert_held_write(&kvm->arch.apicv_update_lock); + if (!kvm_x86_ops.check_apicv_inhibit_reasons || !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) return; @@ -9305,6 +9487,18 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) if (!!old != !!new) { trace_kvm_apicv_update_request(activate, bit); + /* + * Kick all vCPUs before setting apicv_inhibit_reasons to avoid + * false positives in the sanity check WARN in svm_vcpu_run(). + * This task will wait for all vCPUs to ack the kick IRQ before + * updating apicv_inhibit_reasons, and all other vCPUs will + * block on acquiring apicv_update_lock so that vCPUs can't + * redo svm_vcpu_run() without seeing the new inhibit state. + * + * Note, holding apicv_update_lock and taking it in the read + * side (handling the request) also prevents other vCPUs from + * servicing the request with a stale apicv_inhibit_reasons. + */ kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); kvm->arch.apicv_inhibit_reasons = new; if (new) { @@ -9318,9 +9512,9 @@ EXPORT_SYMBOL_GPL(__kvm_request_apicv_update); void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { - mutex_lock(&kvm->arch.apicv_update_lock); + down_write(&kvm->arch.apicv_update_lock); __kvm_request_apicv_update(kvm, activate, bit); - mutex_unlock(&kvm->arch.apicv_update_lock); + up_write(&kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); @@ -9417,7 +9611,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) { + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { r = -EIO; goto out; } @@ -9432,7 +9626,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) - kvm_gen_update_masterclock(vcpu->kvm); + kvm_update_masterclock(vcpu->kvm); if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) kvm_gen_kvmclock_update(vcpu); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { @@ -9639,18 +9833,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } for (;;) { + /* + * Assert that vCPU vs. VM APICv state is consistent. An APICv + * update must kick and wait for all vCPUs before toggling the + * per-VM state, and responsing vCPUs must wait for the update + * to complete before servicing KVM_REQ_APICV_UPDATE. + */ + WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); + exit_fastpath = static_call(kvm_x86_run)(vcpu); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; - if (unlikely(kvm_vcpu_exit_request(vcpu))) { + if (vcpu->arch.apicv_active) + static_call(kvm_x86_sync_pir_to_irr)(vcpu); + + if (unlikely(kvm_vcpu_exit_request(vcpu))) { exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; break; } - - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); - } + } /* * Do this here before restoring debug registers on the host. And @@ -9913,58 +10115,21 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } -static void kvm_save_current_fpu(struct fpu *fpu) -{ - /* - * If the target FPU state is not resident in the CPU registers, just - * memcpy() from current, else save CPU state directly to the target. - */ - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&fpu->state, ¤t->thread.fpu.state, - fpu_kernel_xstate_size); - else - save_fpregs_to_fpstate(fpu); -} - /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - kvm_save_current_fpu(vcpu->arch.user_fpu); - /* - * Guests with protected state can't have it set by the hypervisor, - * so skip trying to set it. + * Exclude PKRU from restore as restored separately in + * kvm_x86_ops.run(). */ - if (vcpu->arch.guest_fpu) - /* PKRU is separately restored in kvm_x86_ops.run. */ - __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state, - ~XFEATURE_MASK_PKRU); - - fpregs_mark_activate(); - fpregs_unlock(); - + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - /* - * Guests with protected state can't have it read by the hypervisor, - * so skip trying to save it. - */ - if (vcpu->arch.guest_fpu) - kvm_save_current_fpu(vcpu->arch.guest_fpu); - - restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state); - - fpregs_mark_activate(); - fpregs_unlock(); - + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } @@ -10458,6 +10623,24 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return ret; } +static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) +{ + bool inhibit = false; + struct kvm_vcpu *vcpu; + int i; + + down_write(&kvm->arch.apicv_update_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) { + inhibit = true; + break; + } + } + __kvm_request_apicv_update(kvm, !inhibit, APICV_INHIBIT_REASON_BLOCKIRQ); + up_write(&kvm->arch.apicv_update_lock); +} + int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { @@ -10510,6 +10693,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, static_call(kvm_x86_update_exception_bitmap)(vcpu); + kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm); + r = 0; out: @@ -10545,12 +10730,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -10568,12 +10753,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; - if (!vcpu->arch.guest_fpu) + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return 0; vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -10624,33 +10809,6 @@ static int sync_regs(struct kvm_vcpu *vcpu) return 0; } -static void fx_init(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.guest_fpu) - return; - - fpstate_init(&vcpu->arch.guest_fpu->state); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = - host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Ensure guest xcr0 is valid for loading - */ - vcpu->arch.xcr0 = XFEATURE_MASK_FP; - - vcpu->arch.cr0 |= X86_CR0_ET; -} - -void kvm_free_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.guest_fpu) { - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - vcpu->arch.guest_fpu = NULL; - } -} -EXPORT_SYMBOL_GPL(kvm_free_guest_fpu); - int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) @@ -10707,20 +10865,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (!alloc_emulate_ctxt(vcpu)) goto free_wbinvd_dirty_mask; - vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.user_fpu) { - pr_err("kvm: failed to allocate userspace's fpu\n"); - goto free_emulate_ctxt; - } - - vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.guest_fpu) { + if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) { pr_err("kvm: failed to allocate vcpu's fpu\n"); - goto free_user_fpu; + goto free_emulate_ctxt; } - fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); @@ -10752,9 +10900,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return 0; free_guest_fpu: - kvm_free_guest_fpu(vcpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); free_emulate_ctxt: kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_wbinvd_dirty_mask: @@ -10792,19 +10938,15 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; int idx; - kvm_release_pfn(cache->pfn, cache->dirty, cache); - kvmclock_reset(vcpu); static_call(kvm_x86_vcpu_free)(vcpu); kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kvm_free_guest_fpu(vcpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); @@ -10821,9 +10963,19 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + struct kvm_cpuid_entry2 *cpuid_0x1; unsigned long old_cr0 = kvm_read_cr0(vcpu); unsigned long new_cr0; - u32 eax, dummy; + + /* + * Several of the "set" flows, e.g. ->set_cr0(), read other registers + * to handle side effects. RESET emulation hits those flows and relies + * on emulated/virtualized registers, including those that are loaded + * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel + * to detect improper or missing initialization. + */ + WARN_ON_ONCE(!init_event && + (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu))); kvm_lapic_reset(vcpu, init_event); @@ -10856,8 +11008,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { - void *mpx_state_buffer; + if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; /* * To avoid have the INIT path from kvm_apic_has_events() that be @@ -10865,14 +11017,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (init_event) kvm_put_guest_fpu(vcpu); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDREGS); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDCSR); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); + + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); + if (init_event) kvm_load_guest_fpu(vcpu); } @@ -10886,21 +11034,19 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.xcr0 = XFEATURE_MASK_FP; } + /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); - vcpu->arch.regs_avail = ~0; - vcpu->arch.regs_dirty = ~0; + kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP); /* * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) * if no CPUID match is found. Note, it's impossible to get a match at * RESET since KVM emulates RESET before exposing the vCPU to userspace, - * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET. - * But, go through the motions in case that's ever remedied. + * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry + * on RESET. But, go through the motions in case that's ever remedied. */ - eax = 1; - if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) - eax = 0x600; - kvm_rdx_write(vcpu, eax); + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0); + kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); vcpu->arch.ia32_xss = 0; @@ -11152,13 +11298,14 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_free_vm(struct kvm *kvm) { kfree(to_kvm_hv(kvm)->hv_pa_pg); - vfree(kvm); + __kvm_arch_free_vm(kvm); } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; + unsigned long flags; if (type) return -EINVAL; @@ -11182,10 +11329,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); - spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - + seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); + + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); pvclock_update_vm_gtod_copy(kvm); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); kvm->arch.guest_can_read_msr_platform_info = true; @@ -11382,8 +11531,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) kvm_page_track_free_memslot(slot); } -static int memslot_rmap_alloc(struct kvm_memory_slot *slot, - unsigned long npages) +int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages) { const int sz = sizeof(*slot->arch.rmap[0]); int i; @@ -11392,7 +11540,8 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, int level = i + 1; int lpages = __kvm_mmu_slot_lpages(slot, npages, level); - WARN_ON(slot->arch.rmap[i]); + if (slot->arch.rmap[i]) + continue; slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); if (!slot->arch.rmap[i]) { @@ -11404,50 +11553,6 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, return 0; } -int alloc_all_memslots_rmaps(struct kvm *kvm) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - int r, i; - - /* - * Check if memslots alreday have rmaps early before acquiring - * the slots_arch_lock below. - */ - if (kvm_memslots_have_rmaps(kvm)) - return 0; - - mutex_lock(&kvm->slots_arch_lock); - - /* - * Read memslots_have_rmaps again, under the slots arch lock, - * before allocating the rmaps - */ - if (kvm_memslots_have_rmaps(kvm)) { - mutex_unlock(&kvm->slots_arch_lock); - return 0; - } - - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(slot, slots) { - r = memslot_rmap_alloc(slot, slot->npages); - if (r) { - mutex_unlock(&kvm->slots_arch_lock); - return r; - } - } - } - - /* - * Ensure that memslots_have_rmaps becomes true strictly after - * all the rmap pointers are set. - */ - smp_store_release(&kvm->arch.memslots_have_rmaps, true); - mutex_unlock(&kvm->slots_arch_lock); - return 0; -} - static int kvm_alloc_memslot_metadata(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages) @@ -11498,7 +11603,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, } } - if (kvm_page_track_create_memslot(slot, npages)) + if (kvm_page_track_create_memslot(kvm, slot, npages)) goto out_free; return 0; @@ -12096,6 +12201,15 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set); } +bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) +{ + if (new->type != KVM_IRQ_ROUTING_MSI) + return true; + + return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); +} + bool kvm_vector_hashing_enabled(void) { return vector_hashing; @@ -12177,9 +12291,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, * doesn't seem to be a real use-case behind such requests, just return * KVM_EXIT_INTERNAL_ERROR for now. */ - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -12239,7 +12351,8 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return kvm_skip_emulated_instruction(vcpu); default: - BUG(); /* We have already checked above that type <= 3 */ + kvm_inject_gp(vcpu, 0); + return 1; } } EXPORT_SYMBOL_GPL(kvm_handle_invpcid); @@ -12367,44 +12480,81 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, } EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); -static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port); + +static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu) { - memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data, - vcpu->arch.pio.count * vcpu->arch.pio.size); - vcpu->arch.pio.count = 0; + int size = vcpu->arch.pio.size; + int port = vcpu->arch.pio.port; + vcpu->arch.pio.count = 0; + if (vcpu->arch.sev_pio_count) + return kvm_sev_es_outs(vcpu, size, port); return 1; } static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, - unsigned int port, void *data, unsigned int count) + unsigned int port) { - int ret; - - ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port, - data, count); - if (ret) - return ret; + for (;;) { + unsigned int count = + min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); + int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count); + + /* memcpy done already by emulator_pio_out. */ + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; + if (!ret) + break; - vcpu->arch.pio.count = 0; + /* Emulation done by the kernel. */ + if (!vcpu->arch.sev_pio_count) + return 1; + } + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs; return 0; } static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, - unsigned int port, void *data, unsigned int count) + unsigned int port); + +static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu) { - int ret; + unsigned count = vcpu->arch.pio.count; + complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size; +} - ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port, - data, count); - if (ret) { - vcpu->arch.pio.count = 0; - } else { - vcpu->arch.guest_ins_data = data; - vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; +static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +{ + int size = vcpu->arch.pio.size; + int port = vcpu->arch.pio.port; + + advance_sev_es_emulated_ins(vcpu); + if (vcpu->arch.sev_pio_count) + return kvm_sev_es_ins(vcpu, size, port); + return 1; +} + +static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port) +{ + for (;;) { + unsigned int count = + min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); + if (!__emulator_pio_in(vcpu, size, port, count)) + break; + + /* Emulation done by the kernel. */ + advance_sev_es_emulated_ins(vcpu); + if (!vcpu->arch.sev_pio_count) + return 1; } + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; return 0; } @@ -12412,8 +12562,10 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port, void *data, unsigned int count, int in) { - return in ? kvm_sev_es_ins(vcpu, size, port, data, count) - : kvm_sev_es_outs(vcpu, size, port, data, count); + vcpu->arch.sev_pio_data = data; + vcpu->arch.sev_pio_count = count; + return in ? kvm_sev_es_ins(vcpu, size, port) + : kvm_sev_es_outs(vcpu, size, port); } EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7d66d63dc55a..ea264c4502e4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -343,8 +343,6 @@ extern bool enable_vmware_backdoor; extern int pi_inject_timer; -extern struct static_key kvm_no_apic_vcpu; - extern bool report_ignored_msrs; static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 9ea9c3dabe37..8f62baebd028 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -190,6 +190,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) int __kvm_xen_has_interrupt(struct kvm_vcpu *v) { + int err; u8 rc = 0; /* @@ -216,13 +217,29 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) if (likely(slots->generation == ghc->generation && !kvm_is_error_hva(ghc->hva) && ghc->memslot)) { /* Fast path */ - __get_user(rc, (u8 __user *)ghc->hva + offset); - } else { - /* Slow path */ - kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, - sizeof(rc)); + pagefault_disable(); + err = __get_user(rc, (u8 __user *)ghc->hva + offset); + pagefault_enable(); + if (!err) + return rc; } + /* Slow path */ + + /* + * This function gets called from kvm_vcpu_block() after setting the + * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately + * from a HLT. So we really mustn't sleep. If the page ended up absent + * at that point, just return 1 in order to trigger an immediate wake, + * and we'll end up getting called again from a context where we *can* + * fault in the page and wait for it. + */ + if (in_atomic() || !task_is_running(current)) + return 1; + + kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, + sizeof(rc)); + return rc; } |