diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
| -rw-r--r-- | arch/x86/kvm/x86.c | 2367 |
1 files changed, 1381 insertions, 986 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2e713480933a..0c6d899d53dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -59,7 +59,6 @@ #include <linux/sched/stat.h> #include <linux/sched/isolation.h> #include <linux/mem_encrypt.h> -#include <linux/entry-kvm.h> #include <linux/suspend.h> #include <linux/smp.h> @@ -90,7 +89,6 @@ #include "trace.h" #define MAX_IO_MSRS 256 -#define KVM_MAX_MCE_BANKS 32 /* * Note, kvm_caps fields should *never* have default values, all fields must be @@ -98,10 +96,10 @@ * vendor module being reloaded with different module parameters. */ struct kvm_caps kvm_caps __read_mostly; -EXPORT_SYMBOL_GPL(kvm_caps); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_caps); struct kvm_host_values kvm_host __read_mostly; -EXPORT_SYMBOL_GPL(kvm_host); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_host); #define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) @@ -119,8 +117,6 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; - #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) #define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE @@ -139,6 +135,9 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static DEFINE_MUTEX(vendor_module_lock); +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); + struct kvm_x86_ops kvm_x86_ops __read_mostly; #define KVM_X86_OP(func) \ @@ -155,24 +154,18 @@ module_param(ignore_msrs, bool, 0644); bool __read_mostly report_ignored_msrs = true; module_param(report_ignored_msrs, bool, 0644); -EXPORT_SYMBOL_GPL(report_ignored_msrs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs); unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, 0644); -static bool __read_mostly kvmclock_periodic_sync = true; -module_param(kvmclock_periodic_sync, bool, 0444); - /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, 0644); -static bool __read_mostly vector_hashing = true; -module_param(vector_hashing, bool, 0444); - bool __read_mostly enable_vmware_backdoor = false; module_param(enable_vmware_backdoor, bool, 0444); -EXPORT_SYMBOL_GPL(enable_vmware_backdoor); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_vmware_backdoor); /* * Flags to manipulate forced emulation behavior (any non-zero value will @@ -187,7 +180,7 @@ module_param(pi_inject_timer, bint, 0644); /* Enable/disable PMU virtualization */ bool __read_mostly enable_pmu = true; -EXPORT_SYMBOL_GPL(enable_pmu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu); module_param(enable_pmu, bool, 0444); bool __read_mostly eager_page_split = true; @@ -214,20 +207,35 @@ struct kvm_user_return_msrs { }; u32 __read_mostly kvm_nr_uret_msrs; -EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs); static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; -static struct kvm_user_return_msrs __percpu *user_return_msrs; +static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) +#define XFEATURE_MASK_CET_ALL (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL) +/* + * Note, KVM supports exposing PT to the guest, but does not support context + * switching PT via XSTATE (KVM's PT virtualization relies on perf; swapping + * PT via guest XSTATE would clobber perf state), i.e. KVM doesn't support + * IA32_XSS[bit 8] (guests can/must use RDMSR/WRMSR to save/restore PT MSRs). + */ +#define KVM_SUPPORTED_XSS (XFEATURE_MASK_CET_ALL) + bool __read_mostly allow_smaller_maxphyaddr = 0; -EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(allow_smaller_maxphyaddr); bool __read_mostly enable_apicv = true; -EXPORT_SYMBOL_GPL(enable_apicv); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_apicv); + +bool __read_mostly enable_ipiv = true; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_ipiv); + +bool __read_mostly enable_device_posted_irqs = true; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_device_posted_irqs); const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -332,7 +340,11 @@ static const u32 msrs_to_save_base[] = { MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, MSR_IA32_UMWAIT_CONTROL, - MSR_IA32_XFD, MSR_IA32_XFD_ERR, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS, + + MSR_IA32_U_CET, MSR_IA32_S_CET, + MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP, + MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB, }; static const u32 msrs_to_save_pmu[] = { @@ -364,6 +376,7 @@ static const u32 msrs_to_save_pmu[] = { MSR_AMD64_PERF_CNTR_GLOBAL_CTL, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + @@ -559,28 +572,30 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) vcpu->arch.apf.gfns[i] = ~0; } +static void kvm_destroy_user_return_msrs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered); + + kvm_nr_uret_msrs = 0; +} + static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; struct kvm_user_return_msrs *msrs = container_of(urn, struct kvm_user_return_msrs, urn); struct kvm_user_return_msr_values *values; - unsigned long flags; - /* - * Disabling irqs at this point since the following code could be - * interrupted and executed through kvm_arch_disable_virtualization_cpu() - */ - local_irq_save(flags); - if (msrs->registered) { - msrs->registered = false; - user_return_notifier_unregister(urn); - } - local_irq_restore(flags); + msrs->registered = false; + user_return_notifier_unregister(urn); + for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(kvm_uret_msrs_list[slot], values->host); + wrmsrq(kvm_uret_msrs_list[slot], values->host); values->curr = values->host; } } @@ -592,10 +607,10 @@ static int kvm_probe_user_return_msr(u32 msr) int ret; preempt_disable(); - ret = rdmsrl_safe(msr, &val); + ret = rdmsrq_safe(msr, &val); if (ret) goto out; - ret = wrmsrl_safe(msr, val); + ret = wrmsrq_safe(msr, val); out: preempt_enable(); return ret; @@ -611,7 +626,7 @@ int kvm_add_user_return_msr(u32 msr) kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; return kvm_nr_uret_msrs++; } -EXPORT_SYMBOL_GPL(kvm_add_user_return_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_add_user_return_msr); int kvm_find_user_return_msr(u32 msr) { @@ -623,46 +638,57 @@ int kvm_find_user_return_msr(u32 msr) } return -1; } -EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr); static void kvm_user_return_msr_cpu_online(void) { - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); u64 value; int i; for (i = 0; i < kvm_nr_uret_msrs; ++i) { - rdmsrl_safe(kvm_uret_msrs_list[i], &value); + rdmsrq_safe(kvm_uret_msrs_list[i], &value); msrs->values[i].host = value; msrs->values[i].curr = value; } } +static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) +{ + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; + } +} + int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); int err; value = (value & mask) | (msrs->values[slot].host & ~mask); if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); + err = wrmsrq_safe(kvm_uret_msrs_list[slot], value); if (err) return 1; msrs->values[slot].curr = value; - if (!msrs->registered) { - msrs->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&msrs->urn); - msrs->registered = true; - } + kvm_user_return_register_notifier(msrs); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr); + +u64 kvm_get_user_return_msr(unsigned int slot) +{ + return this_cpu_ptr(&user_return_msrs)->values[slot].curr; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr); static void drop_user_return_notifiers(void) { - struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); if (msrs->registered) kvm_on_user_return(&msrs->urn); @@ -680,7 +706,7 @@ noinstr void kvm_spurious_fault(void) /* Fault while not rebooting. We want the trace. */ BUG_ON(!kvm_rebooting); } -EXPORT_SYMBOL_GPL(kvm_spurious_fault); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spurious_fault); #define EXCPT_BENIGN 0 #define EXCPT_CONTRIBUTORY 1 @@ -785,7 +811,7 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, ex->has_payload = false; ex->payload = 0; } -EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_deliver_exception_payload); static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector, bool has_error_code, u32 error_code, @@ -802,9 +828,9 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto ex->payload = payload; } -static void kvm_multiple_exception(struct kvm_vcpu *vcpu, - unsigned nr, bool has_error, u32 error_code, - bool has_payload, unsigned long payload, bool reinject) +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error, u32 error_code, + bool has_payload, unsigned long payload) { u32 prev_nr; int class1, class2; @@ -812,13 +838,10 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); /* - * If the exception is destined for L2 and isn't being reinjected, - * morph it to a VM-Exit if L1 wants to intercept the exception. A - * previously injected exception is not checked because it was checked - * when it was original queued, and re-checking is incorrect if _L1_ - * injected the exception, in which case it's exempt from interception. + * If the exception is destined for L2, morph it to a VM-Exit if L1 + * wants to intercept the exception. */ - if (!reinject && is_guest_mode(vcpu) && + if (is_guest_mode(vcpu) && kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) { kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code, has_payload, payload); @@ -827,28 +850,9 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: - if (reinject) { - /* - * On VM-Entry, an exception can be pending if and only - * if event injection was blocked by nested_run_pending. - * In that case, however, vcpu_enter_guest() requests an - * immediate exit, and the guest shouldn't proceed far - * enough to need reinjection. - */ - WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); - vcpu->arch.exception.injected = true; - if (WARN_ON_ONCE(has_payload)) { - /* - * A reinjected event has already - * delivered its payload. - */ - has_payload = false; - payload = 0; - } - } else { - vcpu->arch.exception.pending = true; - vcpu->arch.exception.injected = false; - } + vcpu->arch.exception.pending = true; + vcpu->arch.exception.injected = false; + vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.vector = nr; vcpu->arch.exception.error_code = error_code; @@ -889,30 +893,53 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); + kvm_multiple_exception(vcpu, nr, false, 0, false, 0); } -EXPORT_SYMBOL_GPL(kvm_queue_exception); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception); -void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) -{ - kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload) { - kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); + kvm_multiple_exception(vcpu, nr, false, 0, true, payload); } -EXPORT_SYMBOL_GPL(kvm_queue_exception_p); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_p); static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload) { - kvm_multiple_exception(vcpu, nr, true, error_code, - true, payload, false); + kvm_multiple_exception(vcpu, nr, true, error_code, true, payload); } +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error_code, u32 error_code) +{ + + /* + * On VM-Entry, an exception can be pending if and only if event + * injection was blocked by nested_run_pending. In that case, however, + * vcpu_enter_guest() requests an immediate exit, and the guest + * shouldn't proceed far enough to need reinjection. + */ + WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); + + /* + * Do not check for interception when injecting an event for L2, as the + * exception was checked for intercept when it was original queued, and + * re-checking is incorrect if _L1_ injected the exception, in which + * case it's exempt from interception. + */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vcpu->arch.exception.injected = true; + vcpu->arch.exception.has_error_code = has_error_code; + vcpu->arch.exception.vector = nr; + vcpu->arch.exception.error_code = error_code; + vcpu->arch.exception.has_payload = false; + vcpu->arch.exception.payload = 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_requeue_exception); + int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) @@ -922,7 +949,7 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) return 1; } -EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_complete_insn_gp); static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err) { @@ -972,7 +999,7 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, fault_mmu->inject_page_fault(vcpu, fault); } -EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { @@ -982,15 +1009,9 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu) void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); -} -EXPORT_SYMBOL_GPL(kvm_queue_exception_e); - -void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) -{ - kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); + kvm_multiple_exception(vcpu, nr, true, error_code, false, 0); } -EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_e); /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue @@ -1012,7 +1033,14 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) kvm_queue_exception(vcpu, UD_VECTOR); return false; } -EXPORT_SYMBOL_GPL(kvm_require_dr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr); + +static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) +{ + u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; + + return (vcpu->arch.apf.msr_en_val & mask) == mask; +} static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { @@ -1067,7 +1095,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) return 1; } -EXPORT_SYMBOL_GPL(load_pdptrs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(load_pdptrs); static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { @@ -1106,21 +1134,26 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon } if ((cr0 ^ old_cr0) & X86_CR0_PG) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - /* * Clearing CR0.PG is defined to flush the TLB from the guest's * perspective. */ if (!(cr0 & X86_CR0_PG)) kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + /* + * Check for async #PF completion events when enabling paging, + * as the vCPU may have previously encountered async #PFs (it's + * entirely legal for the guest to toggle paging on/off without + * waiting for the async #PF queue to drain). + */ + else if (kvm_pv_async_pf_enabled(vcpu)) + kvm_make_request(KVM_REQ_APF_READY, vcpu); } if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) kvm_mmu_reset_context(vcpu); } -EXPORT_SYMBOL_GPL(kvm_post_set_cr0); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr0); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { @@ -1155,44 +1188,53 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) return 1; + if (!(cr0 & X86_CR0_WP) && kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) + return 1; + kvm_x86_call(set_cr0)(vcpu, cr0); kvm_post_set_cr0(vcpu, old_cr0, cr0); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr0); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } -EXPORT_SYMBOL_GPL(kvm_lmsw); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw); -void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) +static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, bool load_guest) { if (vcpu->arch.guest_state_protected) return; - if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) + return; - if (vcpu->arch.xcr0 != kvm_host.xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + if (vcpu->arch.xcr0 != kvm_host.xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, + load_guest ? vcpu->arch.xcr0 : kvm_host.xcr0); - if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && - vcpu->arch.ia32_xss != kvm_host.xss) - wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); - } + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && + vcpu->arch.ia32_xss != kvm_host.xss) + wrmsrq(MSR_IA32_XSS, load_guest ? vcpu->arch.ia32_xss : kvm_host.xss); +} + +static void kvm_load_guest_pkru(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.guest_state_protected) + return; if (cpu_feature_enabled(X86_FEATURE_PKU) && vcpu->arch.pkru != vcpu->arch.host_pkru && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) - write_pkru(vcpu->arch.pkru); + wrpkru(vcpu->arch.pkru); } -EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); -void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) +static void kvm_load_host_pkru(struct kvm_vcpu *vcpu) { if (vcpu->arch.guest_state_protected) return; @@ -1202,21 +1244,9 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) - write_pkru(vcpu->arch.host_pkru); - } - - if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { - - if (vcpu->arch.xcr0 != kvm_host.xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0); - - if (guest_can_use(vcpu, X86_FEATURE_XSAVES) && - vcpu->arch.ia32_xss != kvm_host.xss) - wrmsrl(MSR_IA32_XSS, kvm_host.xss); + wrpkru(vcpu->arch.host_pkru); } - } -EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); #ifdef CONFIG_X86_64 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) @@ -1225,7 +1255,7 @@ static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) } #endif -static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; u64 old_xcr0 = vcpu->arch.xcr0; @@ -1266,9 +1296,10 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; return 0; } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_set_xcr); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { @@ -1281,19 +1312,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); - -bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) -{ - if (cr4 & cr4_reserved_bits) - return false; - - if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) - return false; - - return true; -} -EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_xsetbv); static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { @@ -1341,7 +1360,7 @@ void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned lon kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } -EXPORT_SYMBOL_GPL(kvm_post_set_cr4); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { @@ -1366,13 +1385,16 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } + if ((cr4 & X86_CR4_CET) && !kvm_is_cr0_bit_set(vcpu, X86_CR0_WP)) + return 1; + kvm_x86_call(set_cr4)(vcpu, cr4); kvm_post_set_cr4(vcpu, old_cr4, cr4); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr4); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr4); static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) { @@ -1464,7 +1486,7 @@ handle_tlb_flush: return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr3); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr3); int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { @@ -1476,7 +1498,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) vcpu->arch.cr8 = cr8; return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr8); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) { @@ -1485,7 +1507,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) else return vcpu->arch.cr8; } -EXPORT_SYMBOL_GPL(kvm_get_cr8); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_cr8); static void kvm_update_dr0123(struct kvm_vcpu *vcpu) { @@ -1510,16 +1532,16 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu) if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } -EXPORT_SYMBOL_GPL(kvm_update_dr7); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_update_dr7); static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { u64 fixed = DR6_FIXED_1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)) fixed |= DR6_RTM; - if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) fixed |= DR6_BUS_LOCK; return fixed; } @@ -1551,7 +1573,7 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } -EXPORT_SYMBOL_GPL(kvm_set_dr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_dr); unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr) { @@ -1568,14 +1590,14 @@ unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr) return vcpu->arch.dr7; } } -EXPORT_SYMBOL_GPL(kvm_get_dr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dr); int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_rcx_read(vcpu); + u32 pmc = kvm_rcx_read(vcpu); u64 data; - if (kvm_pmu_rdpmc(vcpu, ecx, &data)) { + if (kvm_pmu_rdpmc(vcpu, pmc, &data)) { kvm_inject_gp(vcpu, 0); return 1; } @@ -1584,7 +1606,7 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) kvm_rdx_write(vcpu, data >> 32); return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdpmc); /* * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM @@ -1603,7 +1625,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) static u64 kvm_get_arch_capabilities(void) { @@ -1637,6 +1659,8 @@ static u64 kvm_get_arch_capabilities(void) data |= ARCH_CAP_MDS_NO; if (!boot_cpu_has_bug(X86_BUG_RFDS)) data |= ARCH_CAP_RFDS_NO; + if (!boot_cpu_has_bug(X86_BUG_ITS)) + data |= ARCH_CAP_ITS_NO; if (!boot_cpu_has(X86_FEATURE_RTM)) { /* @@ -1679,7 +1703,7 @@ static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, *data = MSR_PLATFORM_INFO_CPUID_FAULT; break; case MSR_IA32_UCODE_REV: - rdmsrl_safe(index, data); + rdmsrq_safe(index, data); break; default: return kvm_x86_call(get_feature_msr)(index, data); @@ -1695,20 +1719,20 @@ static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS)) + if (efer & EFER_AUTOIBRS && !guest_cpu_cap_has(vcpu, X86_FEATURE_AUTOIBRS)) return false; - if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) + if (efer & EFER_FFXSR && !guest_cpu_cap_has(vcpu, X86_FEATURE_FXSR_OPT)) return false; - if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + if (efer & EFER_SVME && !guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) return false; if (efer & (EFER_LME | EFER_LMA) && - !guest_cpuid_has(vcpu, X86_FEATURE_LM)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return false; - if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) + if (efer & EFER_NX && !guest_cpu_cap_has(vcpu, X86_FEATURE_NX)) return false; return true; @@ -1721,7 +1745,7 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) return __kvm_valid_efer(vcpu, efer); } -EXPORT_SYMBOL_GPL(kvm_valid_efer); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_valid_efer); static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { @@ -1764,7 +1788,7 @@ void kvm_enable_efer_bits(u64 mask) { efer_reserved_bits &= ~mask; } -EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { @@ -1807,7 +1831,7 @@ out: return allowed; } -EXPORT_SYMBOL_GPL(kvm_msr_allowed); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_msr_allowed); /* * Write @data into the MSR specified by @index. Select MSR specific fault @@ -1850,8 +1874,8 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, return 1; if (!host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) return 1; /* @@ -1868,6 +1892,44 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, data = (u32)data; break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) + return KVM_MSR_RET_UNSUPPORTED; + if (!kvm_is_valid_u_s_cet(vcpu, data)) + return 1; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + if (!host_initiated) + return 1; + fallthrough; + /* + * Note that the MSR emulation here is flawed when a vCPU + * doesn't support the Intel 64 architecture. The expected + * architectural behavior in this case is that the upper 32 + * bits do not exist and should always read '0'. However, + * because the actual hardware on which the virtual CPU is + * running does support Intel 64, XRSTORS/XSAVES in the + * guest could observe behavior that violates the + * architecture. Intercepting XRSTORS/XSAVES for this + * special case isn't deemed worthwhile. + */ + case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return KVM_MSR_RET_UNSUPPORTED; + /* + * MSR_IA32_INT_SSP_TAB is not present on processors that do + * not support Intel 64 architecture. + */ + if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) + return KVM_MSR_RET_UNSUPPORTED; + if (is_noncanonical_msr_address(data, vcpu)) + return 1; + /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */ + if (index != MSR_IA32_INT_SSP_TAB && !IS_ALIGNED(data, 4)) + return 1; + break; } msr.data = data; @@ -1896,8 +1958,8 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, - bool host_initiated) +static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { struct msr_data msr; int ret; @@ -1908,10 +1970,24 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, return 1; if (!host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) return 1; break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) + return KVM_MSR_RET_UNSUPPORTED; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + if (!host_initiated) + return 1; + fallthrough; + case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return KVM_MSR_RET_UNSUPPORTED; + break; } msr.index = index; @@ -1923,6 +1999,16 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, return ret; } +int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + return __kvm_set_msr(vcpu, index, data, true); +} + +int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + return __kvm_get_msr(vcpu, index, data, true); +} + static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated) { @@ -1930,33 +2016,36 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, __kvm_get_msr); } -int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) - return KVM_MSR_RET_FILTERED; return kvm_get_msr_ignored_check(vcpu, index, data, false); } -EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_read); -int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) +int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) { - if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) - return KVM_MSR_RET_FILTERED; return kvm_set_msr_ignored_check(vcpu, index, data, false); } -EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_write); -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - return kvm_get_msr_ignored_check(vcpu, index, data, false); + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return KVM_MSR_RET_FILTERED; + + return __kvm_emulate_msr_read(vcpu, index, data); } -EXPORT_SYMBOL_GPL(kvm_get_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_read); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) { - return kvm_set_msr_ignored_check(vcpu, index, data, false); + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return KVM_MSR_RET_FILTERED; + + return __kvm_emulate_msr_write(vcpu, index, data); } -EXPORT_SYMBOL_GPL(kvm_set_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_write); + static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) { @@ -1988,6 +2077,15 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) return complete_fast_msr_access(vcpu); } +static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu) +{ + if (!vcpu->run->msr.error) + kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg, + vcpu->run->msr.data); + + return complete_fast_msr_access(vcpu); +} + static u64 kvm_msr_reason(int r) { switch (r) { @@ -2022,55 +2120,82 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, return 1; } -int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) +static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg, + int (*complete_rdmsr)(struct kvm_vcpu *)) { - u32 ecx = kvm_rcx_read(vcpu); u64 data; int r; - r = kvm_get_msr_with_filter(vcpu, ecx, &data); + r = kvm_emulate_msr_read(vcpu, msr, &data); if (!r) { - trace_kvm_msr_read(ecx, data); + trace_kvm_msr_read(msr, data); - kvm_rax_write(vcpu, data & -1u); - kvm_rdx_write(vcpu, (data >> 32) & -1u); + if (reg < 0) { + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + } else { + kvm_register_write(vcpu, reg, data); + } } else { /* MSR read failed? See if we should ask user space */ - if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0, - complete_fast_rdmsr, r)) + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0, + complete_rdmsr, r)) return 0; - trace_kvm_msr_read_ex(ecx); + trace_kvm_msr_read_ex(msr); } return kvm_x86_call(complete_emulated_msr)(vcpu, r); } -EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); -int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_rcx_read(vcpu); - u64 data = kvm_read_edx_eax(vcpu); - int r; + return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1, + complete_fast_rdmsr); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr); + +int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + vcpu->arch.cui_rdmsr_imm_reg = reg; - r = kvm_set_msr_with_filter(vcpu, ecx, data); + return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr_imm); + +static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int r; + r = kvm_emulate_msr_write(vcpu, msr, data); if (!r) { - trace_kvm_msr_write(ecx, data); + trace_kvm_msr_write(msr, data); } else { /* MSR write failed? See if we should ask user space */ - if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data, + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_WRMSR, data, complete_fast_msr_access, r)) return 0; /* Signal all other negative errors to userspace */ if (r < 0) return r; - trace_kvm_msr_write_ex(ecx, data); + trace_kvm_msr_write_ex(msr, data); } return kvm_x86_call(complete_emulated_msr)(vcpu, r); } -EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); + +int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_wrmsr(vcpu, kvm_rcx_read(vcpu), + kvm_read_edx_eax(vcpu)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr); + +int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr_imm); int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) { @@ -2082,22 +2207,41 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu) /* Treat an INVD instruction as a NOP and just skip it. */ return kvm_emulate_as_nop(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_invd); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_invd); + +fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu) +{ + if (!kvm_emulate_invd(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; + + return EXIT_FASTPATH_REENTER_GUEST; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_invd); int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } -EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_invalid_op); static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && - !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) + bool enabled; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)) + goto emulate_as_nop; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) + enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT); + else + enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT; + + if (!enabled) return kvm_handle_invalid_op(vcpu); +emulate_as_nop: pr_warn_once("%s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } @@ -2105,13 +2249,13 @@ int kvm_emulate_mwait(struct kvm_vcpu *vcpu) { return kvm_emulate_monitor_mwait(vcpu, "MWAIT"); } -EXPORT_SYMBOL_GPL(kvm_emulate_mwait); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_mwait); int kvm_emulate_monitor(struct kvm_vcpu *vcpu) { return kvm_emulate_monitor_mwait(vcpu, "MONITOR"); } -EXPORT_SYMBOL_GPL(kvm_emulate_monitor); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_monitor); static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { @@ -2121,74 +2265,41 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } -/* - * The fast path for frequent and performance sensitive wrmsr emulation, - * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces - * the latency of virtual IPI by avoiding the expensive bits of transitioning - * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the - * other cases which must be called after interrupts are enabled on the host. - */ -static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) +static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) - return 1; - - if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && - ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && - ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && - ((u32)(data >> 32) != X2APIC_BROADCAST)) - return kvm_x2apic_icr_write(vcpu->arch.apic, data); - - return 1; -} - -static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) -{ - if (!kvm_can_use_hv_timer(vcpu)) - return 1; - - kvm_set_lapic_tscdeadline_msr(vcpu, data); - return 0; -} - -fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) -{ - u32 msr = kvm_rcx_read(vcpu); - u64 data; - fastpath_t ret; - bool handled; - - kvm_vcpu_srcu_read_lock(vcpu); - switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): - data = kvm_read_edx_eax(vcpu); - handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || + kvm_x2apic_icr_write_fast(vcpu->arch.apic, data)) + return EXIT_FASTPATH_NONE; break; case MSR_IA32_TSC_DEADLINE: - data = kvm_read_edx_eax(vcpu); - handled = !handle_fastpath_set_tscdeadline(vcpu, data); + kvm_set_lapic_tscdeadline_msr(vcpu, data); break; default: - handled = false; - break; + return EXIT_FASTPATH_NONE; } - if (handled) { - if (!kvm_skip_emulated_instruction(vcpu)) - ret = EXIT_FASTPATH_EXIT_USERSPACE; - else - ret = EXIT_FASTPATH_REENTER_GUEST; - trace_kvm_msr_write(msr, data); - } else { - ret = EXIT_FASTPATH_NONE; - } + trace_kvm_msr_write(msr, data); - kvm_vcpu_srcu_read_unlock(vcpu); + if (!kvm_skip_emulated_instruction(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; - return ret; + return EXIT_FASTPATH_REENTER_GUEST; +} + +fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu) +{ + return __handle_fastpath_wrmsr(vcpu, kvm_rcx_read(vcpu), + kvm_read_edx_eax(vcpu)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr); + +fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); } -EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr_imm); /* * Adapt set_msr() to msr_io()'s calling convention @@ -2554,7 +2665,7 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio); } -EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_l1_tsc); u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) { @@ -2569,7 +2680,7 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) nested_offset += l2_offset; return nested_offset; } -EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_offset); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) { @@ -2579,10 +2690,13 @@ u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) return l1_multiplier; } -EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_multiplier); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) { + if (vcpu->arch.guest_tsc_protected) + return; + trace_kvm_write_tsc_offset(vcpu->vcpu_id, vcpu->arch.l1_tsc_offset, l1_offset); @@ -2640,12 +2754,18 @@ static inline bool kvm_check_tsc_unstable(void) * participates in. */ static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, - u64 ns, bool matched) + u64 ns, bool matched, bool user_set_tsc) { struct kvm *kvm = vcpu->kvm; lockdep_assert_held(&kvm->arch.tsc_write_lock); + if (vcpu->arch.guest_tsc_protected) + return; + + if (user_set_tsc) + vcpu->kvm->arch.user_set_tsc = true; + /* * We also track th most recent recorded KHZ, write and time to * allow the matching interval to be extended at each write. @@ -2731,8 +2851,6 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) } } - if (user_value) - kvm->arch.user_set_tsc = true; /* * For a reliable TSC, we can match TSC offsets, and for an unstable @@ -2752,7 +2870,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) matched = true; } - __kvm_synchronize_tsc(vcpu, offset, data, ns, matched); + __kvm_synchronize_tsc(vcpu, offset, data, ns, matched, !!user_value); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); } @@ -3130,15 +3248,17 @@ u64 get_kvmclock_ns(struct kvm *kvm) return data.clock; } -static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, +static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, + struct kvm_vcpu *vcpu, struct gfn_to_pfn_cache *gpc, - unsigned int offset, - bool force_tsc_unstable) + unsigned int offset) { - struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info *guest_hv_clock; + struct pvclock_vcpu_time_info hv_clock; unsigned long flags; + memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock)); + read_lock_irqsave(&gpc->lock, flags); while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { read_unlock_irqrestore(&gpc->lock, flags); @@ -3158,52 +3278,34 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, * it is consistent. */ - guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1; + guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1; smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); - - if (vcpu->pvclock_set_guest_stopped_request) { - vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } + hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); - memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock)); - - if (force_tsc_unstable) - guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT; + memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock)); smp_wmb(); - guest_hv_clock->version = ++vcpu->hv_clock.version; + guest_hv_clock->version = ++hv_clock.version; kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock); } -static int kvm_guest_time_update(struct kvm_vcpu *v) +int kvm_guest_time_update(struct kvm_vcpu *v) { + struct pvclock_vcpu_time_info hv_clock = {}; unsigned long flags, tgt_tsc_khz; unsigned seq; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; u64 tsc_timestamp, host_tsc; - u8 pvclock_flags; bool use_master_clock; -#ifdef CONFIG_KVM_XEN - /* - * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless - * explicitly told to use TSC as its clocksource Xen will not set this bit. - * This default behaviour led to bugs in some guest kernels which cause - * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags. - */ - bool xen_pvclock_tsc_unstable = - ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; -#endif kernel_ns = 0; host_tsc = 0; @@ -3258,41 +3360,65 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ - if (kvm_caps.has_tsc_control) + if (kvm_caps.has_tsc_control) { tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz, v->arch.l1_tsc_scaling_ratio); + tgt_tsc_khz = tgt_tsc_khz ? : 1; + } if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, - &vcpu->hv_clock.tsc_shift, - &vcpu->hv_clock.tsc_to_system_mul); + &vcpu->pvclock_tsc_shift, + &vcpu->pvclock_tsc_mul); vcpu->hw_tsc_khz = tgt_tsc_khz; - kvm_xen_update_tsc_info(v); } - vcpu->hv_clock.tsc_timestamp = tsc_timestamp; - vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + hv_clock.tsc_shift = vcpu->pvclock_tsc_shift; + hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul; + hv_clock.tsc_timestamp = tsc_timestamp; + hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; /* If the host uses TSC clocksource, then it is stable */ - pvclock_flags = 0; + hv_clock.flags = 0; if (use_master_clock) - pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; + hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT; - vcpu->hv_clock.flags = pvclock_flags; + if (vcpu->pv_time.active) { + /* + * GUEST_STOPPED is only supported by kvmclock, and KVM's + * historic behavior is to only process the request if kvmclock + * is active/enabled. + */ + if (vcpu->pvclock_set_guest_stopped_request) { + hv_clock.flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0); + + hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED; + } + + kvm_hv_setup_tsc_page(v->kvm, &hv_clock); - if (vcpu->pv_time.active) - kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false); #ifdef CONFIG_KVM_XEN + /* + * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless + * explicitly told to use TSC as its clocksource Xen will not set this bit. + * This default behaviour led to bugs in some guest kernels which cause + * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags. + * + * Note! Clear TSC_STABLE only for Xen clocks, i.e. the order matters! + */ + if (ka->xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) + hv_clock.flags &= ~PVCLOCK_TSC_STABLE_BIT; + if (vcpu->xen.vcpu_info_cache.active) - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, - offsetof(struct compat_vcpu_info, time), - xen_pvclock_tsc_unstable); + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); if (vcpu->xen.vcpu_time_info_cache.active) - kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0, - xen_pvclock_tsc_unstable); + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0); #endif - kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } @@ -3375,27 +3501,17 @@ uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm) /* * kvmclock updates which are isolated to a given vcpu, such as * vcpu->cpu migration, should not allow system_timestamp from - * the rest of the vcpus to remain static. Otherwise ntp frequency - * correction applies to one vcpu's system_timestamp but not - * the others. + * the rest of the vcpus to remain static. * * So in those cases, request a kvmclock update for all vcpus. - * We need to rate-limit these requests though, as they can - * considerably slow guests that have a large number of vcpus. - * The time for a remote vcpu to update its kvmclock is bound - * by the delay we use to rate-limit the updates. + * The worst case for a remote vcpu to update its kvmclock + * is then bounded by maximum nohz sleep latency. */ - -#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) - -static void kvmclock_update_fn(struct work_struct *work) +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) { unsigned long i; - struct delayed_work *dwork = to_delayed_work(work); - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, - kvmclock_update_work); - struct kvm *kvm = container_of(ka, struct kvm, arch); struct kvm_vcpu *vcpu; + struct kvm *kvm = v->kvm; kvm_for_each_vcpu(i, vcpu, kvm) { kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -3403,29 +3519,6 @@ static void kvmclock_update_fn(struct work_struct *work) } } -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) -{ - struct kvm *kvm = v->kvm; - - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - schedule_delayed_work(&kvm->arch.kvmclock_update_work, - KVMCLOCK_UPDATE_DELAY); -} - -#define KVMCLOCK_SYNC_PERIOD (300 * HZ) - -static void kvmclock_sync_fn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, - kvmclock_sync_work); - struct kvm *kvm = container_of(ka, struct kvm, arch); - - schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); -} - /* These helpers are safe iff @msr is known to be an MCx bank MSR. */ static bool is_mci_control_msr(u32 msr) { @@ -3520,13 +3613,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } -static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) -{ - u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; - - return (vcpu->arch.apf.msr_en_val & mask) == mask; -} - static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; @@ -3558,7 +3644,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) sizeof(u64))) return 1; - vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); + vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; kvm_async_pf_wakeup_all(vcpu); @@ -3642,7 +3728,7 @@ void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); } -EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_service_local_tlb_flush_requests); static void record_steal_time(struct kvm_vcpu *vcpu) { @@ -3742,12 +3828,78 @@ static void record_steal_time(struct kvm_vcpu *vcpu) mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } +/* + * Returns true if the MSR in question is managed via XSTATE, i.e. is context + * switched with the rest of guest FPU state. + * + * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS. + */ +static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + if (!vcpu) + return false; + + switch (msr) { + case MSR_IA32_U_CET: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) || + guest_cpu_cap_has(vcpu, X86_FEATURE_IBT); + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + default: + return false; + } +} + +/* + * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an + * MSR that is managed via XSTATE. Note, the caller is responsible for doing + * the initial FPU load, this helper only ensures that guest state is resident + * in hardware (the kernel can load its FPU state in IRQ context). + * + * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the + * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only + * consumed when transitioning to lower privilege levels, i.e. are effectively + * only consumed by userspace as well. + */ +static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, + struct msr_data *msr_info, + int access) +{ + BUILD_BUG_ON(access != MSR_TYPE_R && access != MSR_TYPE_W); + + KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm); + KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm); + + kvm_fpu_get(); + if (access == MSR_TYPE_R) + rdmsrq(msr_info->index, msr_info->data); + else + wrmsrq(msr_info->index, msr_info->data); + kvm_fpu_put(); +} + +static void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W); +} + +static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R); +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u32 msr = msr_info->index; u64 data = msr_info->data; - if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) + /* + * Do not allow host-initiated writes to trigger the Xen hypercall + * page setup; it could incur locking paths which are not expected + * if userspace sets the MSR in an unusual location. + */ + if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) && + !msr_info->host_initiated) return kvm_xen_write_hypercall_page(vcpu, data); switch (msr) { @@ -3767,13 +3919,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_ARCH_CAPABILITIES: if (!msr_info->host_initiated || - !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) return KVM_MSR_RET_UNSUPPORTED; vcpu->arch.arch_capabilities = data; break; case MSR_IA32_PERF_CAPABILITIES: if (!msr_info->host_initiated || - !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) return KVM_MSR_RET_UNSUPPORTED; if (data & ~kvm_caps.supported_perf_cap) @@ -3797,11 +3949,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if ((!guest_has_pred_cmd_msr(vcpu))) return 1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB)) reserved_bits |= PRED_CMD_IBPB; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB)) reserved_bits |= PRED_CMD_SBPB; } @@ -3817,12 +3969,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!data) break; - wrmsrl(MSR_IA32_PRED_CMD, data); + wrmsrq(MSR_IA32_PRED_CMD, data); break; } case MSR_IA32_FLUSH_CMD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)) return 1; if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH)) @@ -3830,7 +3982,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!data) break; - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); break; case MSR_EFER: return set_efer(vcpu, msr_info); @@ -3873,7 +4025,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: - if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSC_ADJUST)) { if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); @@ -3900,10 +4052,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } else { vcpu->arch.ia32_misc_enable_msr = data; } @@ -3920,25 +4072,22 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: if (msr_info->host_initiated) { kvm_synchronize_tsc(vcpu, &data); - } else { + } else if (!vcpu->arch.guest_tsc_protected) { u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); vcpu->arch.ia32_tsc_adjust_msr += adj; } break; case MSR_IA32_XSS: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) - return 1; - /* - * KVM supports exposing PT to the guest, but does not support - * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than - * XSAVES/XRSTORS to save/restore PT MSRs. - */ - if (data & ~kvm_caps.supported_xss) + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return KVM_MSR_RET_UNSUPPORTED; + + if (data & ~vcpu->arch.guest_supported_xss) return 1; + if (vcpu->arch.ia32_xss == data) + break; vcpu->arch.ia32_xss = data; - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) @@ -3989,7 +4138,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; if (data & 0x1) { - vcpu->arch.apf.pageready_pending = false; + /* + * Pairs with the smp_mb__after_atomic() in + * kvm_arch_async_page_present_queued(). + */ + smp_store_mb(vcpu->arch.apf.pageready_pending, false); + kvm_check_async_pf_completion(vcpu); } break; @@ -4077,12 +4231,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.length = data; break; case MSR_AMD64_OSVW_STATUS: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.status = data; break; @@ -4101,7 +4255,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #ifdef CONFIG_X86_64 case MSR_IA32_XFD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) return 1; if (data & ~kvm_guest_supported_xfd(vcpu)) @@ -4111,7 +4265,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_XFD_ERR: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) return 1; if (data & ~kvm_guest_supported_xfd(vcpu)) @@ -4120,6 +4274,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + kvm_set_xstate_msr(vcpu, msr_info); + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -4128,7 +4286,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_msr_common); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_msr_common); static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { @@ -4226,12 +4384,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.microcode_version; break; case MSR_IA32_ARCH_CAPABILITIES: - if (!guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.arch_capabilities; break; case MSR_IA32_PERF_CAPABILITIES: - if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.perf_capabilities; break; @@ -4432,12 +4590,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0xbe702111; break; case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; msr_info->data = vcpu->arch.osvw.length; break; case MSR_AMD64_OSVW_STATUS: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; msr_info->data = vcpu->arch.osvw.status; break; @@ -4456,19 +4614,23 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #ifdef CONFIG_X86_64 case MSR_IA32_XFD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) return 1; msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; break; case MSR_IA32_XFD_ERR: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) return 1; msr_info->data = vcpu->arch.guest_fpu.xfd_err; break; #endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + kvm_get_xstate_msr(vcpu, msr_info); + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); @@ -4477,7 +4639,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } return 0; } -EXPORT_SYMBOL_GPL(kvm_get_msr_common); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_msr_common); /* * Read or write a bunch of msrs. All parameters are kernel addresses. @@ -4489,11 +4651,25 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { + bool fpu_loaded = false; int i; - for (i = 0; i < msrs->nmsrs; ++i) + for (i = 0; i < msrs->nmsrs; ++i) { + /* + * If userspace is accessing one or more XSTATE-managed MSRs, + * temporarily load the guest's FPU state so that the guest's + * MSR value(s) is resident in hardware and thus can be accessed + * via RDMSR/WRMSR. + */ + if (!fpu_loaded && is_xstate_managed_msr(vcpu, entries[i].index)) { + kvm_load_guest_fpu(vcpu); + fpu_loaded = true; + } if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; + } + if (fpu_loaded) + kvm_put_guest_fpu(vcpu); return i; } @@ -4545,6 +4721,23 @@ static inline bool kvm_can_mwait_in_guest(void) boot_cpu_has(X86_FEATURE_ARAT); } +static u64 kvm_get_allowed_disable_exits(void) +{ + u64 r = KVM_X86_DISABLE_EXITS_PAUSE; + + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + r |= KVM_X86_DISABLE_EXITS_APERFMPERF; + + if (!mitigate_smt_rsb) { + r |= KVM_X86_DISABLE_EXITS_HLT | + KVM_X86_DISABLE_EXITS_CSTATE; + + if (kvm_can_mwait_in_guest()) + r |= KVM_X86_DISABLE_EXITS_MWAIT; + } + return r; +} + #ifdef CONFIG_KVM_HYPERV static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 __user *cpuid_arg) @@ -4573,6 +4766,11 @@ static bool kvm_is_vm_type_supported(unsigned long type) return type < 32 && (kvm_caps.supported_vm_types & BIT(type)); } +static inline u64 kvm_sync_valid_fields(struct kvm *kvm) +{ + return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS; +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r = 0; @@ -4585,17 +4783,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXT_CPUID: case KVM_CAP_EXT_EMUL_CPUID: case KVM_CAP_CLOCKSOURCE: +#ifdef CONFIG_KVM_IOAPIC case KVM_CAP_PIT: + case KVM_CAP_PIT2: + case KVM_CAP_PIT_STATE2: + case KVM_CAP_REINJECT_CONTROL: +#endif case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: case KVM_CAP_USER_NMI: - case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_IOEVENTFD: case KVM_CAP_IOEVENTFD_NO_LENGTH: - case KVM_CAP_PIT2: - case KVM_CAP_PIT_STATE2: + case KVM_CAP_SET_IDENTITY_MAP_ADDR: case KVM_CAP_VCPU_EVENTS: #ifdef CONFIG_KVM_HYPERV @@ -4653,6 +4854,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_MEMORY_FAULT_INFO: case KVM_CAP_X86_GUEST_MODE: + case KVM_CAP_ONE_REG: r = 1; break; case KVM_CAP_PRE_FAULT_MEMORY: @@ -4681,21 +4883,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_SYNC_REGS: - r = KVM_SYNC_X86_VALID_FIELDS; + r = kvm_sync_valid_fields(kvm); break; case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_VALID_FLAGS; break; case KVM_CAP_X86_DISABLE_EXITS: - r = KVM_X86_DISABLE_EXITS_PAUSE; - - if (!mitigate_smt_rsb) { - r |= KVM_X86_DISABLE_EXITS_HLT | - KVM_X86_DISABLE_EXITS_CSTATE; - - if (kvm_can_mwait_in_guest()) - r |= KVM_X86_DISABLE_EXITS_MWAIT; - } + r = kvm_get_allowed_disable_exits(); break; case KVM_CAP_X86_SMM: if (!IS_ENABLED(CONFIG_KVM_SMM)) @@ -4716,6 +4910,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; + if (kvm) + r = kvm->max_vcpus; break; case KVM_CAP_MAX_VCPU_ID: r = KVM_MAX_VCPU_IDS; @@ -4771,7 +4967,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; break; case KVM_CAP_DISABLE_QUIRKS2: - r = KVM_X86_VALID_QUIRKS; + r = kvm_caps.supported_quirks; break; case KVM_CAP_X86_NOTIFY_VMEXIT: r = kvm_caps.has_notify_vmexit; @@ -4942,21 +5138,18 @@ out: return r; } -static void wbinvd_ipi(void *garbage) -{ - wbinvd(); -} - static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) { return kvm_arch_has_noncoherent_dma(vcpu->kvm); } +static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu); + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - vcpu->arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); if (vcpu->scheduled_out && pmu->version && pmu->event_count) { pmu->need_cleanup = true; @@ -4968,12 +5161,24 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_x86_call(has_wbinvd_exit)()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) - smp_call_function_single(vcpu->cpu, - wbinvd_ipi, NULL, 1); + wbinvd_on_cpu(vcpu->cpu); } kvm_x86_call(vcpu_load)(vcpu, cpu); + if (vcpu != per_cpu(last_vcpu, cpu)) { + /* + * Flush the branch predictor when switching vCPUs on the same + * physical CPU, as each vCPU needs its own branch prediction + * domain. No IBPB is needed when switching between L1 and L2 + * on the same vCPU unless IBRS is advertised to the vCPU; that + * is handled on the nested VM-Exit path. + */ + if (static_branch_likely(&switch_vcpu_ibpb)) + indirect_branch_prediction_barrier(); + per_cpu(last_vcpu, cpu) = vcpu; + } + /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); @@ -4994,7 +5199,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u64 offset = kvm_compute_l1_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); - vcpu->arch.tsc_catchup = 1; + if (!vcpu->arch.guest_tsc_protected) + vcpu->arch.tsc_catchup = 1; } if (kvm_lapic_hv_timer_in_use(vcpu)) @@ -5093,6 +5299,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { + if (vcpu->arch.apic->guest_apic_protected) + return -EINVAL; + kvm_x86_call(sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); @@ -5103,6 +5312,9 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, { int r; + if (vcpu->arch.apic->guest_apic_protected) + return -EINVAL; + r = kvm_apic_set_state(vcpu, s); if (r) return r; @@ -5424,12 +5636,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) return -EINVAL; - /* INITs are latched while in SMM */ - if (events->flags & KVM_VCPUEVENT_VALID_SMM && - (events->smi.smm || events->smi.pending) && - vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) - return -EINVAL; - process_nmi(vcpu); /* @@ -5733,8 +5939,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; ns = get_kvmclock_base_ns(); - kvm->arch.user_set_tsc = true; - __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); + __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched, true); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); r = 0; @@ -5822,15 +6027,140 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; - if (vcpu->arch.pv_cpuid.enforce) - kvm_update_pv_runtime(vcpu); - return 0; default: return -EINVAL; } } +struct kvm_x86_reg_id { + __u32 index; + __u8 type; + __u8 rsvd1; + __u8 rsvd2:4; + __u8 size:4; + __u8 x86; +}; + +static int kvm_translate_kvm_reg(struct kvm_vcpu *vcpu, + struct kvm_x86_reg_id *reg) +{ + switch (reg->index) { + case KVM_REG_GUEST_SSP: + /* + * FIXME: If host-initiated accesses are ever exempted from + * ignore_msrs (in kvm_do_msr_access()), drop this manual check + * and rely on KVM's standard checks to reject accesses to regs + * that don't exist. + */ + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return -EINVAL; + + reg->type = KVM_X86_REG_TYPE_MSR; + reg->index = MSR_KVM_INTERNAL_GUEST_SSP; + break; + default: + return -EINVAL; + } + return 0; +} + +static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) +{ + u64 val; + + if (do_get_msr(vcpu, msr, &val)) + return -EINVAL; + + if (put_user(val, user_val)) + return -EFAULT; + + return 0; +} + +static int kvm_set_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) +{ + u64 val; + + if (get_user(val, user_val)) + return -EFAULT; + + if (do_set_msr(vcpu, msr, &val)) + return -EINVAL; + + return 0; +} + +static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, + void __user *argp) +{ + struct kvm_one_reg one_reg; + struct kvm_x86_reg_id *reg; + u64 __user *user_val; + bool load_fpu; + int r; + + if (copy_from_user(&one_reg, argp, sizeof(one_reg))) + return -EFAULT; + + if ((one_reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86) + return -EINVAL; + + reg = (struct kvm_x86_reg_id *)&one_reg.id; + if (reg->rsvd1 || reg->rsvd2) + return -EINVAL; + + if (reg->type == KVM_X86_REG_TYPE_KVM) { + r = kvm_translate_kvm_reg(vcpu, reg); + if (r) + return r; + } + + if (reg->type != KVM_X86_REG_TYPE_MSR) + return -EINVAL; + + if ((one_reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64) + return -EINVAL; + + guard(srcu)(&vcpu->kvm->srcu); + + load_fpu = is_xstate_managed_msr(vcpu, reg->index); + if (load_fpu) + kvm_load_guest_fpu(vcpu); + + user_val = u64_to_user_ptr(one_reg.addr); + if (ioctl == KVM_GET_ONE_REG) + r = kvm_get_one_msr(vcpu, reg->index, user_val); + else + r = kvm_set_one_msr(vcpu, reg->index, user_val); + + if (load_fpu) + kvm_put_guest_fpu(vcpu); + return r; +} + +static int kvm_get_reg_list(struct kvm_vcpu *vcpu, + struct kvm_reg_list __user *user_list) +{ + u64 nr_regs = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ? 1 : 0; + u64 user_nr_regs; + + if (get_user(user_nr_regs, &user_list->n)) + return -EFAULT; + + if (put_user(nr_regs, &user_list->n)) + return -EFAULT; + + if (user_nr_regs < nr_regs) + return -E2BIG; + + if (nr_regs && + put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0])) + return -EFAULT; + + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -5947,6 +6277,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp, srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } + case KVM_GET_ONE_REG: + case KVM_SET_ONE_REG: + r = kvm_get_set_one_reg(vcpu, ioctl, argp); + break; + case KVM_GET_REG_LIST: + r = kvm_get_reg_list(vcpu, argp); + break; case KVM_TPR_ACCESS_REPORTING: { struct kvm_tpr_access_ctl tac; @@ -6127,6 +6464,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_tsc_khz; r = -EINVAL; + + if (vcpu->arch.guest_tsc_protected) + goto out; + user_tsc_khz = (u32)arg; if (kvm_caps.has_tsc_control && @@ -6284,6 +6625,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_SET_DEVICE_ATTR: r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); break; + case KVM_MEMORY_ENCRYPT_OP: + r = -ENOTTY; + if (!kvm_x86_ops.vcpu_mem_enc_ioctl) + goto out; + r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp); + break; default: r = -EINVAL; } @@ -6330,135 +6677,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return 0; } -static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - struct kvm_pic *pic = kvm->arch.vpic; - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, &pic->pics[0], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, &pic->pics[1], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - kvm_get_ioapic(kvm, &chip->chip.ioapic); - break; - default: - r = -EINVAL; - break; - } - return r; -} - -static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - struct kvm_pic *pic = kvm->arch.vpic; - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic->lock); - memcpy(&pic->pics[0], &chip->chip.pic, - sizeof(struct kvm_pic_state)); - spin_unlock(&pic->lock); - break; - case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic->lock); - memcpy(&pic->pics[1], &chip->chip.pic, - sizeof(struct kvm_pic_state)); - spin_unlock(&pic->lock); - break; - case KVM_IRQCHIP_IOAPIC: - kvm_set_ioapic(kvm, &chip->chip.ioapic); - break; - default: - r = -EINVAL; - break; - } - kvm_pic_update_irq(pic); - return r; -} - -static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; - - BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); - - mutex_lock(&kps->lock); - memcpy(ps, &kps->channels, sizeof(*ps)); - mutex_unlock(&kps->lock); - return 0; -} - -static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - int i; - struct kvm_pit *pit = kvm->arch.vpit; - - mutex_lock(&pit->pit_state.lock); - memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); - for (i = 0; i < 3; i++) - kvm_pit_load_count(pit, i, ps->channels[i].count, 0); - mutex_unlock(&pit->pit_state.lock); - return 0; -} - -static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, - sizeof(ps->channels)); - ps->flags = kvm->arch.vpit->pit_state.flags; - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - memset(&ps->reserved, 0, sizeof(ps->reserved)); - return 0; -} - -static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - int start = 0; - int i; - u32 prev_legacy, cur_legacy; - struct kvm_pit *pit = kvm->arch.vpit; - - mutex_lock(&pit->pit_state.lock); - prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; - cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; - if (!prev_legacy && cur_legacy) - start = 1; - memcpy(&pit->pit_state.channels, &ps->channels, - sizeof(pit->pit_state.channels)); - pit->pit_state.flags = ps->flags; - for (i = 0; i < 3; i++) - kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, - start && i == 0); - mutex_unlock(&pit->pit_state.lock); - return 0; -} - -static int kvm_vm_ioctl_reinject(struct kvm *kvm, - struct kvm_reinject_control *control) -{ - struct kvm_pit *pit = kvm->arch.vpit; - - /* pit->pit_state.lock was overloaded to prevent userspace from getting - * an inconsistent state after running multiple KVM_REINJECT_CONTROL - * ioctls in parallel. Use a separate lock if that ioctl isn't rare. - */ - mutex_lock(&pit->pit_state.lock); - kvm_pit_set_reinject(pit, control->pit_reinject); - mutex_unlock(&pit->pit_state.lock); - - return 0; -} - void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -6471,25 +6689,13 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) struct kvm_vcpu *vcpu; unsigned long i; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; kvm_for_each_vcpu(i, vcpu, kvm) kvm_vcpu_kick(vcpu); } -int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, - bool line_status) -{ - if (!irqchip_in_kernel(kvm)) - return -ENXIO; - - irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event->irq, irq_event->level, - line_status); - return 0; -} - int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -6501,11 +6707,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, switch (cap->cap) { case KVM_CAP_DISABLE_QUIRKS2: r = -EINVAL; - if (cap->args[0] & ~KVM_X86_VALID_QUIRKS) + if (cap->args[0] & ~kvm_caps.supported_quirks) break; fallthrough; case KVM_CAP_DISABLE_QUIRKS: - kvm->arch.disabled_quirks = cap->args[0]; + kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks; r = 0; break; case KVM_CAP_SPLIT_IRQCHIP: { @@ -6542,30 +6748,26 @@ split_irqchip_unlock: break; case KVM_CAP_X86_DISABLE_EXITS: r = -EINVAL; - if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) + if (cap->args[0] & ~kvm_get_allowed_disable_exits()) break; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) - kvm->arch.pause_in_guest = true; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) + goto disable_exits_unlock; #define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \ "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests." - if (!mitigate_smt_rsb) { - if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() && - (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) - pr_warn_once(SMT_RSB_MSG); - - if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && - kvm_can_mwait_in_guest()) - kvm->arch.mwait_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) - kvm->arch.hlt_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) - kvm->arch.cstate_in_guest = true; - } + if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) && + cpu_smt_possible() && + (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE | + KVM_X86_DISABLE_EXITS_APERFMPERF))) + pr_warn_once(SMT_RSB_MSG); + kvm_disable_exits(kvm, cap->args[0]); r = 0; +disable_exits_unlock: + mutex_unlock(&kvm->lock); break; case KVM_CAP_MSR_PLATFORM_INFO: kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; @@ -6848,7 +7050,11 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, kvm_free_msr_filter(old_filter); - kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); + /* + * Recalc MSR intercepts as userspace may want to intercept accesses to + * MSRs that KVM would otherwise pass through to the guest. + */ + kvm_make_all_cpus_request(kvm, KVM_REQ_RECALC_INTERCEPTS); return 0; } @@ -6914,23 +7120,15 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i; - int ret = 0; - - mutex_lock(&kvm->lock); - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!vcpu->arch.pv_time.active) - continue; - ret = kvm_set_guest_paused(vcpu); - if (ret) { - kvm_err("Failed to pause guest VCPU%d: %d\n", - vcpu->vcpu_id, ret); - break; - } - } - mutex_unlock(&kvm->lock); + /* + * Ignore the return, marking the guest paused only "fails" if the vCPU + * isn't using kvmclock; continuing on is correct and desirable. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + (void)kvm_set_guest_paused(vcpu); - return ret ? NOTIFY_BAD : NOTIFY_DONE; + return NOTIFY_DONE; } int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) @@ -7002,14 +7200,29 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + + if (ioctl == KVM_MEMORY_ENCRYPT_OP && + kvm_x86_ops.vcpu_mem_enc_unlocked_ioctl) + return kvm_x86_call(vcpu_mem_enc_unlocked_ioctl)(vcpu, argp); + + return -ENOIOCTLCMD; +} + int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r = -ENOTTY; + +#ifdef CONFIG_KVM_IOAPIC /* * This union makes it completely explicit to gcc-3.x - * that these two variables' stack usage should be + * that these three variables' stack usage should be * combined, not added together. */ union { @@ -7017,6 +7230,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) struct kvm_pit_state2 ps2; struct kvm_pit_config pit_config; } u; +#endif switch (ioctl) { case KVM_SET_TSS_ADDR: @@ -7040,6 +7254,7 @@ set_identity_unlock: case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); break; +#ifdef CONFIG_KVM_IOAPIC case KVM_CREATE_IRQCHIP: { mutex_lock(&kvm->lock); @@ -7047,6 +7262,15 @@ set_identity_unlock: if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + /* + * Disallow an in-kernel I/O APIC if the VM has protected EOIs, + * i.e. if KVM can't intercept EOIs and thus can't properly + * emulate level-triggered interrupts. + */ + r = -ENOTTY; + if (kvm->arch.has_protected_eoi) + goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; @@ -7061,7 +7285,7 @@ set_identity_unlock: goto create_irqchip_unlock; } - r = kvm_setup_default_irq_routing(kvm); + r = kvm_setup_default_ioapic_and_pic_routing(kvm); if (r) { kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); @@ -7109,7 +7333,7 @@ set_identity_unlock: } r = -ENXIO; - if (!irqchip_kernel(kvm)) + if (!irqchip_full(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -7133,7 +7357,7 @@ set_identity_unlock: } r = -ENXIO; - if (!irqchip_kernel(kvm)) + if (!irqchip_full(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); set_irqchip_out: @@ -7206,6 +7430,7 @@ set_pit2_out: r = kvm_vm_ioctl_reinject(kvm, &control); break; } +#endif case KVM_SET_BOOT_CPU_ID: r = 0; mutex_lock(&kvm->lock); @@ -7276,23 +7501,25 @@ set_pit2_out: if (user_tsc_khz == 0) user_tsc_khz = tsc_khz; - WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz); - r = 0; - + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz); + r = 0; + } + mutex_unlock(&kvm->lock); goto out; } case KVM_GET_TSC_KHZ: { r = READ_ONCE(kvm->arch.default_tsc_khz); goto out; } - case KVM_MEMORY_ENCRYPT_OP: { + case KVM_MEMORY_ENCRYPT_OP: r = -ENOTTY; if (!kvm_x86_ops.mem_enc_ioctl) goto out; r = kvm_x86_call(mem_enc_ioctl)(kvm, argp); break; - } case KVM_MEMORY_ENCRYPT_REG_REGION: { struct kvm_enc_region region; @@ -7431,6 +7658,7 @@ static void kvm_probe_msr_to_save(u32 msr_index) case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) return; break; @@ -7443,6 +7671,24 @@ static void kvm_probe_msr_to_save(u32 msr_index) if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)) return; break; + case MSR_IA32_XSS: + if (!kvm_caps.supported_xss) + return; + break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + return; + break; + case MSR_IA32_INT_SSP_TAB: + if (!kvm_cpu_cap_has(X86_FEATURE_LM)) + return; + fallthrough; + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK)) + return; + break; default: break; } @@ -7562,7 +7808,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } -EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) @@ -7573,7 +7819,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, access |= PFERR_WRITE_MASK; return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } -EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_write); /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, @@ -7659,7 +7905,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -EXPORT_SYMBOL_GPL(kvm_read_guest_virt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_virt); static int emulator_read_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, @@ -7726,12 +7972,12 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { /* kvm_write_guest_virt_system can pull in tons of pages. */ - vcpu->arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } -EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_virt_system); static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len) @@ -7765,7 +8011,7 @@ int handle_ud(struct kvm_vcpu *vcpu) return kvm_emulate_instruction(vcpu, emul_type); } -EXPORT_SYMBOL_GPL(handle_ud); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_ud); static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t gpa, bool write) @@ -7986,7 +8232,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, return rc; if (!vcpu->mmio_nr_fragments) - return rc; + return X86EMUL_CONTINUE; gpa = vcpu->mmio_fragments[0].gpa; @@ -8231,8 +8477,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, - wbinvd_ipi, NULL, 1); + wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask); put_cpu(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); } else @@ -8245,7 +8490,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) kvm_emulate_wbinvd_noskip(vcpu); return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wbinvd); @@ -8432,7 +8677,7 @@ static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_get_msr_with_filter(vcpu, msr_index, pdata); + r = kvm_emulate_msr_read(vcpu, msr_index, pdata); if (r < 0) return X86EMUL_UNHANDLEABLE; @@ -8455,7 +8700,7 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_set_msr_with_filter(vcpu, msr_index, data); + r = kvm_emulate_msr_write(vcpu, msr_index, data); if (r < 0) return X86EMUL_UNHANDLEABLE; @@ -8475,7 +8720,16 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + /* + * Treat emulator accesses to the current shadow stack pointer as host- + * initiated, as they aren't true MSR accesses (SSP is a "just a reg"), + * and this API is used only for implicit accesses, i.e. not RDMSR, and + * so the index is fully KVM-controlled. + */ + if (unlikely(msr_index == MSR_KVM_INTERNAL_GUEST_SSP)) + return kvm_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); + + return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); } static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc) @@ -8511,17 +8765,17 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) { - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); + return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); } static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) { - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); + return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); } static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt) { - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); + return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); } static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt) @@ -8549,11 +8803,6 @@ static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt) return is_smm(emul_to_vcpu(ctxt)); } -static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt) -{ - return is_guest_mode(emul_to_vcpu(ctxt)); -} - #ifndef CONFIG_KVM_SMM static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) { @@ -8567,6 +8816,14 @@ static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); } +static int emulator_get_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr) +{ + if (index != XCR_XFEATURE_ENABLED_MASK) + return 1; + *xcr = emul_to_vcpu(ctxt)->arch.xcr0; + return 0; +} + static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) { return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); @@ -8637,9 +8894,9 @@ static const struct x86_emulate_ops emulate_ops = { .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible, .set_nmi_mask = emulator_set_nmi_mask, .is_smm = emulator_is_smm, - .is_guest_mode = emulator_is_guest_mode, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, + .get_xcr = emulator_get_xcr, .set_xcr = emulator_set_xcr, .get_untagged_addr = emulator_get_untagged_addr, .is_canonical_addr = emulator_is_canonical_addr, @@ -8740,7 +8997,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) kvm_set_rflags(vcpu, ctxt->eflags); } } -EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_realmode_interrupt); static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, u8 ndata, u8 *insn_bytes, u8 insn_size) @@ -8805,13 +9062,47 @@ void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, { prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0); } -EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_prepare_emulation_failure_exit); void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) { __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0); } -EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_emulation_failure_exit); + +void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + u32 reason, intr_info, error_code; + struct kvm_run *run = vcpu->run; + u64 info1, info2; + int ndata = 0; + + kvm_x86_call(get_exit_info)(vcpu, &reason, &info1, &info2, + &intr_info, &error_code); + + run->internal.data[ndata++] = info2; + run->internal.data[ndata++] = reason; + run->internal.data[ndata++] = info1; + run->internal.data[ndata++] = gpa; + run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; + + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + run->internal.ndata = ndata; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_event_vectoring_exit); + +void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason) +{ + vcpu_unimpl(vcpu, "unexpected exit reason 0x%llx\n", exit_reason); + + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_unexpected_reason_exit); static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { @@ -8921,7 +9212,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) if (unlikely(!r)) return 0; - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); + kvm_pmu_instruction_retired(vcpu); /* * rflags is the old, "raw" value of the flags. The new value has @@ -8935,7 +9226,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) r = kvm_vcpu_do_singlestep(vcpu); return r; } -EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_skip_emulated_instruction); static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu) { @@ -9041,6 +9332,23 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) return false; } +static bool is_soft_int_instruction(struct x86_emulate_ctxt *ctxt, + int emulation_type) +{ + u8 vector = EMULTYPE_GET_SOFT_INT_VECTOR(emulation_type); + + switch (ctxt->b) { + case 0xcc: + return vector == BP_VECTOR; + case 0xcd: + return vector == ctxt->src.val; + case 0xce: + return vector == OF_VECTOR; + default: + return false; + } +} + /* * Decode an instruction for emulation. The caller is responsible for handling * code breakpoints. Note, manually detecting code breakpoints is unnecessary @@ -9066,7 +9374,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, return r; } -EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(x86_decode_emulated_instruction); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len) @@ -9085,11 +9393,20 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) return 1; + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) + return 1; + + if (r == X86EMUL_UNHANDLEABLE_VECTORING) { + kvm_prepare_event_vectoring_exit(vcpu, cr2_or_gpa); + return 0; + } + WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE); return handle_emulation_failure(vcpu, emulation_type); } - vcpu->arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); if (!(emulation_type & EMULTYPE_NO_DECODE)) { kvm_clear_exception_queue(vcpu); @@ -9142,6 +9459,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * injecting single-step #DBs. */ if (emulation_type & EMULTYPE_SKIP) { + if (emulation_type & EMULTYPE_SKIP_SOFT_INT && + !is_soft_int_instruction(ctxt, emulation_type)) + return 0; + if (ctxt->mode != X86EMUL_MODE_PROT64) ctxt->eip = (u32)ctxt->_eip; else @@ -9191,7 +9512,14 @@ restart: ctxt->exception.address = 0; } - r = x86_emulate_insn(ctxt); + /* + * Check L1's instruction intercepts when emulating instructions for + * L2, unless KVM is re-emulating a previously decoded instruction, + * e.g. to complete userspace I/O, in which case KVM has already + * checked the intercepts. + */ + r = x86_emulate_insn(ctxt, is_guest_mode(vcpu) && + !(emulation_type & EMULTYPE_NO_DECODE)); if (r == EMULATION_INTERCEPTED) return 1; @@ -9246,9 +9574,9 @@ writeback: */ if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); + kvm_pmu_instruction_retired(vcpu); if (ctxt->is_branch) - kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); + kvm_pmu_branch_retired(vcpu); kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); @@ -9274,14 +9602,14 @@ int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); } -EXPORT_SYMBOL_GPL(kvm_emulate_instruction); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_instruction); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void *insn, int insn_len) { return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len); } -EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_instruction_from_buffer); static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu) { @@ -9293,7 +9621,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu) { vcpu->arch.pio.count = 0; - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) return 1; return kvm_skip_emulated_instruction(vcpu); @@ -9318,7 +9646,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, complete_fast_pio_out_port_0x7e; kvm_skip_emulated_instruction(vcpu); } else { - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_out; } return 0; @@ -9331,7 +9659,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* We should only ever be called with arch.pio.count equal to 1 */ BUG_ON(vcpu->arch.pio.count != 1); - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) { + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) { vcpu->arch.pio.count = 0; return 1; } @@ -9360,7 +9688,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, return ret; } - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_in; return 0; @@ -9376,7 +9704,7 @@ int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in) ret = kvm_fast_pio_out(vcpu, size, port); return ret && kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_fast_pio); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fast_pio); static int kvmclock_cpu_down_prep(unsigned int cpu) { @@ -9693,12 +10021,24 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) * with an exception. PAT[0] is set to WB on RESET and also by the * kernel, i.e. failure indicates a kernel bug or broken firmware. */ - if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || + if (rdmsrq_safe(MSR_IA32_CR_PAT, &host_pat) || (host_pat & GENMASK(2, 0)) != 6) { pr_err("host PAT[0] is not WB\n"); return -EIO; } + if (boot_cpu_has(X86_FEATURE_SHSTK) || boot_cpu_has(X86_FEATURE_IBT)) { + rdmsrq(MSR_IA32_S_CET, kvm_host.s_cet); + /* + * Linux doesn't yet support supervisor shadow stacks (SSS), so + * KVM doesn't save/restore the associated MSRs, i.e. KVM may + * clobber the host values. Yell and refuse to load if SSS is + * unexpectedly enabled, e.g. to avoid crashing the host. + */ + if (WARN_ON_ONCE(kvm_host.s_cet & CET_SHSTK_EN)) + return -EIO; + } + memset(&kvm_caps, 0, sizeof(kvm_caps)); x86_emulator_cache = kvm_alloc_emulator_cache(); @@ -9707,17 +10047,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return -ENOMEM; } - user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); - if (!user_return_msrs) { - pr_err("failed to allocate percpu kvm_user_return_msrs\n"); - r = -ENOMEM; - goto out_free_x86_emulator_cache; - } - kvm_nr_uret_msrs = 0; - r = kvm_mmu_vendor_module_init(); if (r) - goto out_free_percpu; + goto out_free_x86_emulator_cache; kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM); kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P; @@ -9727,20 +10059,30 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } - rdmsrl_safe(MSR_EFER, &kvm_host.efer); + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + rdmsrq(MSR_IA32_XSS, kvm_host.xss); + kvm_caps.supported_xss = kvm_host.xss & KVM_SUPPORTED_XSS; + } + + kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS; + kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, kvm_host.xss); + rdmsrq_safe(MSR_EFER, &kvm_host.efer); kvm_init_pmu_capability(ops->pmu_ops); if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); + rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); + + WARN_ON_ONCE(kvm_nr_uret_msrs); r = ops->hardware_setup(); if (r != 0) goto out_mmu_exit; + enable_device_posted_irqs &= enable_apicv && + irq_remapping_cap(IRQ_POSTING_CAP); + kvm_ops_update(ops); for_each_online_cpu(cpu) { @@ -9770,12 +10112,22 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); + /* KVM always ignores guest PAT for shadow paging. */ + if (!tdp_enabled) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) kvm_caps.supported_xss = 0; -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) - cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); -#undef __kvm_cpu_cap_has + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + + if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + } if (kvm_caps.has_tsc_control) { /* @@ -9796,14 +10148,13 @@ out_unwind_ops: kvm_x86_ops.enable_virtualization_cpu = NULL; kvm_x86_call(hardware_unsetup)(); out_mmu_exit: + kvm_destroy_user_return_msrs(); kvm_mmu_vendor_module_exit(); -out_free_percpu: - free_percpu(user_return_msrs); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); return r; } -EXPORT_SYMBOL_GPL(kvm_x86_vendor_init); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_vendor_init); void kvm_x86_vendor_exit(void) { @@ -9826,8 +10177,8 @@ void kvm_x86_vendor_exit(void) cancel_work_sync(&pvclock_gtod_work); #endif kvm_x86_call(hardware_unsetup)(); + kvm_destroy_user_return_msrs(); kvm_mmu_vendor_module_exit(); - free_percpu(user_return_msrs); kmem_cache_destroy(x86_emulator_cache); #ifdef CONFIG_KVM_XEN static_key_deferred_flush(&kvm_xen_enabled); @@ -9837,7 +10188,7 @@ void kvm_x86_vendor_exit(void) kvm_x86_ops.enable_virtualization_cpu = NULL; mutex_unlock(&vendor_module_lock); } -EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_vendor_exit); #ifdef CONFIG_X86_64 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, @@ -9901,7 +10252,7 @@ bool kvm_apicv_activated(struct kvm *kvm) { return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0); } -EXPORT_SYMBOL_GPL(kvm_apicv_activated); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apicv_activated); bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu) { @@ -9911,7 +10262,7 @@ bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu) return (vm_reasons | vcpu_reasons) == 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_apicv_activated); static void set_or_clear_apicv_inhibit(unsigned long *inhibits, enum kvm_apicv_inhibit reason, bool set) @@ -9951,8 +10302,11 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) rcu_read_lock(); map = rcu_dereference(vcpu->kvm->arch.apic_map); - if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) - target = map->phys_map[dest_id]->vcpu; + if (likely(map) && dest_id <= map->max_apic_id) { + dest_id = array_index_nospec(dest_id, map->max_apic_id + 1); + if (map->phys_map[dest_id]) + target = map->phys_map[dest_id]->vcpu; + } rcu_read_unlock(); @@ -9976,19 +10330,24 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) { u64 ret = vcpu->run->hypercall.ret; - if (!is_64_bit_mode(vcpu)) + if (!is_64_bit_hypercall(vcpu)) ret = (u32)ret; kvm_rax_write(vcpu, ret); - ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); } -unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, - unsigned long a0, unsigned long a1, - unsigned long a2, unsigned long a3, - int op_64_bit, int cpl) +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, + int (*complete_hypercall)(struct kvm_vcpu *)) { unsigned long ret; + unsigned long nr = kvm_rax_read(vcpu); + unsigned long a0 = kvm_rbx_read(vcpu); + unsigned long a1 = kvm_rcx_read(vcpu); + unsigned long a2 = kvm_rdx_read(vcpu); + unsigned long a3 = kvm_rsi_read(vcpu); + int op_64_bit = is_64_bit_hypercall(vcpu); + + ++vcpu->stat.hypercalls; trace_kvm_hypercall(nr, a0, a1, a2, a3); @@ -10041,7 +10400,7 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, u64 gpa = a0, npages = a1, attrs = a2; ret = -KVM_ENOSYS; - if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) break; if (!PAGE_ALIGNED(gpa) || !npages || @@ -10052,6 +10411,13 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + vcpu->run->hypercall.ret = 0; vcpu->run->hypercall.args[0] = gpa; vcpu->run->hypercall.args[1] = npages; vcpu->run->hypercall.args[2] = attrs; @@ -10060,8 +10426,7 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE; WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ); - vcpu->arch.complete_userspace_io = complete_hypercall_exit; - /* stat is incremented on completion. */ + vcpu->arch.complete_userspace_io = complete_hypercall; return 0; } default: @@ -10070,43 +10435,23 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr, } out: - ++vcpu->stat.hypercalls; - return ret; + vcpu->run->hypercall.ret = ret; + return 1; } -EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(____kvm_emulate_hypercall); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { - unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit; - int cpl; - if (kvm_xen_hypercall_enabled(vcpu->kvm)) return kvm_xen_hypercall(vcpu); if (kvm_hv_hypercall_enabled(vcpu)) return kvm_hv_hypercall(vcpu); - nr = kvm_rax_read(vcpu); - a0 = kvm_rbx_read(vcpu); - a1 = kvm_rcx_read(vcpu); - a2 = kvm_rdx_read(vcpu); - a3 = kvm_rsi_read(vcpu); - op_64_bit = is_64_bit_hypercall(vcpu); - cpl = kvm_x86_call(get_cpl)(vcpu); - - ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl); - if (nr == KVM_HC_MAP_GPA_RANGE && !ret) - /* MAP_GPA tosses the request to the user space. */ - return 0; - - if (!op_64_bit) - ret = (u32)ret; - kvm_rax_write(vcpu, ret); - - return kvm_skip_emulated_instruction(vcpu); + return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu), + complete_hypercall_exit); } -EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_hypercall); static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) { @@ -10549,7 +10894,7 @@ out: preempt_enable(); up_read(&vcpu->kvm->arch.apicv_update_lock); } -EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_update_apicv); static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { @@ -10625,7 +10970,7 @@ void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, __kvm_set_or_clear_apicv_inhibit(kvm, reason, set); up_write(&kvm->arch.apicv_update_lock); } -EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_or_clear_apicv_inhibit); static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { @@ -10633,13 +10978,16 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) return; bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); + vcpu->arch.highest_stale_pending_ioapic_eoi = -1; kvm_x86_call(sync_pir_to_irr)(vcpu); if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); +#ifdef CONFIG_KVM_IOAPIC else if (ioapic_in_kernel(vcpu->kvm)) kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); +#endif if (is_guest_mode(vcpu)) vcpu->arch.load_eoi_exitmap_pending = true; @@ -10693,6 +11041,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); fastpath_t exit_fastpath; + u64 run_flags, debug_ctl; bool req_immediate_exit = false; @@ -10840,8 +11189,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_vcpu_update_apicv(vcpu); if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); - if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) - kvm_x86_call(msr_filter_changed)(vcpu); + + if (kvm_check_request(KVM_REQ_RECALC_INTERCEPTS, vcpu)) + kvm_x86_call(recalc_intercepts)(vcpu); if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) kvm_x86_call(update_cpu_dirty_logging)(vcpu); @@ -10937,28 +11287,55 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto cancel_injection; } - if (req_immediate_exit) + run_flags = 0; + if (req_immediate_exit) { + run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT; kvm_make_request(KVM_REQ_EVENT, vcpu); + } fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_return(); if (vcpu->arch.guest_fpu.xfd_err) - wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); - if (unlikely(vcpu->arch.switch_db_regs)) { - set_debugreg(0, 7); + kvm_load_xfeatures(vcpu, true); + + if (unlikely(vcpu->arch.switch_db_regs && + !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { + set_debugreg(DR7_FIXED_1, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + run_flags |= KVM_RUN_LOAD_GUEST_DR6; } else if (unlikely(hw_breakpoint_active())) { - set_debugreg(0, 7); + set_debugreg(DR7_FIXED_1, 7); } + /* + * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL + * can be modified in IRQ context, e.g. via SMP function calls. Inform + * vendor code if any host-owned bits were changed, e.g. so that the + * value loaded into hardware while running the guest can be updated. + */ + debug_ctl = get_debugctlmsr(); + if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL && + !vcpu->arch.guest_state_protected) + run_flags |= KVM_RUN_LOAD_DEBUGCTL; + vcpu->arch.host_debugctl = debug_ctl; + guest_timing_enter_irqoff(); + /* + * Swap PKRU with hardware breakpoints disabled to minimize the number + * of flows where non-KVM code can run with guest state loaded. + */ + kvm_load_guest_pkru(vcpu); + for (;;) { /* * Assert that vCPU vs. VM APICv state is consistent. An APICv @@ -10969,8 +11346,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); - exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, - req_immediate_exit); + exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; @@ -10982,10 +11358,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) break; } + run_flags = 0; + /* Note, VM-Exits that go down the "slow" path are accounted below. */ ++vcpu->stat.exits; } + kvm_load_host_pkru(vcpu); + /* * Do this here before restoring debug registers on the host. And * since we do this before handling the vmexit, a DR access vmexit @@ -10994,6 +11374,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); + WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH); kvm_x86_call(sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); @@ -11015,6 +11396,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + kvm_load_xfeatures(vcpu, false); + /* * Sync xfd before calling handle_exit_irqoff() which may * rely on the fact that guest_fpu::xfd is up-to-date (e.g. @@ -11026,7 +11409,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_call(handle_exit_irqoff)(vcpu); if (vcpu->arch.guest_fpu.xfd_err) - wrmsrl(MSR_IA32_XFD_ERR, 0); + wrmsrq(MSR_IA32_XFD_ERR, 0); + + /* + * Mark this CPU as needing a branch predictor flush before running + * userspace. Must be done before enabling preemption to ensure it gets + * set for the CPU that actually ran the guest, and not the CPU that it + * may migrate to. + */ + if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER)) + this_cpu_write(x86_ibpb_exit_to_user, true); /* * Consume any pending interrupts, including the possible source of @@ -11064,7 +11456,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * Profile KVM exit RIPs: */ - if (unlikely(prof_on == KVM_PROFILING)) { + if (unlikely(prof_on == KVM_PROFILING && + !vcpu->arch.guest_state_protected)) { unsigned long rip = kvm_rip_read(vcpu); profile_hit(KVM_PROFILING, (void *)rip); } @@ -11097,7 +11490,7 @@ static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted); } -static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) return true; @@ -11106,9 +11499,6 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) kvm_apic_init_sipi_allowed(vcpu)) return true; - if (vcpu->arch.pv.pv_unhalted) - return true; - if (kvm_is_exception_pending(vcpu)) return true; @@ -11146,10 +11536,12 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return false; } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_has_events); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); + return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted || + kvm_vcpu_has_events(vcpu); } /* Called within kvm->srcu read side. */ @@ -11207,9 +11599,7 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: case KVM_MP_STATE_AP_RESET_HOLD: - vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); fallthrough; case KVM_MP_STATE_RUNNABLE: vcpu->arch.apf.halted = false; @@ -11264,7 +11654,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (__xfer_to_guest_mode_work_pending()) { kvm_vcpu_srcu_read_unlock(vcpu); - r = xfer_to_guest_mode_handle_work(vcpu); + r = kvm_xfer_to_guest_mode_handle_work(vcpu); kvm_vcpu_srcu_read_lock(vcpu); if (r) return r; @@ -11285,10 +11675,9 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) */ ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - if (kvm_vcpu_has_events(vcpu)) - vcpu->arch.pv.pv_unhalted = false; - else - vcpu->arch.mp_state = state; + if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted) + state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, state); return 1; } else { vcpu->run->exit_reason = reason; @@ -11300,7 +11689,7 @@ int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) { return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); } -EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt_noskip); int kvm_emulate_halt(struct kvm_vcpu *vcpu) { @@ -11311,17 +11700,11 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) */ return kvm_emulate_halt_noskip(vcpu) && ret; } -EXPORT_SYMBOL_GPL(kvm_emulate_halt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) { - int ret; - - kvm_vcpu_srcu_read_lock(vcpu); - ret = kvm_emulate_halt(vcpu); - kvm_vcpu_srcu_read_unlock(vcpu); - - if (!ret) + if (!kvm_emulate_halt(vcpu)) return EXIT_FASTPATH_EXIT_USERSPACE; if (kvm_vcpu_running(vcpu)) @@ -11329,7 +11712,7 @@ fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_EXIT_HANDLED; } -EXPORT_SYMBOL_GPL(handle_fastpath_hlt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_hlt); int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) { @@ -11338,7 +11721,7 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret; } -EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_ap_reset_hold); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { @@ -11444,6 +11827,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { + if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) + return; + /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); @@ -11452,17 +11838,47 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { + if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) + return; + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } +static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + /* + * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and + * tracks the pending SIPI separately. SIPI_RECEIVED is still accepted + * by KVM_SET_VCPU_EVENTS for backwards compatibility, but should be + * converted to INIT_RECEIVED. + */ + if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) + return -EINVAL; + + /* + * Disallow running the vCPU if userspace forced it into an impossible + * MP_STATE, e.g. if the vCPU is in WFS but SIPI is blocked. + */ + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED && + !kvm_apic_init_sipi_allowed(vcpu)) + return -EINVAL; + + return kvm_x86_call(vcpu_pre_run)(vcpu); +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; + u64 sync_valid_fields; int r; + r = kvm_mmu_post_init_vm(vcpu->kvm); + if (r) + return r; + vcpu_load(vcpu); kvm_sigset_activate(vcpu); kvm_run->flags = 0; @@ -11502,8 +11918,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || - (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { + sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm); + if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) || + (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) { r = -EINVAL; goto out; } @@ -11553,7 +11970,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - r = kvm_x86_call(vcpu_pre_run)(vcpu); + r = kvm_x86_vcpu_pre_run(vcpu); if (r <= 0) goto out; @@ -11561,7 +11978,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) out: kvm_put_guest_fpu(vcpu); - if (kvm_run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected)) store_regs(vcpu); post_kvm_run_save(vcpu); kvm_vcpu_srcu_read_unlock(vcpu); @@ -11746,8 +12163,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int r; vcpu_load(vcpu); - if (kvm_mpx_supported()) - kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); r = kvm_apic_accept_events(vcpu); if (r < 0) @@ -11762,8 +12178,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state = vcpu->arch.mp_state; out: - if (kvm_mpx_supported()) - kvm_put_guest_fpu(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); vcpu_put(vcpu); return r; } @@ -11793,21 +12208,16 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, } /* - * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow - * forcing the guest into INIT/SIPI if those events are supposed to be - * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state - * if an SMI is pending as well. + * SIPI_RECEIVED is obsolete and no longer used internally; KVM instead + * leaves the vCPU in INIT_RECIEVED (Wait-For-SIPI) and pends the SIPI. + * Translate SIPI_RECEIVED as appropriate for backwards compatibility. */ - if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) && - (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || - mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) - goto out; - if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED; set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); - } else - vcpu->arch.mp_state = mp_state->mp_state; + } + + kvm_set_mp_state(vcpu, mp_state->mp_state); kvm_make_request(KVM_REQ_EVENT, vcpu); ret = 0; @@ -11822,6 +12232,25 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) { + u64 u_cet, s_cet; + + /* + * Check both User and Supervisor on task switches as inter- + * privilege level task switches are impacted by CET at both + * the current privilege level and the new privilege level, and + * that information is not known at this time. The expectation + * is that the guest won't require emulation of task switches + * while using IBT or Shadow Stacks. + */ + if (__kvm_emulate_msr_read(vcpu, MSR_IA32_U_CET, &u_cet) || + __kvm_emulate_msr_read(vcpu, MSR_IA32_S_CET, &s_cet)) + goto unhandled_task_switch; + + if ((u_cet | s_cet) & (CET_ENDBR_EN | CET_SHSTK_EN)) + goto unhandled_task_switch; + } + init_emulate_ctxt(vcpu); ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, @@ -11831,19 +12260,21 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, * Report an error userspace if MMIO is needed, as KVM doesn't support * MMIO during a task switch (or any other complex operation). */ - if (ret || vcpu->mmio_needed) { - vcpu->mmio_needed = false; - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } + if (ret || vcpu->mmio_needed) + goto unhandled_task_switch; kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); return 1; + +unhandled_task_switch: + vcpu->mmio_needed = false; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; } -EXPORT_SYMBOL_GPL(kvm_task_switch); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_task_switch); static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -11934,7 +12365,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && !is_protmode(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); return 0; } @@ -12237,9 +12668,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm); if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_UNINITIALIZED); r = kvm_mmu_create(vcpu); if (r < 0) @@ -12276,9 +12707,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto free_emulate_ctxt; } - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); - kvm_async_pf_hash_reset(vcpu); if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { @@ -12301,6 +12729,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_xen_init_vcpu(vcpu); vcpu_load(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz); kvm_vcpu_reset(vcpu, false); kvm_init_mmu(vcpu); @@ -12326,8 +12755,6 @@ fail_mmu_destroy: void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vcpu->kvm; - if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); @@ -12338,18 +12765,20 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) vcpu->arch.msr_kvm_poll_control = 1; mutex_unlock(&vcpu->mutex); - - if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - int idx; + int idx, cpu; + + kvm_clear_async_pf_completion_queue(vcpu); + kvm_mmu_unload(vcpu); kvmclock_reset(vcpu); + for_each_possible_cpu(cpu) + cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL); + kvm_x86_call(vcpu_free)(vcpu); kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); @@ -12369,6 +12798,53 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvfree(vcpu->arch.cpuid_entries); } +static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; + u64 xfeatures_mask; + bool fpu_in_use; + int i; + + /* + * Guest FPU state is zero allocated and so doesn't need to be manually + * cleared on RESET, i.e. during vCPU creation. + */ + if (!init_event || !fpstate) + return; + + /* + * On INIT, only select XSTATE components are zeroed, most components + * are unchanged. Currently, the only components that are zeroed and + * supported by KVM are MPX and CET related. + */ + xfeatures_mask = (kvm_caps.supported_xcr0 | kvm_caps.supported_xss) & + (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR | + XFEATURE_MASK_CET_ALL); + if (!xfeatures_mask) + return; + + BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX); + + /* + * Unload guest FPU state (if necessary) before zeroing XSTATE fields + * as the kernel can only modify the state when its resident in memory, + * i.e. when it's not loaded into hardware. + * + * WARN if the vCPU's desire to run, i.e. whether or not its in KVM_RUN, + * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the + * only path that can trigger INIT emulation _and_ loads FPU state, and + * KVM_RUN should _always_ load FPU state. + */ + WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use); + fpu_in_use = fpstate->in_use; + if (fpu_in_use) + kvm_put_guest_fpu(vcpu); + for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX) + fpstate_clear_xstate_component(fpstate, i); + if (fpu_in_use) + kvm_load_guest_fpu(vcpu); +} + void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_cpuid_entry2 *cpuid_0x1; @@ -12426,22 +12902,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { - struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; - - /* - * All paths that lead to INIT are required to load the guest's - * FPU state (because most paths are buried in KVM_RUN). - */ - if (init_event) - kvm_put_guest_fpu(vcpu); - - fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); - fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); - - if (init_event) - kvm_load_guest_fpu(vcpu); - } + kvm_xstate_reset(vcpu, init_event); if (!init_event) { vcpu->arch.smbase = 0x30000; @@ -12453,7 +12914,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); - __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); + kvm_msr_write(vcpu, MSR_IA32_XSS, 0); } /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ @@ -12519,7 +12980,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (init_event) kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } -EXPORT_SYMBOL_GPL(kvm_vcpu_reset); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_reset); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { @@ -12531,7 +12992,7 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); kvm_rip_write(vcpu, 0); } -EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_deliver_sipi_vector); void kvm_arch_enable_virtualization(void) { @@ -12642,13 +13103,28 @@ int kvm_arch_enable_virtualization_cpu(void) void kvm_arch_disable_virtualization_cpu(void) { kvm_x86_call(disable_virtualization_cpu)(); - drop_user_return_notifiers(); + + /* + * Leave the user-return notifiers as-is when disabling virtualization + * for reboot, i.e. when disabling via IPI function call, and instead + * pin kvm.ko (if it's a module) to defend against use-after-free (in + * the *very* unlikely scenario module unload is racing with reboot). + * On a forced reboot, tasks aren't frozen before shutdown, and so KVM + * could be actively modifying user-return MSR state when the IPI to + * disable virtualization arrives. Handle the extreme edge case here + * instead of trying to account for it in the normal flows. + */ + if (in_task() || WARN_ON_ONCE(!kvm_rebooting)) + drop_user_return_notifiers(); + else + __module_get(THIS_MODULE); } bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_reset_bsp); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { @@ -12678,26 +13154,22 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Decided by the vendor code for other VM types. */ kvm->arch.pre_fault_allowed = type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; + kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks; ret = kvm_page_track_init(kvm); if (ret) goto out; - kvm_mmu_init_vm(kvm); + ret = kvm_mmu_init_vm(kvm); + if (ret) + goto out_cleanup_page_track; ret = kvm_x86_call(vm_init)(kvm); if (ret) goto out_uninit_mmu; - INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); atomic_set(&kvm->arch.noncoherent_dma_count, 0); - /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ - set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ - set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - &kvm->arch.irq_sources_bitmap); - raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); @@ -12717,52 +13189,28 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.hv_root_tdp = INVALID_PAGE; #endif - INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); - INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); - kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); kvm_xen_init_vm(kvm); + if (ignore_msrs && !report_ignored_msrs) { + pr_warn_once("Running KVM with ignore_msrs=1 and report_ignored_msrs=0 is not a\n" + "a supported configuration. Lying to the guest about the existence of MSRs\n" + "may cause the guest operating system to hang or produce errors. If a guest\n" + "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n"); + } + + once_init(&kvm->arch.nx_once); return 0; out_uninit_mmu: kvm_mmu_uninit_vm(kvm); +out_cleanup_page_track: kvm_page_track_cleanup(kvm); out: return ret; } -int kvm_arch_post_init_vm(struct kvm *kvm) -{ - return kvm_mmu_post_init_vm(kvm); -} - -static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); -} - -static void kvm_unload_vcpu_mmus(struct kvm *kvm) -{ - unsigned long i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_unload_vcpu_mmu(vcpu); - } -} - -void kvm_arch_sync_events(struct kvm *kvm) -{ - cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); - cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_pit(kvm); -} - /** * __x86_set_memory_region: Setup KVM internal memory slot * @@ -12793,7 +13241,8 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot; - /* Called with kvm->slots_lock held. */ + lockdep_assert_held(&kvm->slots_lock); + if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) return ERR_PTR_USR(-EINVAL); @@ -12826,7 +13275,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, m.guest_phys_addr = gpa; m.userspace_addr = hva; m.memory_size = size; - r = __kvm_set_memory_region(kvm, &m); + r = kvm_set_internal_memslot(kvm, &m); if (r < 0) return ERR_PTR_USR(r); } @@ -12836,11 +13285,22 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, return (void __user *)hva; } -EXPORT_SYMBOL_GPL(__x86_set_memory_region); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__x86_set_memory_region); void kvm_arch_pre_destroy_vm(struct kvm *kvm) { + /* + * Stop all background workers and kthreads before destroying vCPUs, as + * iterating over vCPUs in a different task while vCPUs are being freed + * is unsafe, i.e. will lead to use-after-free. The PIT also needs to + * be stopped before IRQ routing is freed. + */ +#ifdef CONFIG_KVM_IOAPIC + kvm_free_pit(kvm); +#endif + kvm_mmu_pre_destroy_vm(kvm); + static_call_cond(kvm_x86_vm_pre_destroy)(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -12859,18 +13319,19 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } - kvm_unload_vcpu_mmus(kvm); - kvm_x86_call(vm_destroy)(kvm); + kvm_destroy_vcpus(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); +#ifdef CONFIG_KVM_IOAPIC kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); - kvm_destroy_vcpus(kvm); +#endif kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); kvm_xen_destroy_vm(kvm); kvm_hv_destroy_vm(kvm); + kvm_x86_call(vm_destroy)(kvm); } static void memslot_rmap_free(struct kvm_memory_slot *slot) @@ -12927,7 +13388,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, /* * Clear out the previous array pointers for the KVM_MR_MOVE case. The - * old arrays will be freed by __kvm_set_memory_region() if installing + * old arrays will be freed by kvm_set_memory_region() if installing * the new memslot is successful. */ memset(&slot->arch, 0, sizeof(slot->arch)); @@ -13020,6 +13481,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn()) return -EINVAL; + if (kvm_is_gfn_alias(kvm, new->base_gfn + new->npages - 1)) + return -EINVAL; + return kvm_alloc_memslot_metadata(kvm, new); } @@ -13036,7 +13500,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) { int nr_slots; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging); @@ -13108,7 +13572,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (READ_ONCE(eager_page_split)) kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K); - if (kvm_x86_ops.cpu_dirty_log_size) { + if (kvm->arch.cpu_dirty_log_size) { kvm_mmu_slot_leaf_clear_dirty(kvm, new); kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); } else { @@ -13226,13 +13690,13 @@ unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) + kvm_rip_read(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_get_linear_rip); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_linear_rip); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) { return kvm_get_linear_rip(vcpu) == linear_rip; } -EXPORT_SYMBOL_GPL(kvm_is_linear_rip); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_linear_rip); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) { @@ -13243,7 +13707,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) rflags &= ~X86_EFLAGS_TF; return rflags; } -EXPORT_SYMBOL_GPL(kvm_get_rflags); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_rflags); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { @@ -13258,7 +13722,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) __kvm_set_rflags(vcpu, rflags); kvm_make_request(KVM_REQ_EVENT, vcpu); } -EXPORT_SYMBOL_GPL(kvm_set_rflags); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_rflags); static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) { @@ -13361,8 +13825,8 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) if (!kvm_pv_async_pf_enabled(vcpu)) return false; - if (vcpu->arch.apf.send_user_only && - kvm_x86_call(get_cpl)(vcpu) == 0) + if (!vcpu->arch.apf.send_always && + (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu))) return false; if (is_guest_mode(vcpu)) { @@ -13447,18 +13911,22 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, if ((work->wakeup_all || work->notpresent_injected) && kvm_pv_async_pf_enabled(vcpu) && !apf_put_user_ready(vcpu, work->arch.token)) { - vcpu->arch.apf.pageready_pending = true; + WRITE_ONCE(vcpu->arch.apf.pageready_pending, true); kvm_apic_set_irq(vcpu, &irq, NULL); } vcpu->arch.apf.halted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) { kvm_make_request(KVM_REQ_APF_READY, vcpu); - if (!vcpu->arch.apf.pageready_pending) + + /* Pairs with smp_store_mb() in kvm_set_msr_common(). */ + smp_mb__after_atomic(); + + if (!READ_ONCE(vcpu->arch.apf.pageready_pending)) kvm_vcpu_kick(vcpu); } @@ -13470,25 +13938,6 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu); } -void kvm_arch_start_assignment(struct kvm *kvm) -{ - if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) - kvm_x86_call(pi_start_assignment)(kvm); -} -EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); - -void kvm_arch_end_assignment(struct kvm *kvm) -{ - atomic_dec(&kvm->arch.assigned_device_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); - -bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) -{ - return raw_atomic_read(&kvm->arch.assigned_device_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); - static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) { /* @@ -13497,8 +13946,10 @@ static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first * (or last) non-coherent device is (un)registered to so that new SPTEs * with the correct "ignore guest PAT" setting are created. + * + * If KVM always honors guest PAT, however, there is nothing to do. */ - if (kvm_mmu_may_ignore_guest_pat()) + if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT)) kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); } @@ -13507,93 +13958,34 @@ void kvm_arch_register_noncoherent_dma(struct kvm *kvm) if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1) kvm_noncoherent_dma_assignment_start_or_stop(kvm); } -EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) { if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count)) kvm_noncoherent_dma_assignment_start_or_stop(kvm); } -EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) { return atomic_read(&kvm->arch.noncoherent_dma_count); } -EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); - -bool kvm_arch_has_irq_bypass(void) -{ - return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); -} - -int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - int ret; - - irqfd->producer = prod; - kvm_arch_start_assignment(irqfd->kvm); - ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, - prod->irq, irqfd->gsi, 1); - if (ret) - kvm_arch_end_assignment(irqfd->kvm); - - return ret; -} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_arch_has_noncoherent_dma); -void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - int ret; - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - WARN_ON(irqfd->producer != prod); - irqfd->producer = NULL; - - /* - * When producer of consumer is unregistered, we change back to - * remapped mode, so we can re-use the current implementation - * when the irq is masked/disabled or the consumer side (KVM - * int this case doesn't want to receive the interrupts. - */ - ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, - prod->irq, irqfd->gsi, 0); - if (ret) - printk(KERN_INFO "irq bypass consumer (token %p) unregistration" - " fails: %d\n", irqfd->consumer.token, ret); - - kvm_arch_end_assignment(irqfd->kvm); -} - -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) -{ - return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set); -} - -bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, - struct kvm_kernel_irq_routing_entry *new) -{ - if (new->type != KVM_IRQ_ROUTING_MSI) - return true; - - return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); -} - -bool kvm_vector_hashing_enabled(void) +bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { - return vector_hashing; + return (vcpu->arch.msr_kvm_poll_control & 1) == 0; } -bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) +#ifdef CONFIG_KVM_GUEST_MEMFD +/* + * KVM doesn't yet support initializing guest_memfd memory as shared for VMs + * with private memory (the private vs. shared tracking needs to be moved into + * guest_memfd). + */ +bool kvm_arch_supports_gmem_init_shared(struct kvm *kvm) { - return (vcpu->arch.msr_kvm_poll_control & 1) == 0; + return !kvm_arch_has_private_mem(kvm); } -EXPORT_SYMBOL_GPL(kvm_arch_no_poll); #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) @@ -13608,6 +14000,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) kvm_x86_call(gmem_invalidate)(start, end); } #endif +#endif int kvm_spec_ctrl_test_value(u64 value) { @@ -13622,18 +14015,18 @@ int kvm_spec_ctrl_test_value(u64 value) local_irq_save(flags); - if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value)) + if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value)) ret = 1; - else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value)) + else if (wrmsrq_safe(MSR_IA32_SPEC_CTRL, value)) ret = 1; else - wrmsrl(MSR_IA32_SPEC_CTRL, saved_value); + wrmsrq(MSR_IA32_SPEC_CTRL, saved_value); local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spec_ctrl_test_value); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) { @@ -13658,7 +14051,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c } vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault); } -EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fixup_and_inject_pf_error); /* * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns @@ -13687,7 +14080,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, return 0; } -EXPORT_SYMBOL_GPL(kvm_handle_memory_failure); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_memory_failure); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) { @@ -13751,7 +14144,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return 1; } } -EXPORT_SYMBOL_GPL(kvm_handle_invpcid); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_invpcid); static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu) { @@ -13836,7 +14229,7 @@ int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, return 0; } -EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_mmio_write); int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, void *data) @@ -13874,7 +14267,7 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, return 0; } -EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_mmio_read); static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size) { @@ -13962,10 +14355,11 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, return in ? kvm_sev_es_ins(vcpu, size, port) : kvm_sev_es_outs(vcpu, size, port); } -EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_string_io); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); @@ -13982,7 +14376,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); @@ -13997,6 +14390,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault); static int __init kvm_x86_init(void) { + kvm_init_xstate_sizes(); + kvm_mmu_x86_module_init(); mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible(); return 0; |
