diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
| -rw-r--r-- | arch/x86/kvm/x86.c | 9310 |
1 files changed, 5675 insertions, 3635 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c6dc1b445231..0c6d899d53dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -15,6 +15,7 @@ * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "irq.h" @@ -24,12 +25,14 @@ #include "tss.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" +#include "mmu/page_track.h" #include "x86.h" #include "cpuid.h" #include "pmu.h" #include "hyperv.h" #include "lapic.h" #include "xen.h" +#include "smm.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -41,7 +44,6 @@ #include <linux/mman.h> #include <linux/highmem.h> #include <linux/iommu.h> -#include <linux/intel-iommu.h> #include <linux/cpufreq.h> #include <linux/user-return-notifier.h> #include <linux/srcu.h> @@ -57,9 +59,10 @@ #include <linux/sched/stat.h> #include <linux/sched/isolation.h> #include <linux/mem_encrypt.h> -#include <linux/entry-kvm.h> #include <linux/suspend.h> +#include <linux/smp.h> +#include <trace/events/ipi.h> #include <trace/events/kvm.h> #include <asm/debugreg.h> @@ -68,7 +71,9 @@ #include <asm/mce.h> #include <asm/pkru.h> #include <linux/kernel_stat.h> -#include <asm/fpu/internal.h> /* Ugh! */ +#include <asm/fpu/api.h> +#include <asm/fpu/xcr.h> +#include <asm/fpu/xstate.h> #include <asm/pvclock.h> #include <asm/div64.h> #include <asm/irq_remapping.h> @@ -84,9 +89,19 @@ #include "trace.h" #define MAX_IO_MSRS 256 -#define KVM_MAX_MCE_BANKS 32 -u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; -EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); + +/* + * Note, kvm_caps fields should *never* have default values, all fields must be + * recomputed from scratch during vendor module load, e.g. to account for a + * vendor module being reloaded with different module parameters. + */ +struct kvm_caps kvm_caps __read_mostly; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_caps); + +struct kvm_host_values kvm_host __read_mostly; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_host); + +#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) #define emul_to_vcpu(ctxt) \ ((struct kvm_vcpu *)(ctxt)->vcpu) @@ -102,87 +117,78 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; - #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) +#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE + #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); -static void process_smi(struct kvm_vcpu *vcpu); -static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); +static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu); static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); +static DEFINE_MUTEX(vendor_module_lock); +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); + struct kvm_x86_ops kvm_x86_ops __read_mostly; -EXPORT_SYMBOL_GPL(kvm_x86_ops); #define KVM_X86_OP(func) \ DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ *(((struct kvm_x86_ops *)0)->func)); -#define KVM_X86_OP_NULL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); -EXPORT_STATIC_CALL_GPL(kvm_x86_tlb_flush_current); static bool __read_mostly ignore_msrs = 0; -module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); +module_param(ignore_msrs, bool, 0644); bool __read_mostly report_ignored_msrs = true; -module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); -EXPORT_SYMBOL_GPL(report_ignored_msrs); +module_param(report_ignored_msrs, bool, 0644); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs); unsigned int min_timer_period_us = 200; -module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); - -static bool __read_mostly kvmclock_periodic_sync = true; -module_param(kvmclock_periodic_sync, bool, S_IRUGO); - -bool __read_mostly kvm_has_tsc_control; -EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 __read_mostly kvm_max_guest_tsc_khz; -EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); -u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; -EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); -u64 __read_mostly kvm_max_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -u64 __read_mostly kvm_default_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); -bool __read_mostly kvm_has_bus_lock_exit; -EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit); +module_param(min_timer_period_us, uint, 0644); /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; -module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +module_param(tsc_tolerance_ppm, uint, 0644); + +bool __read_mostly enable_vmware_backdoor = false; +module_param(enable_vmware_backdoor, bool, 0444); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_vmware_backdoor); /* - * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables - * adaptive tuning starting from default advancement of 1000ns. '0' disables - * advancement entirely. Any other value is used as-is and disables adaptive - * tuning, i.e. allows privileged userspace to set an exact advancement time. + * Flags to manipulate forced emulation behavior (any non-zero value will + * enable forced emulation). */ -static int __read_mostly lapic_timer_advance_ns = -1; -module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); +#define KVM_FEP_CLEAR_RFLAGS_RF BIT(1) +static int __read_mostly force_emulation_prefix; +module_param(force_emulation_prefix, int, 0644); -static bool __read_mostly vector_hashing = true; -module_param(vector_hashing, bool, S_IRUGO); +int __read_mostly pi_inject_timer = -1; +module_param(pi_inject_timer, bint, 0644); -bool __read_mostly enable_vmware_backdoor = false; -module_param(enable_vmware_backdoor, bool, S_IRUGO); -EXPORT_SYMBOL_GPL(enable_vmware_backdoor); +/* Enable/disable PMU virtualization */ +bool __read_mostly enable_pmu = true; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu); +module_param(enable_pmu, bool, 0444); -static bool __read_mostly force_emulation_prefix = false; -module_param(force_emulation_prefix, bool, S_IRUGO); +bool __read_mostly eager_page_split = true; +module_param(eager_page_split, bool, 0644); -int __read_mostly pi_inject_timer = -1; -module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); +/* Enable/disable SMT_RSB bug mitigation */ +static bool __read_mostly mitigate_smt_rsb; +module_param(mitigate_smt_rsb, bool, 0444); /* * Restoring the host value for MSRs that are only consumed when running in @@ -201,28 +207,35 @@ struct kvm_user_return_msrs { }; u32 __read_mostly kvm_nr_uret_msrs; -EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs); static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; -static struct kvm_user_return_msrs __percpu *user_return_msrs; +static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ - | XFEATURE_MASK_PKRU) + | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) -u64 __read_mostly host_efer; -EXPORT_SYMBOL_GPL(host_efer); +#define XFEATURE_MASK_CET_ALL (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL) +/* + * Note, KVM supports exposing PT to the guest, but does not support context + * switching PT via XSTATE (KVM's PT virtualization relies on perf; swapping + * PT via guest XSTATE would clobber perf state), i.e. KVM doesn't support + * IA32_XSS[bit 8] (guests can/must use RDMSR/WRMSR to save/restore PT MSRs). + */ +#define KVM_SUPPORTED_XSS (XFEATURE_MASK_CET_ALL) bool __read_mostly allow_smaller_maxphyaddr = 0; -EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(allow_smaller_maxphyaddr); bool __read_mostly enable_apicv = true; -EXPORT_SYMBOL_GPL(enable_apicv); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_apicv); -u64 __read_mostly host_xss; -EXPORT_SYMBOL_GPL(host_xss); -u64 __read_mostly supported_xss; -EXPORT_SYMBOL_GPL(supported_xss); +bool __read_mostly enable_ipiv = true; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_ipiv); + +bool __read_mostly enable_device_posted_irqs = true; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_device_posted_irqs); const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS(), @@ -233,12 +246,13 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, mmu_recycled), STATS_DESC_COUNTER(VM, mmu_cache_miss), STATS_DESC_ICOUNTER(VM, mmu_unsync), - STATS_DESC_ICOUNTER(VM, lpages), + STATS_DESC_ICOUNTER(VM, pages_4k), + STATS_DESC_ICOUNTER(VM, pages_2m), + STATS_DESC_ICOUNTER(VM, pages_1g), STATS_DESC_ICOUNTER(VM, nx_lpage_splits), + STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -251,7 +265,12 @@ const struct kvm_stats_header kvm_vm_stats_header = { const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, pf_taken), STATS_DESC_COUNTER(VCPU, pf_fixed), + STATS_DESC_COUNTER(VCPU, pf_emulate), + STATS_DESC_COUNTER(VCPU, pf_spurious), + STATS_DESC_COUNTER(VCPU, pf_fast), + STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created), STATS_DESC_COUNTER(VCPU, pf_guest), STATS_DESC_COUNTER(VCPU, tlb_flush), STATS_DESC_COUNTER(VCPU, invlpg), @@ -276,10 +295,11 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, nested_run), STATS_DESC_COUNTER(VCPU, directed_yield_attempted), STATS_DESC_COUNTER(VCPU, directed_yield_successful), - STATS_DESC_ICOUNTER(VCPU, guest_mode) + STATS_DESC_COUNTER(VCPU, preemption_reported), + STATS_DESC_COUNTER(VCPU, preemption_other), + STATS_DESC_IBOOLEAN(VCPU, guest_mode), + STATS_DESC_COUNTER(VCPU, notify_window_exits), }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -290,33 +310,246 @@ const struct kvm_stats_header kvm_vcpu_stats_header = { sizeof(kvm_vcpu_stats_desc), }; -u64 __read_mostly host_xcr0; -u64 __read_mostly supported_xcr0; -EXPORT_SYMBOL_GPL(supported_xcr0); +static struct kmem_cache *x86_emulator_cache; -static struct kmem_cache *x86_fpu_cache; +/* + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that + * require host support, i.e. should be probed via RDMSR. emulated_msrs holds + * MSRs that KVM emulates without strictly requiring host support. + * msr_based_features holds MSRs that enumerate features, i.e. are effectively + * CPUID leafs. Note, msr_based_features isn't mutually exclusive with + * msrs_to_save and emulated_msrs. + */ -static struct kmem_cache *x86_emulator_cache; +static const u32 msrs_to_save_base[] = { + MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, + MSR_STAR, +#ifdef CONFIG_X86_64 + MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, +#endif + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, + MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, + MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL, + MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, + MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, + MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, + MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, + MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, + MSR_IA32_UMWAIT_CONTROL, + + MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS, + + MSR_IA32_U_CET, MSR_IA32_S_CET, + MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP, + MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB, +}; + +static const u32 msrs_to_save_pmu[] = { + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_CORE_PERF_GLOBAL_CTRL, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, + + /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */ + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + + /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */ + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, +}; + +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + + ARRAY_SIZE(msrs_to_save_pmu)]; +static unsigned num_msrs_to_save; + +static const u32 emulated_msrs_all[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + +#ifdef CONFIG_KVM_HYPERV + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, + HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, + HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, + HV_X64_MSR_RESET, + HV_X64_MSR_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, + HV_X64_MSR_SCONTROL, + HV_X64_MSR_STIMER0_CONFIG, + HV_X64_MSR_VP_ASSIST_PAGE, + HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, + HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, +#endif + + MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, + + MSR_IA32_TSC_ADJUST, + MSR_IA32_TSC_DEADLINE, + MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, + MSR_IA32_MISC_ENABLE, + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, + MSR_IA32_MCG_EXT_CTL, + MSR_IA32_SMBASE, + MSR_SMI_COUNT, + MSR_PLATFORM_INFO, + MSR_MISC_FEATURES_ENABLES, + MSR_AMD64_VIRT_SPEC_CTRL, + MSR_AMD64_TSC_RATIO, + MSR_IA32_POWER_CTL, + MSR_IA32_UCODE_REV, + + /* + * KVM always supports the "true" VMX control MSRs, even if the host + * does not. The VMX MSRs as a whole are considered "emulated" as KVM + * doesn't strictly require them to exist in the host (ignoring that + * KVM would refuse to load in the first place if the core set of MSRs + * aren't supported). + */ + MSR_IA32_VMX_BASIC, + MSR_IA32_VMX_TRUE_PINBASED_CTLS, + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, + MSR_IA32_VMX_TRUE_EXIT_CTLS, + MSR_IA32_VMX_TRUE_ENTRY_CTLS, + MSR_IA32_VMX_MISC, + MSR_IA32_VMX_CR0_FIXED0, + MSR_IA32_VMX_CR4_FIXED0, + MSR_IA32_VMX_VMCS_ENUM, + MSR_IA32_VMX_PROCBASED_CTLS2, + MSR_IA32_VMX_EPT_VPID_CAP, + MSR_IA32_VMX_VMFUNC, + + MSR_K7_HWCR, + MSR_KVM_POLL_CONTROL, +}; + +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; +static unsigned num_emulated_msrs; /* - * When called, it means the previous get/set msr reached an invalid msr. - * Return true if we want to ignore/silent this failed msr access. + * List of MSRs that control the existence of MSR-based features, i.e. MSRs + * that are effectively CPUID leafs. VMX MSRs are also included in the set of + * feature MSRs, but are handled separately to allow expedited lookups. */ -static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) +static const u32 msr_based_features_all_except_vmx[] = { + MSR_AMD64_DE_CFG, + MSR_IA32_UCODE_REV, + MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, + MSR_PLATFORM_INFO, +}; + +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) + + (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)]; +static unsigned int num_msr_based_features; + +/* + * All feature MSRs except uCode revID, which tracks the currently loaded uCode + * patch, are immutable once the vCPU model is defined. + */ +static bool kvm_is_immutable_feature_msr(u32 msr) { - const char *op = write ? "wrmsr" : "rdmsr"; + int i; - if (ignore_msrs) { - if (report_ignored_msrs) - kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", - op, msr, data); - /* Mask the error */ + if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR) return true; - } else { + + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) { + if (msr == msr_based_features_all_except_vmx[i]) + return msr != MSR_IA32_UCODE_REV; + } + + return false; +} + +static bool kvm_is_advertised_msr(u32 msr_index) +{ + unsigned int i; + + for (i = 0; i < num_msrs_to_save; i++) { + if (msrs_to_save[i] == msr_index) + return true; + } + + for (i = 0; i < num_emulated_msrs; i++) { + if (emulated_msrs[i] == msr_index) + return true; + } + + return false; +} + +typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated); + +static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr, + u64 *data, bool host_initiated, + enum kvm_msr_access rw, + msr_access_t msr_access_fn) +{ + const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr"; + int ret; + + BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W); + + /* + * Zero the data on read failures to avoid leaking stack data to the + * guest and/or userspace, e.g. if the failure is ignored below. + */ + ret = msr_access_fn(vcpu, msr, data, host_initiated); + if (ret && rw == MSR_TYPE_R) + *data = 0; + + if (ret != KVM_MSR_RET_UNSUPPORTED) + return ret; + + /* + * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM + * advertises to userspace, even if an MSR isn't fully supported. + * Simply check that @data is '0', which covers both the write '0' case + * and all reads (in which case @data is zeroed on failure; see above). + */ + if (host_initiated && !*data && kvm_is_advertised_msr(msr)) + return 0; + + if (!ignore_msrs) { kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", - op, msr, data); - return false; + op, msr, *data); + return ret; } + + if (report_ignored_msrs) + kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data); + + return 0; } static struct kmem_cache *kvm_alloc_emulator_cache(void) @@ -339,28 +572,30 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) vcpu->arch.apf.gfns[i] = ~0; } +static void kvm_destroy_user_return_msrs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered); + + kvm_nr_uret_msrs = 0; +} + static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; struct kvm_user_return_msrs *msrs = container_of(urn, struct kvm_user_return_msrs, urn); struct kvm_user_return_msr_values *values; - unsigned long flags; - /* - * Disabling irqs at this point since the following code could be - * interrupted and executed through kvm_arch_hardware_disable() - */ - local_irq_save(flags); - if (msrs->registered) { - msrs->registered = false; - user_return_notifier_unregister(urn); - } - local_irq_restore(flags); + msrs->registered = false; + user_return_notifier_unregister(urn); + for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(kvm_uret_msrs_list[slot], values->host); + wrmsrq(kvm_uret_msrs_list[slot], values->host); values->curr = values->host; } } @@ -372,10 +607,10 @@ static int kvm_probe_user_return_msr(u32 msr) int ret; preempt_disable(); - ret = rdmsrl_safe(msr, &val); + ret = rdmsrq_safe(msr, &val); if (ret) goto out; - ret = wrmsrl_safe(msr, val); + ret = wrmsrq_safe(msr, val); out: preempt_enable(); return ret; @@ -391,7 +626,7 @@ int kvm_add_user_return_msr(u32 msr) kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; return kvm_nr_uret_msrs++; } -EXPORT_SYMBOL_GPL(kvm_add_user_return_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_add_user_return_msr); int kvm_find_user_return_msr(u32 msr) { @@ -403,94 +638,75 @@ int kvm_find_user_return_msr(u32 msr) } return -1; } -EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr); static void kvm_user_return_msr_cpu_online(void) { - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); u64 value; int i; for (i = 0; i < kvm_nr_uret_msrs; ++i) { - rdmsrl_safe(kvm_uret_msrs_list[i], &value); + rdmsrq_safe(kvm_uret_msrs_list[i], &value); msrs->values[i].host = value; msrs->values[i].curr = value; } } +static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs) +{ + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; + } +} + int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); int err; value = (value & mask) | (msrs->values[slot].host & ~mask); if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); + err = wrmsrq_safe(kvm_uret_msrs_list[slot], value); if (err) return 1; msrs->values[slot].curr = value; - if (!msrs->registered) { - msrs->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&msrs->urn); - msrs->registered = true; - } + kvm_user_return_register_notifier(msrs); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); - -static void drop_user_return_notifiers(void) -{ - unsigned int cpu = smp_processor_id(); - struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); - - if (msrs->registered) - kvm_on_user_return(&msrs->urn); -} - -u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.apic_base; -} -EXPORT_SYMBOL_GPL(kvm_get_apic_base); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr); -enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) +u64 kvm_get_user_return_msr(unsigned int slot) { - return kvm_apic_mode(kvm_get_apic_base(vcpu)); + return this_cpu_ptr(&user_return_msrs)->values[slot].curr; } -EXPORT_SYMBOL_GPL(kvm_get_apic_mode); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr); -int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +static void drop_user_return_notifiers(void) { - enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); - enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); - u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | - (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); + struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs); - if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) - return 1; - if (!msr_info->host_initiated) { - if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) - return 1; - if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) - return 1; - } - - kvm_lapic_set_base(vcpu, msr_info->data); - kvm_recalculate_apic_map(vcpu->kvm); - return 0; + if (msrs->registered) + kvm_on_user_return(&msrs->urn); } -EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage __visible noinstr void kvm_spurious_fault(void) +/* + * Handle a fault on a hardware virtualization (VMX or SVM) instruction. + * + * Hardware virtualization extension instructions may fault if a reboot turns + * off virtualization while processes are running. Usually after catching the + * fault we just panic; during reboot instead the instruction is ignored. + */ +noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG_ON(!kvm_rebooting); } -EXPORT_SYMBOL_GPL(kvm_spurious_fault); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spurious_fault); #define EXCPT_BENIGN 0 #define EXCPT_CONTRIBUTORY 1 @@ -517,6 +733,7 @@ static int exception_class(int vector) #define EXCPT_TRAP 1 #define EXCPT_ABORT 2 #define EXCPT_INTERRUPT 3 +#define EXCPT_DB 4 static int exception_type(int vector) { @@ -527,8 +744,14 @@ static int exception_type(int vector) mask = 1 << vector; - /* #DB is trap, as instruction watchpoints are handled elsewhere */ - if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR))) + /* + * #DBs can be trap-like or fault-like, the caller must check other CPU + * state, e.g. DR6, to determine whether a #DB is a trap or fault. + */ + if (mask & (1 << DB_VECTOR)) + return EXCPT_DB; + + if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR))) return EXCPT_TRAP; if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) @@ -538,16 +761,13 @@ static int exception_type(int vector) return EXCPT_FAULT; } -void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) +void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, + struct kvm_queued_exception *ex) { - unsigned nr = vcpu->arch.exception.nr; - bool has_payload = vcpu->arch.exception.has_payload; - unsigned long payload = vcpu->arch.exception.payload; - - if (!has_payload) + if (!ex->has_payload) return; - switch (nr) { + switch (ex->vector) { case DB_VECTOR: /* * "Certain debug exceptions may clear bit 0-3. The @@ -572,8 +792,8 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) * So they need to be flipped for DR6. */ vcpu->arch.dr6 |= DR6_ACTIVE_LOW; - vcpu->arch.dr6 |= payload; - vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW; + vcpu->arch.dr6 |= ex->payload; + vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW; /* * The #DB payload is defined as compatible with the 'pending @@ -584,61 +804,68 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) vcpu->arch.dr6 &= ~BIT(12); break; case PF_VECTOR: - vcpu->arch.cr2 = payload; + vcpu->arch.cr2 = ex->payload; break; } - vcpu->arch.exception.has_payload = false; - vcpu->arch.exception.payload = 0; + ex->has_payload = false; + ex->payload = 0; } -EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_deliver_exception_payload); -static void kvm_multiple_exception(struct kvm_vcpu *vcpu, - unsigned nr, bool has_error, u32 error_code, - bool has_payload, unsigned long payload, bool reinject) +static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector, + bool has_error_code, u32 error_code, + bool has_payload, unsigned long payload) +{ + struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; + + ex->vector = vector; + ex->injected = false; + ex->pending = true; + ex->has_error_code = has_error_code; + ex->error_code = error_code; + ex->has_payload = has_payload; + ex->payload = payload; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error, u32 error_code, + bool has_payload, unsigned long payload) { u32 prev_nr; int class1, class2; kvm_make_request(KVM_REQ_EVENT, vcpu); + /* + * If the exception is destined for L2, morph it to a VM-Exit if L1 + * wants to intercept the exception. + */ + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) { + kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code, + has_payload, payload); + return; + } + if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: - if (reinject) { - /* - * On vmentry, vcpu->arch.exception.pending is only - * true if an event injection was blocked by - * nested_run_pending. In that case, however, - * vcpu_enter_guest requests an immediate exit, - * and the guest shouldn't proceed far enough to - * need reinjection. - */ - WARN_ON_ONCE(vcpu->arch.exception.pending); - vcpu->arch.exception.injected = true; - if (WARN_ON_ONCE(has_payload)) { - /* - * A reinjected event has already - * delivered its payload. - */ - has_payload = false; - payload = 0; - } - } else { - vcpu->arch.exception.pending = true; - vcpu->arch.exception.injected = false; - } + vcpu->arch.exception.pending = true; + vcpu->arch.exception.injected = false; + vcpu->arch.exception.has_error_code = has_error; - vcpu->arch.exception.nr = nr; + vcpu->arch.exception.vector = nr; vcpu->arch.exception.error_code = error_code; vcpu->arch.exception.has_payload = has_payload; vcpu->arch.exception.payload = payload; if (!is_guest_mode(vcpu)) - kvm_deliver_exception_payload(vcpu); + kvm_deliver_exception_payload(vcpu, + &vcpu->arch.exception); return; } /* to check exception */ - prev_nr = vcpu->arch.exception.nr; + prev_nr = vcpu->arch.exception.vector; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); @@ -646,53 +873,73 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, } class1 = exception_class(prev_nr); class2 = exception_class(nr); - if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) - || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) || + (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { /* - * Generate double fault per SDM Table 5-5. Set - * exception.pending = true so that the double fault - * can trigger a nested vmexit. + * Synthesize #DF. Clear the previously injected or pending + * exception so as not to incorrectly trigger shutdown. */ - vcpu->arch.exception.pending = true; vcpu->arch.exception.injected = false; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - vcpu->arch.exception.has_payload = false; - vcpu->arch.exception.payload = 0; - } else + vcpu->arch.exception.pending = false; + + kvm_queue_exception_e(vcpu, DF_VECTOR, 0); + } else { /* replace previous exception with a new one in a hope that instruction re-execution will regenerate lost exception */ goto queue; + } } void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); + kvm_multiple_exception(vcpu, nr, false, 0, false, 0); } -EXPORT_SYMBOL_GPL(kvm_queue_exception); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception); -void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) -{ - kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); -} -EXPORT_SYMBOL_GPL(kvm_requeue_exception); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload) { - kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); + kvm_multiple_exception(vcpu, nr, false, 0, true, payload); } -EXPORT_SYMBOL_GPL(kvm_queue_exception_p); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_p); static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload) { - kvm_multiple_exception(vcpu, nr, true, error_code, - true, payload, false); + kvm_multiple_exception(vcpu, nr, true, error_code, true, payload); } +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr, + bool has_error_code, u32 error_code) +{ + + /* + * On VM-Entry, an exception can be pending if and only if event + * injection was blocked by nested_run_pending. In that case, however, + * vcpu_enter_guest() requests an immediate exit, and the guest + * shouldn't proceed far enough to need reinjection. + */ + WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); + + /* + * Do not check for interception when injecting an event for L2, as the + * exception was checked for intercept when it was original queued, and + * re-checking is incorrect if _L1_ injected the exception, in which + * case it's exempt from interception. + */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + + vcpu->arch.exception.injected = true; + vcpu->arch.exception.has_error_code = has_error_code; + vcpu->arch.exception.vector = nr; + vcpu->arch.exception.error_code = error_code; + vcpu->arch.exception.has_payload = false; + vcpu->arch.exception.payload = 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_requeue_exception); + int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { if (err) @@ -702,24 +949,37 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) return 1; } -EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_complete_insn_gp); + +static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err) +{ + if (err) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP | + EMULTYPE_COMPLETE_USER_EXIT); +} void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { ++vcpu->stat.pf_guest; - vcpu->arch.exception.nested_apf = - is_guest_mode(vcpu) && fault->async_page_fault; - if (vcpu->arch.exception.nested_apf) { - vcpu->arch.apf.nested_apf_token = fault->address; - kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); - } else { + + /* + * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of + * whether or not L1 wants to intercept "regular" #PF. + */ + if (is_guest_mode(vcpu) && fault->async_page_fault) + kvm_queue_exception_vmexit(vcpu, PF_VECTOR, + true, fault->error_code, + true, fault->address); + else kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, fault->address); - } } -EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, +void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { struct kvm_mmu *fault_mmu; @@ -734,32 +994,24 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, */ if ((fault->error_code & PFERR_PRESENT_MASK) && !(fault->error_code & PFERR_RSVD_MASK)) - kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, - fault_mmu->root_hpa); + kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address, + KVM_MMU_ROOT_CURRENT); fault_mmu->inject_page_fault(vcpu, fault); - return fault->nested_page_fault; } -EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { atomic_inc(&vcpu->arch.nmi_queued); kvm_make_request(KVM_REQ_NMI, vcpu); } -EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); -} -EXPORT_SYMBOL_GPL(kvm_queue_exception_e); - -void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) -{ - kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); + kvm_multiple_exception(vcpu, nr, true, error_code, false, 0); } -EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_e); /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue @@ -767,46 +1019,28 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); */ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { - if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl) + if (kvm_x86_call(get_cpl)(vcpu) <= required_cpl) return true; kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; } -EXPORT_SYMBOL_GPL(kvm_require_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) { - if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE)) return true; kvm_queue_exception(vcpu, UD_VECTOR); return false; } -EXPORT_SYMBOL_GPL(kvm_require_dr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr); -/* - * This function will be used to read from the physical memory of the currently - * running guest. The difference to kvm_vcpu_read_guest_page is that this function - * can read from guest physical or from the guest's guest physical memory. - */ -int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t ngfn, void *data, int offset, int len, - u32 access) +static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) { - struct x86_exception exception; - gfn_t real_gfn; - gpa_t ngpa; - - ngpa = gfn_to_gpa(ngfn); - real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception); - if (real_gfn == UNMAPPED_GVA) - return -EFAULT; - - real_gfn = gpa_to_gfn(real_gfn); + u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; - return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); + return (vcpu->arch.apf.msr_en_val & mask) == mask; } -EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { @@ -816,76 +1050,122 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) /* * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. */ -int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; - unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; + gpa_t real_gpa; int i; int ret; u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; - ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte), - PFERR_USER_MASK|PFERR_WRITE_MASK); - if (ret < 0) { - ret = 0; - goto out; - } + /* + * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated + * to an L1 GPA. + */ + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), + PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); + if (real_gpa == INVALID_GPA) + return 0; + + /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte, + cr3 & GENMASK(11, 5), sizeof(pdpte)); + if (ret < 0) + return 0; + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & pdptr_rsvd_bits(vcpu))) { - ret = 0; - goto out; + return 0; } } - ret = 1; + + /* + * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled. + * Shadow page roots need to be reconstructed instead. + */ + if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs))) + kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT); memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); vcpu->arch.pdptrs_from_userspace = false; -out: + return 1; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(load_pdptrs); - return ret; +static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ +#ifdef CONFIG_X86_64 + if (cr0 & 0xffffffff00000000UL) + return false; +#endif + + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) + return false; + + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) + return false; + + return kvm_x86_call(is_valid_cr0)(vcpu, cr0); } -EXPORT_SYMBOL_GPL(load_pdptrs); void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) { + /* + * CR0.WP is incorporated into the MMU role, but only for non-nested, + * indirect shadow MMUs. If paging is disabled, no updates are needed + * as there are no permission bits to emulate. If TDP is enabled, the + * MMU's metadata needs to be updated, e.g. so that emulating guest + * translations does the right thing, but there's no need to unload the + * root as CR0.WP doesn't affect SPTEs. + */ + if ((cr0 ^ old_cr0) == X86_CR0_WP) { + if (!(cr0 & X86_CR0_PG)) + return; + + if (tdp_enabled) { + kvm_init_mmu(vcpu); + return; + } + } + if ((cr0 ^ old_cr0) & X86_CR0_PG) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); + /* + * Clearing CR0.PG is defined to flush the TLB from the guest's + * perspective. + */ + if (!(cr0 & X86_CR0_PG)) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + /* + * Check for async #PF completion events when enabling paging, + * as the vCPU may have previously encountered async #PFs (it's + * entirely legal for the guest to toggle paging on/off without + * waiting for the async #PF queue to drain). + */ + else if (kvm_pv_async_pf_enabled(vcpu)) + kvm_make_request(KVM_REQ_APF_READY, vcpu); } if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) kvm_mmu_reset_context(vcpu); - - if (((cr0 ^ old_cr0) & X86_CR0_CD) && - kvm_arch_has_noncoherent_dma(vcpu->kvm) && - !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); } -EXPORT_SYMBOL_GPL(kvm_post_set_cr0); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr0); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); - unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; - cr0 |= X86_CR0_ET; - -#ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) + if (!kvm_is_valid_cr0(vcpu, cr0)) return 1; -#endif - cr0 &= ~CR0_RESERVED_BITS; - - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) - return 1; + cr0 |= X86_CR0_ET; - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) - return 1; + /* Write to CR0 reserved bits are ignored, even on Intel. */ + cr0 &= ~CR0_RESERVED_BITS; #ifdef CONFIG_X86_64 if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && @@ -894,83 +1174,88 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_pae(vcpu)) return 1; - static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); + kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); if (cs_l) return 1; } #endif if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && - is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) + is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) && + !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) + return 1; + + if (!(cr0 & X86_CR0_PG) && + (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) return 1; - if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + if (!(cr0 & X86_CR0_WP) && kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) return 1; - static_call(kvm_x86_set_cr0)(vcpu, cr0); + kvm_x86_call(set_cr0)(vcpu, cr0); kvm_post_set_cr0(vcpu, old_cr0, cr0); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr0); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } -EXPORT_SYMBOL_GPL(kvm_lmsw); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw); -void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) +static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, bool load_guest) { if (vcpu->arch.guest_state_protected) return; - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) + return; - if (vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + if (vcpu->arch.xcr0 != kvm_host.xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, + load_guest ? vcpu->arch.xcr0 : kvm_host.xcr0); - if (vcpu->arch.xsaves_enabled && - vcpu->arch.ia32_xss != host_xss) - wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); - } + if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) && + vcpu->arch.ia32_xss != kvm_host.xss) + wrmsrq(MSR_IA32_XSS, load_guest ? vcpu->arch.ia32_xss : kvm_host.xss); +} + +static void kvm_load_guest_pkru(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.guest_state_protected) + return; - if (static_cpu_has(X86_FEATURE_PKU) && - (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || - (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && - vcpu->arch.pkru != vcpu->arch.host_pkru) - write_pkru(vcpu->arch.pkru); + if (cpu_feature_enabled(X86_FEATURE_PKU) && + vcpu->arch.pkru != vcpu->arch.host_pkru && + ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || + kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) + wrpkru(vcpu->arch.pkru); } -EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); -void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) +static void kvm_load_host_pkru(struct kvm_vcpu *vcpu) { if (vcpu->arch.guest_state_protected) return; - if (static_cpu_has(X86_FEATURE_PKU) && - (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || - (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { + if (cpu_feature_enabled(X86_FEATURE_PKU) && + ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || + kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) - write_pkru(vcpu->arch.host_pkru); - } - - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { - - if (vcpu->arch.xcr0 != host_xcr0) - xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - - if (vcpu->arch.xsaves_enabled && - vcpu->arch.ia32_xss != host_xss) - wrmsrl(MSR_IA32_XSS, host_xss); + wrpkru(vcpu->arch.host_pkru); } +} +#ifdef CONFIG_X86_64 +static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC; } -EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); +#endif -static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; u64 old_xcr0 = vcpu->arch.xcr0; @@ -987,7 +1272,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) /* * Do not allow the guest to set bits that we do not support * saving. However, xcr0 bit 0 is always set, even if the - * emulated CPU does not support XSAVE (see fx_init). + * emulated CPU does not support XSAVE (see kvm_vcpu_reset()). */ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; if (xcr0 & ~valid_bits) @@ -1003,16 +1288,23 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) return 1; } + + if ((xcr0 & XFEATURE_MASK_XTILE) && + ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE)) + return 1; + vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; return 0; } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_set_xcr); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { - if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || + /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */ + if (kvm_x86_call(get_cpl)(vcpu) != 0 || __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) { kvm_inject_gp(vcpu, 0); return 1; @@ -1020,33 +1312,59 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_xsetbv); -bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - if (cr4 & cr4_reserved_bits) - return false; - - if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) - return false; - - return static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); + return __kvm_is_valid_cr4(vcpu, cr4) && + kvm_x86_call(is_valid_cr4)(vcpu, cr4); } -EXPORT_SYMBOL_GPL(kvm_is_valid_cr4); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { - if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) || - (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) + if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) kvm_mmu_reset_context(vcpu); + + /* + * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB + * according to the SDM; however, stale prev_roots could be reused + * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we + * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST + * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed, + * so fall through. + */ + if (!tdp_enabled && + (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) + kvm_mmu_unload(vcpu); + + /* + * The TLB has to be flushed for all PCIDs if any of the following + * (architecturally required) changes happen: + * - CR4.PCIDE is changed from 1 to 0 + * - CR4.PGE is toggled + * + * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT. + */ + if (((cr4 ^ old_cr4) & X86_CR4_PGE) || + (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + + /* + * The TLB has to be flushed for the current PCID if any of the + * following (architecturally required) changes happen: + * - CR4.SMEP is changed from 0 to 1 + * - CR4.PAE is toggled + */ + else if (((cr4 ^ old_cr4) & X86_CR4_PAE) || + ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP))) + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } -EXPORT_SYMBOL_GPL(kvm_post_set_cr4); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP; if (!kvm_is_valid_cr4(vcpu, cr4)) return 1; @@ -1057,27 +1375,26 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if ((cr4 ^ old_cr4) & X86_CR4_LA57) return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) - && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) + && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS) + && !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) return 1; if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) - return 1; - /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) return 1; } - static_call(kvm_x86_set_cr4)(vcpu, cr4); + if ((cr4 & X86_CR4_CET) && !kvm_is_cr0_bit_set(vcpu, X86_CR0_WP)) + return 1; + + kvm_x86_call(set_cr4)(vcpu, cr4); kvm_post_set_cr4(vcpu, old_cr4, cr4); return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr4); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr4); static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) { @@ -1086,6 +1403,18 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) int i; /* + * MOV CR3 and INVPCID are usually not intercepted when using TDP, but + * this is reachable when running EPT=1 and unrestricted_guest=0, and + * also via the emulator. KVM's TDP page tables are not in the scope of + * the invalidation, but the guest's TLB entries need to be flushed as + * the CPU may have cached entries in its TLB for the target PCID. + */ + if (unlikely(tdp_enabled)) { + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + return; + } + + /* * If neither the current CR3 nor any of the prev_roots use the given * PCID, then nothing needs to be done here because a resync will * happen anyway before switching to any other CR3. @@ -1095,11 +1424,19 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } + /* + * If PCID is disabled, there is no need to free prev_roots even if the + * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB + * with PCIDE=0. + */ + if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) + return; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); - kvm_mmu_free_roots(vcpu, mmu, roots_to_free); + kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); } int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) @@ -1107,9 +1444,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) bool skip_tlb_flush = false; unsigned long pcid = 0; #ifdef CONFIG_X86_64 - bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - - if (pcid_enabled) { + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) { skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; cr3 &= ~X86_CR3_PCID_NOFLUSH; pcid = cr3 & X86_CR3_PCID_MASK; @@ -1125,17 +1460,18 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) * stuff CR3, e.g. for RSM emulation, and there is no guarantee that * the current vCPU mode is accurate. */ - if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) + if (!kvm_vcpu_is_legal_cr3(vcpu, cr3)) return 1; - if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3)) return 1; if (cr3 != kvm_read_cr3(vcpu)) kvm_mmu_new_pgd(vcpu, cr3); vcpu->arch.cr3 = cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + /* Do not call post_set_cr3, we do not get here for confidential guests. */ handle_tlb_flush: /* @@ -1150,7 +1486,7 @@ handle_tlb_flush: return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr3); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr3); int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { @@ -1162,7 +1498,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) vcpu->arch.cr8 = cr8; return 0; } -EXPORT_SYMBOL_GPL(kvm_set_cr8); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) { @@ -1171,7 +1507,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) else return vcpu->arch.cr8; } -EXPORT_SYMBOL_GPL(kvm_get_cr8); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_cr8); static void kvm_update_dr0123(struct kvm_vcpu *vcpu) { @@ -1180,7 +1516,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { for (i = 0; i < KVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } } @@ -1192,21 +1527,21 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu) dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; - static_call(kvm_x86_set_dr7)(vcpu, dr7); + kvm_x86_call(set_dr7)(vcpu, dr7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } -EXPORT_SYMBOL_GPL(kvm_update_dr7); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_update_dr7); static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { u64 fixed = DR6_FIXED_1; - if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RTM)) fixed |= DR6_RTM; - if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) fixed |= DR6_BUS_LOCK; return fixed; } @@ -1238,34 +1573,31 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } -EXPORT_SYMBOL_GPL(kvm_set_dr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_dr); -void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr) { size_t size = ARRAY_SIZE(vcpu->arch.db); switch (dr) { case 0 ... 3: - *val = vcpu->arch.db[array_index_nospec(dr, size)]; - break; + return vcpu->arch.db[array_index_nospec(dr, size)]; case 4: case 6: - *val = vcpu->arch.dr6; - break; + return vcpu->arch.dr6; case 5: default: /* 7 */ - *val = vcpu->arch.dr7; - break; + return vcpu->arch.dr7; } } -EXPORT_SYMBOL_GPL(kvm_get_dr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dr); int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_rcx_read(vcpu); + u32 pmc = kvm_rcx_read(vcpu); u64 data; - if (kvm_pmu_rdpmc(vcpu, ecx, &data)) { + if (kvm_pmu_rdpmc(vcpu, pmc, &data)) { kvm_inject_gp(vcpu, 0); return 1; } @@ -1274,170 +1606,30 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) kvm_rdx_write(vcpu, data >> 32); return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); - -/* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. - * - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) - * extract the supported MSRs from the related const lists. - * msrs_to_save is selected from the msrs_to_save_all to reflect the - * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs - * may depend on host virtualization features rather than host cpu features. - */ - -static const u32 msrs_to_save_all[] = { - MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_STAR, -#ifdef CONFIG_X86_64 - MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, -#endif - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, - MSR_IA32_SPEC_CTRL, - MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, - MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, - MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, - MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, - MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, - MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, - MSR_IA32_UMWAIT_CONTROL, - - MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, - MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, - MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, - MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, - MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, - MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, - MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, - MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, - MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, - MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, - MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, - MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, - MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, - MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, - MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, - MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, - MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, - MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, - MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, - MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, - MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, -}; - -static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; -static unsigned num_msrs_to_save; - -static const u32 emulated_msrs_all[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, - HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, - HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, - HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, - HV_X64_MSR_RESET, - HV_X64_MSR_VP_INDEX, - HV_X64_MSR_VP_RUNTIME, - HV_X64_MSR_SCONTROL, - HV_X64_MSR_STIMER0_CONFIG, - HV_X64_MSR_VP_ASSIST_PAGE, - HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, - HV_X64_MSR_TSC_EMULATION_STATUS, - HV_X64_MSR_SYNDBG_OPTIONS, - HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, - HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, - HV_X64_MSR_SYNDBG_PENDING_BUFFER, - - MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, - - MSR_IA32_TSC_ADJUST, - MSR_IA32_TSC_DEADLINE, - MSR_IA32_ARCH_CAPABILITIES, - MSR_IA32_PERF_CAPABILITIES, - MSR_IA32_MISC_ENABLE, - MSR_IA32_MCG_STATUS, - MSR_IA32_MCG_CTL, - MSR_IA32_MCG_EXT_CTL, - MSR_IA32_SMBASE, - MSR_SMI_COUNT, - MSR_PLATFORM_INFO, - MSR_MISC_FEATURES_ENABLES, - MSR_AMD64_VIRT_SPEC_CTRL, - MSR_IA32_POWER_CTL, - MSR_IA32_UCODE_REV, - - /* - * The following list leaves out MSRs whose values are determined - * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. - * We always support the "true" VMX control MSRs, even if the host - * processor does not, so I am putting these registers here rather - * than in msrs_to_save_all. - */ - MSR_IA32_VMX_BASIC, - MSR_IA32_VMX_TRUE_PINBASED_CTLS, - MSR_IA32_VMX_TRUE_PROCBASED_CTLS, - MSR_IA32_VMX_TRUE_EXIT_CTLS, - MSR_IA32_VMX_TRUE_ENTRY_CTLS, - MSR_IA32_VMX_MISC, - MSR_IA32_VMX_CR0_FIXED0, - MSR_IA32_VMX_CR4_FIXED0, - MSR_IA32_VMX_VMCS_ENUM, - MSR_IA32_VMX_PROCBASED_CTLS2, - MSR_IA32_VMX_EPT_VPID_CAP, - MSR_IA32_VMX_VMFUNC, - - MSR_K7_HWCR, - MSR_KVM_POLL_CONTROL, -}; - -static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; -static unsigned num_emulated_msrs; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdpmc); /* - * List of msr numbers which are used to expose MSR-based features that - * can be used by a hypervisor to validate requested CPU features. + * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM + * does not yet virtualize. These include: + * 10 - MISC_PACKAGE_CTRLS + * 11 - ENERGY_FILTERING_CTL + * 12 - DOITM + * 18 - FB_CLEAR_CTRL + * 21 - XAPIC_DISABLE_STATUS + * 23 - OVERCLOCKING_STATUS */ -static const u32 msr_based_features_all[] = { - MSR_IA32_VMX_BASIC, - MSR_IA32_VMX_TRUE_PINBASED_CTLS, - MSR_IA32_VMX_PINBASED_CTLS, - MSR_IA32_VMX_TRUE_PROCBASED_CTLS, - MSR_IA32_VMX_PROCBASED_CTLS, - MSR_IA32_VMX_TRUE_EXIT_CTLS, - MSR_IA32_VMX_EXIT_CTLS, - MSR_IA32_VMX_TRUE_ENTRY_CTLS, - MSR_IA32_VMX_ENTRY_CTLS, - MSR_IA32_VMX_MISC, - MSR_IA32_VMX_CR0_FIXED0, - MSR_IA32_VMX_CR0_FIXED1, - MSR_IA32_VMX_CR4_FIXED0, - MSR_IA32_VMX_CR4_FIXED1, - MSR_IA32_VMX_VMCS_ENUM, - MSR_IA32_VMX_PROCBASED_CTLS2, - MSR_IA32_VMX_EPT_VPID_CAP, - MSR_IA32_VMX_VMFUNC, - - MSR_F10H_DECFG, - MSR_IA32_UCODE_REV, - MSR_IA32_ARCH_CAPABILITIES, - MSR_IA32_PERF_CAPABILITIES, -}; -static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; -static unsigned int num_msr_based_features; +#define KVM_SUPPORTED_ARCH_CAP \ + (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \ + ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ + ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ + ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ + ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) static u64 kvm_get_arch_capabilities(void) { - u64 data = 0; - - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); + u64 data = kvm_host.arch_capabilities & KVM_SUPPORTED_ARCH_CAP; /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that @@ -1451,7 +1643,7 @@ static u64 kvm_get_arch_capabilities(void) * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. * If an outer hypervisor is doing the cache flush for us - * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that + * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that * capability to the guest too, and if EPT is disabled we're not * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will * require a nested hypervisor to do a flush of its own. @@ -1465,6 +1657,10 @@ static u64 kvm_get_arch_capabilities(void) data |= ARCH_CAP_SSB_NO; if (!boot_cpu_has_bug(X86_BUG_MDS)) data |= ARCH_CAP_MDS_NO; + if (!boot_cpu_has_bug(X86_BUG_RFDS)) + data |= ARCH_CAP_RFDS_NO; + if (!boot_cpu_has_bug(X86_BUG_ITS)) + data |= ARCH_CAP_ITS_NO; if (!boot_cpu_has(X86_FEATURE_RTM)) { /* @@ -1485,60 +1681,58 @@ static u64 kvm_get_arch_capabilities(void) */ } + if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) + data |= ARCH_CAP_GDS_NO; + return data; } -static int kvm_get_msr_feature(struct kvm_msr_entry *msr) +static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { - switch (msr->index) { + WARN_ON_ONCE(!host_initiated); + + switch (index) { case MSR_IA32_ARCH_CAPABILITIES: - msr->data = kvm_get_arch_capabilities(); + *data = kvm_get_arch_capabilities(); + break; + case MSR_IA32_PERF_CAPABILITIES: + *data = kvm_caps.supported_perf_cap; + break; + case MSR_PLATFORM_INFO: + *data = MSR_PLATFORM_INFO_CPUID_FAULT; break; case MSR_IA32_UCODE_REV: - rdmsrl_safe(msr->index, &msr->data); + rdmsrq_safe(index, data); break; default: - return static_call(kvm_x86_get_msr_feature)(msr); + return kvm_x86_call(get_feature_msr)(index, data); } return 0; } -static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - struct kvm_msr_entry msr; - int r; - - msr.index = index; - r = kvm_get_msr_feature(&msr); - - if (r == KVM_MSR_RET_INVALID) { - /* Unconditionally clear the output for simplicity */ - *data = 0; - if (kvm_msr_ignored_check(index, 0, false)) - r = 0; - } - - if (r) - return r; - - *data = msr.data; - - return 0; + return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R, + kvm_get_feature_msr); } static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) + if (efer & EFER_AUTOIBRS && !guest_cpu_cap_has(vcpu, X86_FEATURE_AUTOIBRS)) return false; - if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + if (efer & EFER_FFXSR && !guest_cpu_cap_has(vcpu, X86_FEATURE_FXSR_OPT)) + return false; + + if (efer & EFER_SVME && !guest_cpu_cap_has(vcpu, X86_FEATURE_SVM)) return false; if (efer & (EFER_LME | EFER_LMA) && - !guest_cpuid_has(vcpu, X86_FEATURE_LM)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) return false; - if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) + if (efer & EFER_NX && !guest_cpu_cap_has(vcpu, X86_FEATURE_NX)) return false; return true; @@ -1551,7 +1745,7 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) return __kvm_valid_efer(vcpu, efer); } -EXPORT_SYMBOL_GPL(kvm_valid_efer); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_valid_efer); static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { @@ -1574,16 +1768,19 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - r = static_call(kvm_x86_set_efer)(vcpu, efer); + r = kvm_x86_call(set_efer)(vcpu, efer); if (r) { WARN_ON(r > 0); return r; } - /* Update reserved bits */ - if ((efer ^ old_efer) & EFER_NX) + if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS) kvm_mmu_reset_context(vcpu); + if (!static_cpu_has(X86_FEATURE_XSAVES) && + (efer & EFER_SVME)) + kvm_hv_xsaves_xsavec_maybe_warn(vcpu); + return 0; } @@ -1591,7 +1788,7 @@ void kvm_enable_efer_bits(u64 mask) { efer_reserved_bits &= ~mask; } -EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_efer_bits); bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) { @@ -1624,7 +1821,7 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) unsigned long *bitmap = ranges[i].bitmap; if ((index >= start) && (index < end) && (flags & type)) { - allowed = !!test_bit(index - start, bitmap); + allowed = test_bit(index - start, bitmap); break; } } @@ -1634,7 +1831,7 @@ out: return allowed; } -EXPORT_SYMBOL_GPL(kvm_msr_allowed); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_msr_allowed); /* * Write @data into the MSR specified by @index. Select MSR specific fault @@ -1647,16 +1844,13 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, { struct msr_data msr; - if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) - return KVM_MSR_RET_FILTERED; - switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: case MSR_KERNEL_GS_BASE: case MSR_CSTAR: case MSR_LSTAR: - if (is_noncanonical_address(data, vcpu)) + if (is_noncanonical_msr_address(data, vcpu)) return 1; break; case MSR_IA32_SYSENTER_EIP: @@ -1673,15 +1867,15 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * value, and that something deterministic happens if the guest * invokes 64-bit SYSENTER. */ - data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); + data = __canonical_address(data, max_host_virt_addr_bits()); break; case MSR_TSC_AUX: if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) return 1; if (!host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) return 1; /* @@ -1689,34 +1883,73 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * incomplete and conflicting architectural behavior. Current * AMD CPUs completely ignore bits 63:32, i.e. they aren't * reserved and always read as zeros. Enforce Intel's reserved - * bits check if and only if the guest CPU is Intel, and clear - * the bits in all other cases. This ensures cross-vendor - * migration will provide consistent behavior for the guest. + * bits check if the guest CPU is Intel compatible, otherwise + * clear the bits. This ensures cross-vendor migration will + * provide consistent behavior for the guest. */ - if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0) + if (guest_cpuid_is_intel_compatible(vcpu) && (data >> 32) != 0) return 1; data = (u32)data; break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) + return KVM_MSR_RET_UNSUPPORTED; + if (!kvm_is_valid_u_s_cet(vcpu, data)) + return 1; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + if (!host_initiated) + return 1; + fallthrough; + /* + * Note that the MSR emulation here is flawed when a vCPU + * doesn't support the Intel 64 architecture. The expected + * architectural behavior in this case is that the upper 32 + * bits do not exist and should always read '0'. However, + * because the actual hardware on which the virtual CPU is + * running does support Intel 64, XRSTORS/XSAVES in the + * guest could observe behavior that violates the + * architecture. Intercepting XRSTORS/XSAVES for this + * special case isn't deemed worthwhile. + */ + case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return KVM_MSR_RET_UNSUPPORTED; + /* + * MSR_IA32_INT_SSP_TAB is not present on processors that do + * not support Intel 64 architecture. + */ + if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) + return KVM_MSR_RET_UNSUPPORTED; + if (is_noncanonical_msr_address(data, vcpu)) + return 1; + /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */ + if (index != MSR_IA32_INT_SSP_TAB && !IS_ALIGNED(data, 4)) + return 1; + break; } msr.data = data; msr.index = index; msr.host_initiated = host_initiated; - return static_call(kvm_x86_set_msr)(vcpu, &msr); + return kvm_x86_call(set_msr)(vcpu, &msr); +} + +static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) +{ + return __kvm_set_msr(vcpu, index, *data, host_initiated); } static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, u32 index, u64 data, bool host_initiated) { - int ret = __kvm_set_msr(vcpu, index, data, host_initiated); - - if (ret == KVM_MSR_RET_INVALID) - if (kvm_msr_ignored_check(index, data, true)) - ret = 0; - - return ret; + return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W, + _kvm_set_msr); } /* @@ -1725,83 +1958,138 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, - bool host_initiated) +static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { struct msr_data msr; int ret; - if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) - return KVM_MSR_RET_FILTERED; - switch (index) { case MSR_TSC_AUX: if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) return 1; if (!host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && - !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) return 1; break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) + return KVM_MSR_RET_UNSUPPORTED; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + if (!host_initiated) + return 1; + fallthrough; + case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return KVM_MSR_RET_UNSUPPORTED; + break; } msr.index = index; msr.host_initiated = host_initiated; - ret = static_call(kvm_x86_get_msr)(vcpu, &msr); + ret = kvm_x86_call(get_msr)(vcpu, &msr); if (!ret) *data = msr.data; return ret; } -static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, - u32 index, u64 *data, bool host_initiated) +int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) { - int ret = __kvm_get_msr(vcpu, index, data, host_initiated); + return __kvm_set_msr(vcpu, index, data, true); +} - if (ret == KVM_MSR_RET_INVALID) { - /* Unconditionally clear *data for simplicity */ - *data = 0; - if (kvm_msr_ignored_check(index, 0, false)) - ret = 0; - } +int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + return __kvm_get_msr(vcpu, index, data, true); +} - return ret; +static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 *data, bool host_initiated) +{ + return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R, + __kvm_get_msr); } -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) +int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) { return kvm_get_msr_ignored_check(vcpu, index, data, false); } -EXPORT_SYMBOL_GPL(kvm_get_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_read); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) { return kvm_set_msr_ignored_check(vcpu, index, data, false); } -EXPORT_SYMBOL_GPL(kvm_set_msr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_write); -static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - int err = vcpu->run->msr.error; - if (!err) { + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return KVM_MSR_RET_FILTERED; + + return __kvm_emulate_msr_read(vcpu, index, data); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_read); + +int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return KVM_MSR_RET_FILTERED; + + return __kvm_emulate_msr_write(vcpu, index, data); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_write); + + +static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) +{ + if (!vcpu->run->msr.error) { kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); } +} + +static int complete_emulated_msr_access(struct kvm_vcpu *vcpu) +{ + return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error); +} + +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +{ + complete_userspace_rdmsr(vcpu); + return complete_emulated_msr_access(vcpu); +} - return static_call(kvm_x86_complete_emulated_msr)(vcpu, err); +static int complete_fast_msr_access(struct kvm_vcpu *vcpu) +{ + return kvm_x86_call(complete_emulated_msr)(vcpu, vcpu->run->msr.error); } -static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) +static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) { - return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); + complete_userspace_rdmsr(vcpu); + return complete_fast_msr_access(vcpu); +} + +static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu) +{ + if (!vcpu->run->msr.error) + kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg, + vcpu->run->msr.data); + + return complete_fast_msr_access(vcpu); } static u64 kvm_msr_reason(int r) { switch (r) { - case KVM_MSR_RET_INVALID: + case KVM_MSR_RET_UNSUPPORTED: return KVM_MSR_EXIT_REASON_UNKNOWN; case KVM_MSR_RET_FILTERED: return KVM_MSR_EXIT_REASON_FILTER; @@ -1832,180 +2120,186 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, return 1; } -static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) +static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg, + int (*complete_rdmsr)(struct kvm_vcpu *)) { - return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, - complete_emulated_rdmsr, r); + u64 data; + int r; + + r = kvm_emulate_msr_read(vcpu, msr, &data); + + if (!r) { + trace_kvm_msr_read(msr, data); + + if (reg < 0) { + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + } else { + kvm_register_write(vcpu, reg, data); + } + } else { + /* MSR read failed? See if we should ask user space */ + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0, + complete_rdmsr, r)) + return 0; + trace_kvm_msr_read_ex(msr); + } + + return kvm_x86_call(complete_emulated_msr)(vcpu, r); } -static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { - return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, - complete_emulated_wrmsr, r); + return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1, + complete_fast_rdmsr); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr); -int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) +int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) { - u32 ecx = kvm_rcx_read(vcpu); - u64 data; - int r; + vcpu->arch.cui_rdmsr_imm_reg = reg; - r = kvm_get_msr(vcpu, ecx, &data); + return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr_imm); - /* MSR read failed? See if we should ask user space */ - if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { - /* Bounce to user space */ - return 0; - } +static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int r; + r = kvm_emulate_msr_write(vcpu, msr, data); if (!r) { - trace_kvm_msr_read(ecx, data); - - kvm_rax_write(vcpu, data & -1u); - kvm_rdx_write(vcpu, (data >> 32) & -1u); + trace_kvm_msr_write(msr, data); } else { - trace_kvm_msr_read_ex(ecx); + /* MSR write failed? See if we should ask user space */ + if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_WRMSR, data, + complete_fast_msr_access, r)) + return 0; + /* Signal all other negative errors to userspace */ + if (r < 0) + return r; + trace_kvm_msr_write_ex(msr, data); } - return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); + return kvm_x86_call(complete_emulated_msr)(vcpu, r); } -EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_rcx_read(vcpu); - u64 data = kvm_read_edx_eax(vcpu); - int r; - - r = kvm_set_msr(vcpu, ecx, data); - - /* MSR write failed? See if we should ask user space */ - if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) - /* Bounce to user space */ - return 0; - - /* Signal all other negative errors to userspace */ - if (r < 0) - return r; - - if (!r) - trace_kvm_msr_write(ecx, data); - else - trace_kvm_msr_write_ex(ecx, data); + return __kvm_emulate_wrmsr(vcpu, kvm_rcx_read(vcpu), + kvm_read_edx_eax(vcpu)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr); - return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); +int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); } -EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr_imm); int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) { return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_as_nop); int kvm_emulate_invd(struct kvm_vcpu *vcpu) { /* Treat an INVD instruction as a NOP and just skip it. */ return kvm_emulate_as_nop(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_invd); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_invd); -int kvm_emulate_mwait(struct kvm_vcpu *vcpu) +fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu) { - pr_warn_once("kvm: MWAIT instruction emulated as NOP!\n"); - return kvm_emulate_as_nop(vcpu); + if (!kvm_emulate_invd(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; + + return EXIT_FASTPATH_REENTER_GUEST; } -EXPORT_SYMBOL_GPL(kvm_emulate_mwait); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_invd); int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } -EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_invalid_op); -int kvm_emulate_monitor(struct kvm_vcpu *vcpu) + +static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) { - pr_warn_once("kvm: MONITOR instruction emulated as NOP!\n"); + bool enabled; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)) + goto emulate_as_nop; + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) + enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT); + else + enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT; + + if (!enabled) + return kvm_handle_invalid_op(vcpu); + +emulate_as_nop: + pr_warn_once("%s instruction emulated as NOP!\n", insn); return kvm_emulate_as_nop(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_monitor); - -static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) +int kvm_emulate_mwait(struct kvm_vcpu *vcpu) { - xfer_to_guest_mode_prepare(); - return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || - xfer_to_guest_mode_work_pending(); + return kvm_emulate_monitor_mwait(vcpu, "MWAIT"); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_mwait); -/* - * The fast path for frequent and performance sensitive wrmsr emulation, - * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces - * the latency of virtual IPI by avoiding the expensive bits of transitioning - * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the - * other cases which must be called after interrupts are enabled on the host. - */ -static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) +int kvm_emulate_monitor(struct kvm_vcpu *vcpu) { - if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) - return 1; - - if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && - ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && - ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && - ((u32)(data >> 32) != X2APIC_BROADCAST)) { - - data &= ~(1 << 12); - kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); - kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); - kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); - trace_kvm_apic_write(APIC_ICR, (u32)data); - return 0; - } - - return 1; + return kvm_emulate_monitor_mwait(vcpu, "MONITOR"); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_monitor); -static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) +static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { - if (!kvm_can_use_hv_timer(vcpu)) - return 1; + xfer_to_guest_mode_prepare(); - kvm_set_lapic_tscdeadline_msr(vcpu, data); - return 0; + return READ_ONCE(vcpu->mode) == EXITING_GUEST_MODE || + kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } -fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { - u32 msr = kvm_rcx_read(vcpu); - u64 data; - fastpath_t ret = EXIT_FASTPATH_NONE; - switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): - data = kvm_read_edx_eax(vcpu); - if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { - kvm_skip_emulated_instruction(vcpu); - ret = EXIT_FASTPATH_EXIT_HANDLED; - } + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || + kvm_x2apic_icr_write_fast(vcpu->arch.apic, data)) + return EXIT_FASTPATH_NONE; break; case MSR_IA32_TSC_DEADLINE: - data = kvm_read_edx_eax(vcpu); - if (!handle_fastpath_set_tscdeadline(vcpu, data)) { - kvm_skip_emulated_instruction(vcpu); - ret = EXIT_FASTPATH_REENTER_GUEST; - } + kvm_set_lapic_tscdeadline_msr(vcpu, data); break; default: - break; + return EXIT_FASTPATH_NONE; } - if (ret != EXIT_FASTPATH_NONE) - trace_kvm_msr_write(msr, data); + trace_kvm_msr_write(msr, data); - return ret; + if (!kvm_skip_emulated_instruction(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; + + return EXIT_FASTPATH_REENTER_GUEST; +} + +fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu) +{ + return __handle_fastpath_wrmsr(vcpu, kvm_rcx_read(vcpu), + kvm_read_edx_eax(vcpu)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr); + +fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg) +{ + return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg)); } -EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr_imm); /* * Adapt set_msr() to msr_io()'s calling convention @@ -2017,6 +2311,19 @@ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { + u64 val; + + /* + * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does + * not support modifying the guest vCPU model on the fly, e.g. changing + * the nVMX capabilities while L2 is running is nonsensical. Allow + * writes of the same value, e.g. to allow userspace to blindly stuff + * all MSRs when emulating RESET. + */ + if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index) && + (do_get_msr(vcpu, index, &val) || *data != val)) + return -EINVAL; + return kvm_set_msr_ignored_check(vcpu, index, *data, true); } @@ -2086,7 +2393,7 @@ static s64 get_kvmclock_base_ns(void) } #endif -void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) +static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) { int version; int r; @@ -2109,14 +2416,9 @@ void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) return; - /* - * The guest calculates current wall clock time by adding - * system time (updated by kvm_guest_time_update below) to the - * wall clock specified here. We do the reverse here. - */ - wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); + wall_nsec = kvm_get_wall_clock_epoch(kvm); - wc.nsec = do_div(wall_nsec, 1000000000); + wc.nsec = do_div(wall_nsec, NSEC_PER_SEC); wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ wc.version = version; @@ -2148,14 +2450,11 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ - vcpu->arch.pv_time_enabled = false; - if (!(system_time & 1)) - return; - - if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, system_time & ~1ULL, - sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = true; + if (system_time & 1) + kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info)); + else + kvm_gpc_deactivate(&vcpu->arch.pv_time); return; } @@ -2216,12 +2515,12 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) /* Guest TSC same frequency as host TSC? */ if (!scale) { - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return 0; } /* TSC scaling supported? */ - if (!kvm_has_tsc_control) { + if (!kvm_caps.has_tsc_control) { if (user_tsc_khz > tsc_khz) { vcpu->arch.tsc_catchup = 1; vcpu->arch.tsc_always_catchup = 1; @@ -2233,10 +2532,10 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) } /* TSC scaling required - calculate ratio */ - ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, + ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits, user_tsc_khz, tsc_khz); - if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { + if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) { pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", user_tsc_khz); return -1; @@ -2254,7 +2553,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ - kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio); + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return -1; } @@ -2273,7 +2572,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { - pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); + pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n", + user_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); @@ -2288,31 +2588,36 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } -static inline int gtod_is_based_on_tsc(int mode) +#ifdef CONFIG_X86_64 +static inline bool gtod_is_based_on_tsc(int mode) { return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; } +#endif -static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu, bool new_generation) { #ifdef CONFIG_X86_64 - bool vcpus_matched; struct kvm_arch *ka = &vcpu->kvm->arch; struct pvclock_gtod_data *gtod = &pvclock_gtod_data; - vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == - atomic_read(&vcpu->kvm->online_vcpus)); + /* + * To use the masterclock, the host clocksource must be based on TSC + * and all vCPUs must have matching TSCs. Note, the count for matching + * vCPUs doesn't include the reference vCPU, hence "+1". + */ + bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&vcpu->kvm->online_vcpus)) && + gtod_is_based_on_tsc(gtod->clock.vclock_mode); /* - * Once the masterclock is enabled, always perform request in - * order to update it. - * - * In order to enable masterclock, the host clocksource must be TSC - * and the vcpus need to have matched TSCs. When that happens, - * perform request to enable masterclock. + * Request a masterclock update if the masterclock needs to be toggled + * on/off, or when starting a new generation and the masterclock is + * enabled (compute_guest_tsc() requires the masterclock snapshot to be + * taken _after_ the new generation is created). */ - if (ka->use_master_clock || - (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched)) + if ((ka->use_master_clock && new_generation) || + (ka->use_master_clock != use_master_clock)) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, @@ -2329,29 +2634,28 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) * (frac) represent the fractional part, ie. ratio represents a fixed * point number (mult + frac * 2^(-N)). * - * N equals to kvm_tsc_scaling_ratio_frac_bits. + * N equals to kvm_caps.tsc_scaling_ratio_frac_bits. */ static inline u64 __scale_tsc(u64 ratio, u64 tsc) { - return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); + return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits); } -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio) +u64 kvm_scale_tsc(u64 tsc, u64 ratio) { u64 _tsc = tsc; - if (ratio != kvm_default_tsc_scaling_ratio) + if (ratio != kvm_caps.default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); return _tsc; } -EXPORT_SYMBOL_GPL(kvm_scale_tsc); static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); + tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); return target_tsc - tsc; } @@ -2359,37 +2663,40 @@ static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { return vcpu->arch.l1_tsc_offset + - kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio); + kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio); } -EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_l1_tsc); u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) { u64 nested_offset; - if (l2_multiplier == kvm_default_tsc_scaling_ratio) + if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio) nested_offset = l1_offset; else nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); nested_offset += l2_offset; return nested_offset; } -EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_offset); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) { - if (l2_multiplier != kvm_default_tsc_scaling_ratio) + if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio) return mul_u64_u64_shr(l1_multiplier, l2_multiplier, - kvm_tsc_scaling_ratio_frac_bits); + kvm_caps.tsc_scaling_ratio_frac_bits); return l1_multiplier; } -EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_multiplier); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) { + if (vcpu->arch.guest_tsc_protected) + return; + trace_kvm_write_tsc_offset(vcpu->vcpu_id, vcpu->arch.l1_tsc_offset, l1_offset); @@ -2404,12 +2711,12 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) if (is_guest_mode(vcpu)) vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( l1_offset, - static_call(kvm_x86_get_l2_tsc_offset)(vcpu), - static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); + kvm_x86_call(get_l2_tsc_offset)(vcpu), + kvm_x86_call(get_l2_tsc_multiplier)(vcpu)); else vcpu->arch.tsc_offset = l1_offset; - static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset); + kvm_x86_call(write_tsc_offset)(vcpu); } static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) @@ -2420,13 +2727,12 @@ static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multipli if (is_guest_mode(vcpu)) vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( l1_multiplier, - static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); + kvm_x86_call(get_l2_tsc_multiplier)(vcpu)); else vcpu->arch.tsc_scaling_ratio = l1_multiplier; - if (kvm_has_tsc_control) - static_call(kvm_x86_write_tsc_multiplier)( - vcpu, vcpu->arch.tsc_scaling_ratio); + if (kvm_caps.has_tsc_control) + kvm_x86_call(write_tsc_multiplier)(vcpu); } static inline bool kvm_check_tsc_unstable(void) @@ -2442,13 +2748,71 @@ static inline bool kvm_check_tsc_unstable(void) return check_tsc_unstable(); } -static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) +/* + * Infers attempts to synchronize the guest's tsc from host writes. Sets the + * offset for the vcpu and tracks the TSC matching generation that the vcpu + * participates in. + */ +static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, + u64 ns, bool matched, bool user_set_tsc) +{ + struct kvm *kvm = vcpu->kvm; + + lockdep_assert_held(&kvm->arch.tsc_write_lock); + + if (vcpu->arch.guest_tsc_protected) + return; + + if (user_set_tsc) + vcpu->kvm->arch.user_set_tsc = true; + + /* + * We also track th most recent recorded KHZ, write and time to + * allow the matching interval to be extended at each write. + */ + kvm->arch.last_tsc_nsec = ns; + kvm->arch.last_tsc_write = tsc; + kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; + kvm->arch.last_tsc_offset = offset; + + vcpu->arch.last_guest_tsc = tsc; + + kvm_vcpu_write_tsc_offset(vcpu, offset); + + if (!matched) { + /* + * We split periods of matched TSC writes into generations. + * For each generation, we track the original measured + * nanosecond time, offset, and write, so if TSCs are in + * sync, we can match exact offset, and if not, we can match + * exact software computation in compute_guest_tsc() + * + * These values are tracked in kvm->arch.cur_xxx variables. + */ + kvm->arch.cur_tsc_generation++; + kvm->arch.cur_tsc_nsec = ns; + kvm->arch.cur_tsc_write = tsc; + kvm->arch.cur_tsc_offset = offset; + kvm->arch.nr_vcpus_matched_tsc = 0; + } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) { + kvm->arch.nr_vcpus_matched_tsc++; + } + + /* Keep track of which generation this VCPU has synchronized to */ + vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; + vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; + vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + + kvm_track_tsc_matching(vcpu, !matched); +} + +static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) { + u64 data = user_value ? *user_value : 0; struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - bool matched; - bool already_matched; + bool matched = false; bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -2459,25 +2823,35 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) if (vcpu->arch.virtual_tsc_khz) { if (data == 0) { /* - * detection of vcpu initialization -- need to sync - * with other vCPUs. This particularly helps to keep - * kvm_clock stable after CPU hotplug + * Force synchronization when creating a vCPU, or when + * userspace explicitly writes a zero value. */ synchronizing = true; - } else { + } else if (kvm->arch.user_set_tsc) { u64 tsc_exp = kvm->arch.last_tsc_write + nsec_to_cycles(vcpu, elapsed); u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; /* - * Special case: TSC write with a small delta (1 second) - * of virtual cycle time against real time is - * interpreted as an attempt to synchronize the CPU. + * Here lies UAPI baggage: when a user-initiated TSC write has + * a small delta (1 second) of virtual cycle time against the + * previously set vCPU, we assume that they were intended to be + * in sync and the delta was only due to the racy nature of the + * legacy API. + * + * This trick falls down when restoring a guest which genuinely + * has been running for less time than the 1 second of imprecision + * which we allow for in the legacy API. In this case, the first + * value written by userspace (on any vCPU) should not be subject + * to this 'correction' to make it sync up with values that only + * come from the kernel's default vCPU creation. Make the 1-second + * slop hack only trigger if the user_set_tsc flag is already set. */ synchronizing = data < tsc_exp + tsc_hz && data + tsc_hz > tsc_exp; } } + /* * For a reliable TSC, we can match TSC offsets, and for an unstable * TSC, we add elapsed time in this computation. We could let the @@ -2494,51 +2868,10 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_compute_l1_tsc_offset(vcpu, data); } matched = true; - already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); - } else { - /* - * We split periods of matched TSC writes into generations. - * For each generation, we track the original measured - * nanosecond time, offset, and write, so if TSCs are in - * sync, we can match exact offset, and if not, we can match - * exact software computation in compute_guest_tsc() - * - * These values are tracked in kvm->arch.cur_xxx variables. - */ - kvm->arch.cur_tsc_generation++; - kvm->arch.cur_tsc_nsec = ns; - kvm->arch.cur_tsc_write = data; - kvm->arch.cur_tsc_offset = offset; - matched = false; } - /* - * We also track th most recent recorded KHZ, write and time to - * allow the matching interval to be extended at each write. - */ - kvm->arch.last_tsc_nsec = ns; - kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - - vcpu->arch.last_guest_tsc = data; - - /* Keep track of which generation this VCPU has synchronized to */ - vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; - vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; - vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - - kvm_vcpu_write_tsc_offset(vcpu, offset); + __kvm_synchronize_tsc(vcpu, offset, data, ns, matched, !!user_value); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - - spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); - if (!matched) { - kvm->arch.nr_vcpus_matched_tsc = 0; - } else if (!already_matched) { - kvm->arch.nr_vcpus_matched_tsc++; - } - - kvm_track_tsc_matching(vcpu); - spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); } static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, @@ -2550,9 +2883,9 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio) WARN_ON(adjustment < 0); - adjustment = kvm_scale_tsc(vcpu, (u64) adjustment, + adjustment = kvm_scale_tsc((u64) adjustment, vcpu->arch.l1_tsc_scaling_ratio); adjust_tsc_offset_guest(vcpu, adjustment); } @@ -2582,14 +2915,13 @@ static u64 read_tsc(void) static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, int *mode) { - long v; u64 tsc_pg_val; + long v; switch (clock->vclock_mode) { case VDSO_CLOCKMODE_HVCLOCK: - tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), - tsc_timestamp); - if (tsc_pg_val != U64_MAX) { + if (hv_read_tsc_page_tsc(hv_get_tsc_page(), + tsc_timestamp, &tsc_pg_val)) { /* TSC page valid */ *mode = VDSO_CLOCKMODE_HVCLOCK; v = (tsc_pg_val - clock->cycle_last) & @@ -2615,7 +2947,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, return v * clock->mult; } -static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) +/* + * As with get_kvmclock_base_ns(), this counts from boot time, at the + * frequency of CLOCK_MONOTONIC_RAW (hence adding gtos->offs_boot). + */ +static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -2634,6 +2970,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) return mode; } +/* + * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with + * no boot time offset. + */ +static int do_monotonic(s64 *t, u64 *tsc_timestamp) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(>od->seq); + ns = gtod->clock.base_cycles; + ns += vgettsc(>od->clock, tsc_timestamp, &mode); + ns >>= gtod->clock.shift; + ns += ktime_to_ns(gtod->clock.offset); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + *t = ns; + + return mode; +} + static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; @@ -2655,18 +3014,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) return mode; } -/* returns true if host is using TSC based clocksource */ +/* + * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and + * reports the TSC value from which it do so. Returns true if host is + * using TSC based clocksource. + */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) { /* checked again under seqlock below */ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns, - tsc_timestamp)); + return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns, + tsc_timestamp)); +} + +/* + * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did + * so. Returns true if host is using TSC based clocksource. + */ +bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) +{ + /* checked again under seqlock below */ + if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) + return false; + + return gtod_is_based_on_tsc(do_monotonic(kernel_ns, + tsc_timestamp)); } -/* returns true if host is using TSC based clocksource */ +/* + * Calculates CLOCK_REALTIME and reports the TSC value from which it did + * so. Returns true if host is using TSC based clocksource. + * + * DO NOT USE this for anything related to migration. You want CLOCK_TAI + * for that. + */ static bool kvm_get_walltime_and_clockread(struct timespec64 *ts, u64 *tsc_timestamp) { @@ -2726,6 +3109,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) int vclock_mode; bool host_tsc_clocksource, vcpus_matched; + lockdep_assert_held(&kvm->arch.tsc_write_lock); vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == atomic_read(&kvm->online_vcpus)); @@ -2750,137 +3134,177 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } -void kvm_make_mclock_inprogress_request(struct kvm *kvm) +static void kvm_make_mclock_inprogress_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } -static void kvm_gen_update_masterclock(struct kvm *kvm) +static void __kvm_start_pvclock_update(struct kvm *kvm) { -#ifdef CONFIG_X86_64 - int i; - struct kvm_vcpu *vcpu; - struct kvm_arch *ka = &kvm->arch; - unsigned long flags; - - kvm_hv_invalidate_tsc_page(kvm); + raw_spin_lock_irq(&kvm->arch.tsc_write_lock); + write_seqcount_begin(&kvm->arch.pvclock_sc); +} +static void kvm_start_pvclock_update(struct kvm *kvm) +{ kvm_make_mclock_inprogress_request(kvm); /* no guest entries from this point */ - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - pvclock_update_vm_gtod_copy(kvm); - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + __kvm_start_pvclock_update(kvm); +} + +static void kvm_end_pvclock_update(struct kvm *kvm) +{ + struct kvm_arch *ka = &kvm->arch; + struct kvm_vcpu *vcpu; + unsigned long i; + write_seqcount_end(&ka->pvclock_sc); + raw_spin_unlock_irq(&ka->tsc_write_lock); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); -#endif } -u64 get_kvmclock_ns(struct kvm *kvm) +static void kvm_update_masterclock(struct kvm *kvm) { - struct kvm_arch *ka = &kvm->arch; - struct pvclock_vcpu_time_info hv_clock; - unsigned long flags; - u64 ret; + kvm_hv_request_tsc_page_update(kvm); + kvm_start_pvclock_update(kvm); + pvclock_update_vm_gtod_copy(kvm); + kvm_end_pvclock_update(kvm); +} - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - if (!ka->use_master_clock) { - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); - return get_kvmclock_base_ns() + ka->kvmclock_offset; - } +/* + * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's + * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz + * can change during boot even if the TSC is constant, as it's possible for KVM + * to be loaded before TSC calibration completes. Ideally, KVM would get a + * notification when calibration completes, but practically speaking calibration + * will complete before userspace is alive enough to create VMs. + */ +static unsigned long get_cpu_tsc_khz(void) +{ + if (static_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return tsc_khz; + else + return __this_cpu_read(cpu_tsc_khz); +} - hv_clock.tsc_timestamp = ka->master_cycle_now; - hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); +/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */ +static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) +{ + struct kvm_arch *ka = &kvm->arch; + struct pvclock_vcpu_time_info hv_clock; /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); - if (__this_cpu_read(cpu_tsc_khz)) { - kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, + data->flags = 0; + if (ka->use_master_clock && + (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) { +#ifdef CONFIG_X86_64 + struct timespec64 ts; + + if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) { + data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; + data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC; + } else +#endif + data->host_tsc = rdtsc(); + + data->flags |= KVM_CLOCK_TSC_STABLE; + hv_clock.tsc_timestamp = ka->master_cycle_now; + hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; + kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); - ret = __pvclock_read_cycles(&hv_clock, rdtsc()); - } else - ret = get_kvmclock_base_ns() + ka->kvmclock_offset; + data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); + } else { + data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; + } put_cpu(); +} - return ret; +static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) +{ + struct kvm_arch *ka = &kvm->arch; + unsigned seq; + + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + __get_kvmclock(kvm, data); + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); } -static void kvm_setup_pvclock_page(struct kvm_vcpu *v, - struct gfn_to_hva_cache *cache, - unsigned int offset) +u64 get_kvmclock_ns(struct kvm *kvm) { - struct kvm_vcpu_arch *vcpu = &v->arch; - struct pvclock_vcpu_time_info guest_hv_clock; + struct kvm_clock_data data; - if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache, - &guest_hv_clock, offset, sizeof(guest_hv_clock)))) - return; + get_kvmclock(kvm, &data); + return data.clock; +} + +static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock, + struct kvm_vcpu *vcpu, + struct gfn_to_pfn_cache *gpc, + unsigned int offset) +{ + struct pvclock_vcpu_time_info *guest_hv_clock; + struct pvclock_vcpu_time_info hv_clock; + unsigned long flags; + + memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock)); + + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) { + read_unlock_irqrestore(&gpc->lock, flags); - /* This VCPU is paused, but it's legal for a guest to read another + if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock))) + return; + + read_lock_irqsave(&gpc->lock, flags); + } + + guest_hv_clock = (void *)(gpc->khva + offset); + + /* + * This VCPU is paused, but it's legal for a guest to read another * VCPU's kvmclock, so we really have to follow the specification where * it says that version is odd if data is being modified, and even after * it is consistent. - * - * Version field updates must be kept separate. This is because - * kvm_write_guest_cached might use a "rep movs" instruction, and - * writes within a string instruction are weakly ordered. So there - * are three writes overall. - * - * As a small optimization, only write the version field in the first - * and third write. The vcpu->pv_time cache is still valid, because the - * version field is the first in the struct. */ - BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); - - if (guest_hv_clock.version & 1) - ++guest_hv_clock.version; /* first time write, random junk */ - - vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_offset_cached(v->kvm, cache, - &vcpu->hv_clock, offset, - sizeof(vcpu->hv_clock.version)); + guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1; smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); + hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); - if (vcpu->pvclock_set_guest_stopped_request) { - vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } + memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock)); - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + smp_wmb(); - kvm_write_guest_offset_cached(v->kvm, cache, - &vcpu->hv_clock, offset, - sizeof(vcpu->hv_clock)); + guest_hv_clock->version = ++hv_clock.version; - smp_wmb(); + kvm_gpc_mark_dirty_in_slot(gpc); + read_unlock_irqrestore(&gpc->lock, flags); - vcpu->hv_clock.version++; - kvm_write_guest_offset_cached(v->kvm, cache, - &vcpu->hv_clock, offset, - sizeof(vcpu->hv_clock.version)); + trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock); } -static int kvm_guest_time_update(struct kvm_vcpu *v) +int kvm_guest_time_update(struct kvm_vcpu *v) { + struct pvclock_vcpu_time_info hv_clock = {}; unsigned long flags, tgt_tsc_khz; + unsigned seq; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; u64 tsc_timestamp, host_tsc; - u8 pvclock_flags; bool use_master_clock; kernel_ns = 0; @@ -2890,17 +3314,18 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); - use_master_clock = ka->use_master_clock; - if (use_master_clock) { - host_tsc = ka->master_cycle_now; - kernel_ns = ka->master_kernel_ns; - } - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + use_master_clock = ka->use_master_clock; + if (use_master_clock) { + host_tsc = ka->master_cycle_now; + kernel_ns = ka->master_kernel_ns; + } + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); + tgt_tsc_khz = get_cpu_tsc_khz(); if (unlikely(tgt_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -2935,64 +3360,158 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ - if (kvm_has_tsc_control) - tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz, + if (kvm_caps.has_tsc_control) { + tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz, v->arch.l1_tsc_scaling_ratio); + tgt_tsc_khz = tgt_tsc_khz ? : 1; + } if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, - &vcpu->hv_clock.tsc_shift, - &vcpu->hv_clock.tsc_to_system_mul); + &vcpu->pvclock_tsc_shift, + &vcpu->pvclock_tsc_mul); vcpu->hw_tsc_khz = tgt_tsc_khz; } - vcpu->hv_clock.tsc_timestamp = tsc_timestamp; - vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + hv_clock.tsc_shift = vcpu->pvclock_tsc_shift; + hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul; + hv_clock.tsc_timestamp = tsc_timestamp; + hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; /* If the host uses TSC clocksource, then it is stable */ - pvclock_flags = 0; + hv_clock.flags = 0; if (use_master_clock) - pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; - - vcpu->hv_clock.flags = pvclock_flags; - - if (vcpu->pv_time_enabled) - kvm_setup_pvclock_page(v, &vcpu->pv_time, 0); - if (vcpu->xen.vcpu_info_set) - kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache, - offsetof(struct compat_vcpu_info, time)); - if (vcpu->xen.vcpu_time_info_set) - kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0); - if (v == kvm_get_vcpu(v->kvm, 0)) - kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); + hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT; + + if (vcpu->pv_time.active) { + /* + * GUEST_STOPPED is only supported by kvmclock, and KVM's + * historic behavior is to only process the request if kvmclock + * is active/enabled. + */ + if (vcpu->pvclock_set_guest_stopped_request) { + hv_clock.flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0); + + hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED; + } + + kvm_hv_setup_tsc_page(v->kvm, &hv_clock); + +#ifdef CONFIG_KVM_XEN + /* + * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless + * explicitly told to use TSC as its clocksource Xen will not set this bit. + * This default behaviour led to bugs in some guest kernels which cause + * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags. + * + * Note! Clear TSC_STABLE only for Xen clocks, i.e. the order matters! + */ + if (ka->xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE) + hv_clock.flags &= ~PVCLOCK_TSC_STABLE_BIT; + + if (vcpu->xen.vcpu_info_cache.active) + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); + if (vcpu->xen.vcpu_time_info_cache.active) + kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0); +#endif return 0; } /* + * The pvclock_wall_clock ABI tells the guest the wall clock time at + * which it started (i.e. its epoch, when its kvmclock was zero). + * + * In fact those clocks are subtly different; wall clock frequency is + * adjusted by NTP and has leap seconds, while the kvmclock is a + * simple function of the TSC without any such adjustment. + * + * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between + * that and kvmclock, but even that would be subject to change over + * time. + * + * Attempt to calculate the epoch at a given moment using the *same* + * TSC reading via kvm_get_walltime_and_clockread() to obtain both + * wallclock and kvmclock times, and subtracting one from the other. + * + * Fall back to using their values at slightly different moments by + * calling ktime_get_real_ns() and get_kvmclock_ns() separately. + */ +uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + struct pvclock_vcpu_time_info hv_clock; + struct kvm_arch *ka = &kvm->arch; + unsigned long seq, local_tsc_khz; + struct timespec64 ts; + uint64_t host_tsc; + + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + + local_tsc_khz = 0; + if (!ka->use_master_clock) + break; + + /* + * The TSC read and the call to get_cpu_tsc_khz() must happen + * on the same CPU. + */ + get_cpu(); + + local_tsc_khz = get_cpu_tsc_khz(); + + if (local_tsc_khz && + !kvm_get_walltime_and_clockread(&ts, &host_tsc)) + local_tsc_khz = 0; /* Fall back to old method */ + + put_cpu(); + + /* + * These values must be snapshotted within the seqcount loop. + * After that, it's just mathematics which can happen on any + * CPU at any time. + */ + hv_clock.tsc_timestamp = ka->master_cycle_now; + hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; + + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); + + /* + * If the conditions were right, and obtaining the wallclock+TSC was + * successful, calculate the KVM clock at the corresponding time and + * subtract one from the other to get the guest's epoch in nanoseconds + * since 1970-01-01. + */ + if (local_tsc_khz) { + kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC, + &hv_clock.tsc_shift, + &hv_clock.tsc_to_system_mul); + return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec - + __pvclock_read_cycles(&hv_clock, host_tsc); + } +#endif + return ktime_get_real_ns() - get_kvmclock_ns(kvm); +} + +/* * kvmclock updates which are isolated to a given vcpu, such as * vcpu->cpu migration, should not allow system_timestamp from - * the rest of the vcpus to remain static. Otherwise ntp frequency - * correction applies to one vcpu's system_timestamp but not - * the others. + * the rest of the vcpus to remain static. * * So in those cases, request a kvmclock update for all vcpus. - * We need to rate-limit these requests though, as they can - * considerably slow guests that have a large number of vcpus. - * The time for a remote vcpu to update its kvmclock is bound - * by the delay we use to rate-limit the updates. + * The worst case for a remote vcpu to update its kvmclock + * is then bounded by maximum nohz sleep latency. */ - -#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) - -static void kvmclock_update_fn(struct work_struct *work) +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) { - int i; - struct delayed_work *dwork = to_delayed_work(work); - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, - kvmclock_update_work); - struct kvm *kvm = container_of(ka, struct kvm, arch); + unsigned long i; struct kvm_vcpu *vcpu; + struct kvm *kvm = v->kvm; kvm_for_each_vcpu(i, vcpu, kvm) { kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -3000,30 +3519,14 @@ static void kvmclock_update_fn(struct work_struct *work) } } -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) +/* These helpers are safe iff @msr is known to be an MCx bank MSR. */ +static bool is_mci_control_msr(u32 msr) { - struct kvm *kvm = v->kvm; - - kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); - schedule_delayed_work(&kvm->arch.kvmclock_update_work, - KVMCLOCK_UPDATE_DELAY); + return (msr & 3) == 0; } - -#define KVMCLOCK_SYNC_PERIOD (300 * HZ) - -static void kvmclock_sync_fn(struct work_struct *work) +static bool is_mci_status_msr(u32 msr) { - struct delayed_work *dwork = to_delayed_work(work); - struct kvm_arch *ka = container_of(dwork, struct kvm_arch, - kvmclock_sync_work); - struct kvm *kvm = container_of(ka, struct kvm, arch); - - if (!kvmclock_periodic_sync) - return; - - schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); + return (msr & 3) == 1; } /* @@ -3032,7 +3535,7 @@ static void kvmclock_sync_fn(struct work_struct *work) static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd_or_hygon(vcpu)) + if (guest_cpuid_is_amd_compatible(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; @@ -3044,6 +3547,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; + u32 offset, last_msr; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -3057,44 +3561,58 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.mcg_ctl = data; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); - - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10)) != ~(u64)0) - return -1; - - /* MCi_STATUS */ - if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) { - if (!can_set_mci_status(vcpu)) - return -1; - } + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - vcpu->arch.mce_banks[offset] = data; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated)) + return 1; + /* An attempt to write a 1 to a reserved bit raises #GP */ + if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + vcpu->arch.mci_ctl2_banks[offset] = data; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + /* + * Only 0 or all 1s can be written to IA32_MCi_CTL, all other + * values are architecturally undefined. But, some Linux + * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB + * issue on AMD K8s, allow bit 10 to be clear when setting all + * other bits in order to avoid an uncaught #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, + * single-bit ECC data errors. + */ + if (is_mci_control_msr(msr) && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; + + /* + * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. + * AMD-based CPUs allow non-zero values, but if and only if + * HWCR[McStatusWrEn] is set. + */ + if (!msr_info->host_initiated && is_mci_status_msr(msr) && + data != 0 && !can_set_mci_status(vcpu)) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + vcpu->arch.mce_banks[offset] = data; + break; + default: return 1; } return 0; } -static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) -{ - u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; - - return (vcpu->arch.apf.msr_en_val & mask) == mask; -} - static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; @@ -3126,7 +3644,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) sizeof(u64))) return 1; - vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); + vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; kvm_async_pf_wakeup_all(vcpu); @@ -3152,14 +3670,17 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { - vcpu->arch.pv_time_enabled = false; + kvm_gpc_deactivate(&vcpu->arch.pv_time); vcpu->arch.time = 0; } static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - static_call(kvm_x86_tlb_flush_all)(vcpu); + kvm_x86_call(flush_tlb_all)(vcpu); + + /* Flushing all ASIDs flushes the current ASID... */ + kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) @@ -3167,24 +3688,56 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.tlb_flush; if (!tdp_enabled) { - /* + /* * A TLB flush on behalf of the guest is equivalent to * INVPCID(all), toggling CR4.PGE, etc., which requires - * a forced sync of the shadow page tables. Unload the - * entire MMU here and the subsequent load will sync the - * shadow page tables, and also flush the TLB. + * a forced sync of the shadow page tables. Ensure all the + * roots are synced and the guest TLB in hardware is clean. */ - kvm_mmu_unload(vcpu); - return; + kvm_mmu_sync_roots(vcpu); + kvm_mmu_sync_prev_roots(vcpu); } - static_call(kvm_x86_tlb_flush_guest)(vcpu); + kvm_x86_call(flush_tlb_guest)(vcpu); + + /* + * Flushing all "guest" TLB is always a superset of Hyper-V's fine + * grained flushing. + */ + kvm_hv_vcpu_purge_flush_tlb(vcpu); +} + + +static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_call(flush_tlb_current)(vcpu); +} + +/* + * Service "local" TLB flush requests, which are specific to the current MMU + * context. In addition to the generic event handling in vcpu_enter_guest(), + * TLB flushes that are targeted at an MMU context also need to be serviced + * prior before nested VM-Enter/VM-Exit. + */ +void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + + if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) + kvm_vcpu_flush_tlb_guest(vcpu); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_service_local_tlb_flush_requests); static void record_steal_time(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; + u64 steal; + u32 version; if (kvm_xen_msr_enabled(vcpu->kvm)) { kvm_xen_runstate_set_running(vcpu); @@ -3194,56 +3747,159 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - /* -EAGAIN is returned in atomic context so we can just return. */ - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, - &map, &vcpu->arch.st.cache, false)) + if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + slots = kvm_memslots(vcpu->kvm); + + if (unlikely(slots->generation != ghc->generation || + gpa != ghc->gpa || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { + /* We rely on the fact that it fits in a single page. */ + BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) || + kvm_is_error_hva(ghc->hva) || !ghc->memslot) + return; + } + st = (struct kvm_steal_time __user *)ghc->hva; /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { - u8 st_preempted = xchg(&st->preempted, 0); + u8 st_preempted = 0; + int err = -EFAULT; + + if (!user_access_begin(st, sizeof(*st))) + return; + + asm volatile("1: xchgb %0, %2\n" + "xor %1, %1\n" + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + : "+q" (st_preempted), + "+&r" (err), + "+m" (st->preempted)); + if (err) + goto out; + + user_access_end(); + + vcpu->arch.st.preempted = 0; trace_kvm_pv_tlb_flush(vcpu->vcpu_id, st_preempted & KVM_VCPU_FLUSH_TLB); if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); + + if (!user_access_begin(st, sizeof(*st))) + goto dirty; } else { - st->preempted = 0; - } + if (!user_access_begin(st, sizeof(*st))) + return; - vcpu->arch.st.preempted = 0; + unsafe_put_user(0, &st->preempted, out); + vcpu->arch.st.preempted = 0; + } - if (st->version & 1) - st->version += 1; /* first time write, random junk */ + unsafe_get_user(version, &st->version, out); + if (version & 1) + version += 1; /* first time write, random junk */ - st->version += 1; + version += 1; + unsafe_put_user(version, &st->version, out); smp_wmb(); - st->steal += current->sched_info.run_delay - + unsafe_get_user(steal, &st->steal, out); + steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; + unsafe_put_user(steal, &st->steal, out); - smp_wmb(); + version += 1; + unsafe_put_user(version, &st->version, out); - st->version += 1; + out: + user_access_end(); + dirty: + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); +} - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); +/* + * Returns true if the MSR in question is managed via XSTATE, i.e. is context + * switched with the rest of guest FPU state. + * + * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS. + */ +static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + if (!vcpu) + return false; + + switch (msr) { + case MSR_IA32_U_CET: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) || + guest_cpu_cap_has(vcpu, X86_FEATURE_IBT); + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + default: + return false; + } +} + +/* + * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an + * MSR that is managed via XSTATE. Note, the caller is responsible for doing + * the initial FPU load, this helper only ensures that guest state is resident + * in hardware (the kernel can load its FPU state in IRQ context). + * + * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the + * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only + * consumed when transitioning to lower privilege levels, i.e. are effectively + * only consumed by userspace as well. + */ +static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, + struct msr_data *msr_info, + int access) +{ + BUILD_BUG_ON(access != MSR_TYPE_R && access != MSR_TYPE_W); + + KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm); + KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm); + + kvm_fpu_get(); + if (access == MSR_TYPE_R) + rdmsrq(msr_info->index, msr_info->data); + else + wrmsrq(msr_info->index, msr_info->data); + kvm_fpu_put(); +} + +static void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W); +} + +static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R); } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - bool pr = false; u32 msr = msr_info->index; u64 data = msr_info->data; - if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) + /* + * Do not allow host-initiated writes to trigger the Xen hypercall + * page setup; it could incur locking paths which are not expected + * if userspace sets the MSR in an unusual location. + */ + if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) && + !msr_info->host_initiated) return kvm_xen_write_hypercall_page(vcpu, data); switch (msr) { @@ -3253,6 +3909,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_PATCH_LOADER: case MSR_AMD64_BU_CFG2: case MSR_AMD64_DC_CFG: + case MSR_AMD64_TW_CFG: case MSR_F15H_EX_CFG: break; @@ -3261,24 +3918,72 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.microcode_version = data; break; case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated) - return 1; + if (!msr_info->host_initiated || + !guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + return KVM_MSR_RET_UNSUPPORTED; vcpu->arch.arch_capabilities = data; break; - case MSR_IA32_PERF_CAPABILITIES: { - struct kvm_msr_entry msr_ent = {.index = msr, .data = 0}; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated || + !guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) + return KVM_MSR_RET_UNSUPPORTED; - if (!msr_info->host_initiated) - return 1; - if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent)) - return 1; - if (data & ~msr_ent.data) + if (data & ~kvm_caps.supported_perf_cap) return 1; + /* + * Note, this is not just a performance optimization! KVM + * disallows changing feature MSRs after the vCPU has run; PMU + * refresh will bug the VM if called after the vCPU has run. + */ + if (vcpu->arch.perf_capabilities == data) + break; + vcpu->arch.perf_capabilities = data; + kvm_pmu_refresh(vcpu); + break; + case MSR_IA32_PRED_CMD: { + u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); - return 0; + if (!msr_info->host_initiated) { + if ((!guest_has_pred_cmd_msr(vcpu))) + return 1; + + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB)) + reserved_bits |= PRED_CMD_IBPB; + + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB)) + reserved_bits |= PRED_CMD_SBPB; } + + if (!boot_cpu_has(X86_FEATURE_IBPB)) + reserved_bits |= PRED_CMD_IBPB; + + if (!boot_cpu_has(X86_FEATURE_SBPB)) + reserved_bits |= PRED_CMD_SBPB; + + if (data & reserved_bits) + return 1; + + if (!data) + break; + + wrmsrq(MSR_IA32_PRED_CMD, data); + break; + } + case MSR_IA32_FLUSH_CMD: + if (!msr_info->host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)) + return 1; + + if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH)) + return 1; + if (!data) + break; + + wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + break; case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: @@ -3286,53 +3991,78 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ - /* Handle McStatusWrEn */ - if (data == BIT_ULL(18)) { - vcpu->arch.msr_hwcr = data; - } else if (data != 0) { - vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", - data); + /* + * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2 + * through at least v6.6 whine if TscFreqSel is clear, + * depending on F/M/S. + */ + if (data & ~(BIT_ULL(18) | BIT_ULL(24))) { + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } + vcpu->arch.msr_hwcr = data; break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { - vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%llx\n", data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); return 1; } break; - case 0x200 ... 0x2ff: + case MSR_IA32_CR_PAT: + if (!kvm_pat_valid(data)) + return 1; + + vcpu->arch.pat = data; + break; + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: - return kvm_set_apic_base(vcpu, msr_info); + return kvm_apic_set_base(vcpu, data, msr_info->host_initiated); case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); case MSR_IA32_TSC_DEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: - if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) { + if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSC_ADJUST)) { if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); + /* Before back to guest, tsc_timestamp must be adjusted + * as well, otherwise guest's percpu pvclock time could jump. + */ + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } vcpu->arch.ia32_tsc_adjust_msr = data; } break; - case MSR_IA32_MISC_ENABLE: + case MSR_IA32_MISC_ENABLE: { + u64 old_val = vcpu->arch.ia32_misc_enable_msr; + + if (!msr_info->host_initiated) { + /* RO bits */ + if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK) + return 1; + + /* R bits, i.e. writes are ignored, but don't fault. */ + data = data & ~MSR_IA32_MISC_ENABLE_EMON; + data |= old_val & MSR_IA32_MISC_ENABLE_EMON; + } + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && - ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; - kvm_update_cpuid_runtime(vcpu); + vcpu->arch.cpuid_dynamic_bits_dirty = true; } else { vcpu->arch.ia32_misc_enable_msr = data; } break; + } case MSR_IA32_SMBASE: - if (!msr_info->host_initiated) + if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) return 1; vcpu->arch.smbase = data; break; @@ -3341,25 +4071,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_TSC: if (msr_info->host_initiated) { - kvm_synchronize_tsc(vcpu, data); - } else { + kvm_synchronize_tsc(vcpu, &data); + } else if (!vcpu->arch.guest_tsc_protected) { u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; adjust_tsc_offset_guest(vcpu, adj); vcpu->arch.ia32_tsc_adjust_msr += adj; } break; case MSR_IA32_XSS: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) - return 1; - /* - * KVM supports exposing PT to the guest, but does not support - * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than - * XSAVES/XRSTORS to save/restore PT MSRs. - */ - if (data & ~supported_xss) + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return KVM_MSR_RET_UNSUPPORTED; + + if (data & ~vcpu->arch.guest_supported_xss) return 1; + if (vcpu->arch.ia32_xss == data) + break; vcpu->arch.ia32_xss = data; + vcpu->arch.cpuid_dynamic_bits_dirty = true; break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) @@ -3407,10 +4135,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; break; case MSR_KVM_ASYNC_PF_ACK: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; if (data & 0x1) { - vcpu->arch.apf.pageready_pending = false; + /* + * Pairs with the smp_mb__after_atomic() in + * kvm_arch_async_page_present_queued(). + */ + smp_store_mb(vcpu->arch.apf.pageready_pending, false); + kvm_check_async_pf_completion(vcpu); } break; @@ -3436,7 +4169,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) return 1; - if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) + if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; @@ -3454,20 +4187,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: - pr = true; - fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - if (pr || data != 0) - vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + if (data) + kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_K7_CLK_CTL: /* @@ -3479,6 +4210,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * the need to ignore the workaround. */ break; +#ifdef CONFIG_KVM_HYPERV case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: @@ -3488,30 +4220,28 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: return kvm_hv_set_msr_common(vcpu, msr, data, msr_info->host_initiated); +#endif case MSR_IA32_BBL_CR_CTL3: /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", - msr, data); + kvm_pr_unimpl_wrmsr(vcpu, msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.length = data; break; case MSR_AMD64_OSVW_STATUS: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; vcpu->arch.osvw.status = data; break; case MSR_PLATFORM_INFO: - if (!msr_info->host_initiated || - (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && - cpuid_fault_enabled(vcpu))) + if (!msr_info->host_initiated) return 1; vcpu->arch.msr_platform_info = data; break; @@ -3522,20 +4252,48 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.msr_misc_features_enables = data; break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~kvm_guest_supported_xfd(vcpu)) + return 1; + + fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data); + break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~kvm_guest_supported_xfd(vcpu)) + return 1; + + vcpu->arch.guest_fpu.xfd_err = data; + break; +#endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + kvm_set_xstate_msr(vcpu, msr_info); + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - return KVM_MSR_RET_INVALID; + + return KVM_MSR_RET_UNSUPPORTED; } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_msr_common); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_msr_common); static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 offset, last_msr; switch (msr) { case MSR_IA32_P5_MC_ADDR: @@ -3553,16 +4311,27 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - data = vcpu->arch.mce_banks[offset]; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && !host) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + data = vcpu->arch.mci_ctl2_banks[offset]; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; + break; + default: return 1; } *pdata = data; @@ -3588,6 +4357,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_AMD64_BU_CFG2: case MSR_IA32_PERF_CTL: case MSR_AMD64_DC_CFG: + case MSR_AMD64_TW_CFG: case MSR_F15H_EX_CFG: /* * Intel Sandy Bridge CPUs must support the RAPL (running average power @@ -3602,13 +4372,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; - case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info); - if (!msr_info->host_initiated) - return 1; - msr_info->data = 0; - break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: @@ -3621,15 +4384,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.microcode_version; break; case MSR_IA32_ARCH_CAPABILITIES: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) - return 1; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.arch_capabilities; break; case MSR_IA32_PERF_CAPABILITIES: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) - return 1; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) + return KVM_MSR_RET_UNSUPPORTED; msr_info->data = vcpu->arch.perf_capabilities; break; case MSR_IA32_POWER_CTL: @@ -3655,11 +4416,15 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) ratio = vcpu->arch.tsc_scaling_ratio; } - msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset; + msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset; break; } + case MSR_IA32_CR_PAT: + msr_info->data = vcpu->arch.pat; + break; case MSR_MTRRcap: - case 0x200 ... 0x2ff: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; @@ -3679,7 +4444,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 1 << 24; break; case MSR_IA32_APICBASE: - msr_info->data = kvm_get_apic_base(vcpu); + msr_info->data = vcpu->arch.apic_base; break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); @@ -3693,7 +4458,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.ia32_misc_enable_msr; break; case MSR_IA32_SMBASE: - if (!msr_info->host_initiated) + if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) return 1; msr_info->data = vcpu->arch.smbase; break; @@ -3746,7 +4511,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.apf.msr_int_val; break; case MSR_KVM_ASYNC_PF_ACK: - if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) return 1; msr_info->data = 0; @@ -3775,6 +4540,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); case MSR_IA32_XSS: @@ -3795,6 +4561,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ msr_info->data = 0x20000000; break; +#ifdef CONFIG_KVM_HYPERV case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: case HV_X64_MSR_SYNDBG_OPTIONS: @@ -3804,9 +4571,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_TSC_INVARIANT_CONTROL: return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); +#endif case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current * silicon. It is however accessed by winxp in very narrow @@ -3821,12 +4590,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0xbe702111; break; case MSR_AMD64_OSVW_ID_LENGTH: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; msr_info->data = vcpu->arch.osvw.length; break; case MSR_AMD64_OSVW_STATUS: - if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW)) return 1; msr_info->data = vcpu->arch.osvw.status; break; @@ -3842,14 +4611,35 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_HWCR: msr_info->data = vcpu->arch.msr_hwcr; break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; + break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.xfd_err; + break; +#endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + kvm_get_xstate_msr(vcpu, msr_info); + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); - return KVM_MSR_RET_INVALID; + + return KVM_MSR_RET_UNSUPPORTED; } return 0; } -EXPORT_SYMBOL_GPL(kvm_get_msr_common); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_msr_common); /* * Read or write a bunch of msrs. All parameters are kernel addresses. @@ -3861,11 +4651,25 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { + bool fpu_loaded = false; int i; - for (i = 0; i < msrs->nmsrs; ++i) + for (i = 0; i < msrs->nmsrs; ++i) { + /* + * If userspace is accessing one or more XSTATE-managed MSRs, + * temporarily load the guest's FPU state so that the guest's + * MSR value(s) is resident in hardware and thus can be accessed + * via RDMSR/WRMSR. + */ + if (!fpu_loaded && is_xstate_managed_msr(vcpu, entries[i].index)) { + kvm_load_guest_fpu(vcpu); + fpu_loaded = true; + } if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; + } + if (fpu_loaded) + kvm_put_guest_fpu(vcpu); return i; } @@ -3882,8 +4686,8 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, { struct kvm_msrs msrs; struct kvm_msr_entry *entries; - int r, n; unsigned size; + int r; r = -EFAULT; if (copy_from_user(&msrs, user_msrs, sizeof(msrs))) @@ -3900,17 +4704,11 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, goto out; } - r = n = __msr_io(vcpu, &msrs, entries, do_msr); - if (r < 0) - goto out_free; + r = __msr_io(vcpu, &msrs, entries, do_msr); - r = -EFAULT; if (writeback && copy_to_user(user_msrs->entries, entries, size)) - goto out_free; - - r = n; + r = -EFAULT; -out_free: kfree(entries); out: return r; @@ -3923,6 +4721,24 @@ static inline bool kvm_can_mwait_in_guest(void) boot_cpu_has(X86_FEATURE_ARAT); } +static u64 kvm_get_allowed_disable_exits(void) +{ + u64 r = KVM_X86_DISABLE_EXITS_PAUSE; + + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + r |= KVM_X86_DISABLE_EXITS_APERFMPERF; + + if (!mitigate_smt_rsb) { + r |= KVM_X86_DISABLE_EXITS_HLT | + KVM_X86_DISABLE_EXITS_CSTATE; + + if (kvm_can_mwait_in_guest()) + r |= KVM_X86_DISABLE_EXITS_MWAIT; + } + return r; +} + +#ifdef CONFIG_KVM_HYPERV static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 __user *cpuid_arg) { @@ -3943,6 +4759,17 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, return 0; } +#endif + +static bool kvm_is_vm_type_supported(unsigned long type) +{ + return type < 32 && (kvm_caps.supported_vm_types & BIT(type)); +} + +static inline u64 kvm_sync_valid_fields(struct kvm *kvm) +{ + return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS; +} int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { @@ -3956,22 +4783,27 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_EXT_CPUID: case KVM_CAP_EXT_EMUL_CPUID: case KVM_CAP_CLOCKSOURCE: +#ifdef CONFIG_KVM_IOAPIC case KVM_CAP_PIT: + case KVM_CAP_PIT2: + case KVM_CAP_PIT_STATE2: + case KVM_CAP_REINJECT_CONTROL: +#endif case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: case KVM_CAP_USER_NMI: - case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_IOEVENTFD: case KVM_CAP_IOEVENTFD_NO_LENGTH: - case KVM_CAP_PIT2: - case KVM_CAP_PIT_STATE2: + case KVM_CAP_SET_IDENTITY_MAP_ADDR: case KVM_CAP_VCPU_EVENTS: +#ifdef CONFIG_KVM_HYPERV case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: + case KVM_CAP_HYPERV_TIME: case KVM_CAP_HYPERV_SYNIC: case KVM_CAP_HYPERV_SYNIC2: case KVM_CAP_HYPERV_VP_INDEX: @@ -3981,6 +4813,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_CPUID: case KVM_CAP_HYPERV_ENFORCE_CPUID: case KVM_CAP_SYS_HYPERV_CPUID: +#endif case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -3989,8 +4822,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: - case KVM_CAP_READONLY_MEM: - case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: case KVM_CAP_DISABLE_QUIRKS: @@ -3998,9 +4829,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_PMU_EVENT_FILTER: + case KVM_CAP_PMU_EVENT_MASKED_EVENTS: case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_LAST_CPU: case KVM_CAP_X86_USER_SPACE_MSR: @@ -4010,10 +4843,26 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SGX_ATTRIBUTE: #endif case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: + case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: case KVM_CAP_SREGS2: case KVM_CAP_EXIT_ON_EMULATION_FAILURE: + case KVM_CAP_VCPU_ATTRIBUTES: + case KVM_CAP_SYS_ATTRIBUTES: + case KVM_CAP_VAPIC: + case KVM_CAP_ENABLE_CAP: + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + case KVM_CAP_IRQFD_RESAMPLE: + case KVM_CAP_MEMORY_FAULT_INFO: + case KVM_CAP_X86_GUEST_MODE: + case KVM_CAP_ONE_REG: r = 1; break; + case KVM_CAP_PRE_FAULT_MEMORY: + r = tdp_enabled; + break; + case KVM_CAP_X86_APIC_BUS_CYCLES_NS: + r = APIC_BUS_CYCLE_NS_DEFAULT; + break; case KVM_CAP_EXIT_HYPERCALL: r = KVM_EXIT_HYPERCALL_VALID_MASK; break; @@ -4023,24 +4872,29 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_XEN_HVM: r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | - KVM_XEN_HVM_CONFIG_SHARED_INFO; + KVM_XEN_HVM_CONFIG_SHARED_INFO | + KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | + KVM_XEN_HVM_CONFIG_EVTCHN_SEND | + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE | + KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA; if (sched_info_on()) - r |= KVM_XEN_HVM_CONFIG_RUNSTATE; + r |= KVM_XEN_HVM_CONFIG_RUNSTATE | + KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG; break; #endif case KVM_CAP_SYNC_REGS: - r = KVM_SYNC_X86_VALID_FIELDS; + r = kvm_sync_valid_fields(kvm); break; case KVM_CAP_ADJUST_CLOCK: - r = KVM_CLOCK_TSC_STABLE; + r = KVM_CLOCK_VALID_FLAGS; break; case KVM_CAP_X86_DISABLE_EXITS: - r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | - KVM_X86_DISABLE_EXITS_CSTATE; - if(kvm_can_mwait_in_guest()) - r |= KVM_X86_DISABLE_EXITS_MWAIT; + r = kvm_get_allowed_disable_exits(); break; case KVM_CAP_X86_SMM: + if (!IS_ENABLED(CONFIG_KVM_SMM)) + break; + /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, * so do not report SMM to be available if real mode is @@ -4049,19 +4903,18 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); - break; - case KVM_CAP_VAPIC: - r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); + r = kvm_x86_call(has_emulated_msr)(kvm, MSR_IA32_SMBASE); break; case KVM_CAP_NR_VCPUS: - r = KVM_SOFT_MAX_VCPUS; + r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; + if (kvm) + r = kvm->max_vcpus; break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + r = KVM_MAX_VCPU_IDS; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; @@ -4073,7 +4926,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = boot_cpu_has(X86_FEATURE_XSAVE); break; case KVM_CAP_TSC_CONTROL: - r = kvm_has_tsc_control; + case KVM_CAP_VM_TSC_CONTROL: + r = kvm_caps.has_tsc_control; break; case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; @@ -4082,12 +4936,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = kvm_x86_ops.nested_ops->get_state ? kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; break; +#ifdef CONFIG_KVM_HYPERV case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - r = kvm_x86_ops.enable_direct_tlbflush != NULL; + r = kvm_x86_ops.enable_l2_tlb_flush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; +#endif case KVM_CAP_SMALLER_MAXPHYADDR: r = (int) allow_smaller_maxphyaddr; break; @@ -4095,17 +4951,77 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = sched_info_on(); break; case KVM_CAP_X86_BUS_LOCK_EXIT: - if (kvm_has_bus_lock_exit) + if (kvm_caps.has_bus_lock_exit) r = KVM_BUS_LOCK_DETECTION_OFF | KVM_BUS_LOCK_DETECTION_EXIT; else r = 0; break; + case KVM_CAP_XSAVE2: { + r = xstate_required_size(kvm_get_filtered_xcr0(), false); + if (r < sizeof(struct kvm_xsave)) + r = sizeof(struct kvm_xsave); + break; + } + case KVM_CAP_PMU_CAPABILITY: + r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; + break; + case KVM_CAP_DISABLE_QUIRKS2: + r = kvm_caps.supported_quirks; + break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = kvm_caps.has_notify_vmexit; + break; + case KVM_CAP_VM_TYPES: + r = kvm_caps.supported_vm_types; + break; + case KVM_CAP_READONLY_MEM: + r = kvm ? kvm_arch_has_readonly_mem(kvm) : 1; + break; default: break; } return r; +} +static int __kvm_x86_dev_get_attr(struct kvm_device_attr *attr, u64 *val) +{ + if (attr->group) { + if (kvm_x86_ops.dev_get_attr) + return kvm_x86_call(dev_get_attr)(attr->group, attr->attr, val); + return -ENXIO; + } + + switch (attr->attr) { + case KVM_X86_XCOMP_GUEST_SUPP: + *val = kvm_caps.supported_xcr0; + return 0; + default: + return -ENXIO; + } +} + +static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) +{ + u64 __user *uaddr = u64_to_user_ptr(attr->addr); + int r; + u64 val; + + r = __kvm_x86_dev_get_attr(attr, &val); + if (r < 0) + return r; + + if (put_user(val, uaddr)) + return -EFAULT; + + return 0; +} + +static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr) +{ + u64 val; + + return __kvm_x86_dev_get_attr(attr, &val); } long kvm_arch_dev_ioctl(struct file *filp, @@ -4163,8 +5079,8 @@ long kvm_arch_dev_ioctl(struct file *filp, } case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; - if (copy_to_user(argp, &kvm_mce_cap_supported, - sizeof(kvm_mce_cap_supported))) + if (copy_to_user(argp, &kvm_caps.supported_mce_cap, + sizeof(kvm_caps.supported_mce_cap))) goto out; r = 0; break; @@ -4191,11 +5107,29 @@ long kvm_arch_dev_ioctl(struct file *filp, break; } case KVM_GET_MSRS: - r = msr_io(NULL, argp, do_get_msr_feature, 1); + r = msr_io(NULL, argp, do_get_feature_msr, 1); break; +#ifdef CONFIG_KVM_HYPERV case KVM_GET_SUPPORTED_HV_CPUID: r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp); break; +#endif + case KVM_GET_DEVICE_ATTR: { + struct kvm_device_attr attr; + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_x86_dev_get_attr(&attr); + break; + } + case KVM_HAS_DEVICE_ATTR: { + struct kvm_device_attr attr; + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_x86_dev_has_attr(&attr); + break; + } default: r = -EINVAL; break; @@ -4204,28 +5138,46 @@ out: return r; } -static void wbinvd_ipi(void *garbage) -{ - wbinvd(); -} - static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) { return kvm_arch_has_noncoherent_dma(vcpu->kvm); } +static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu); + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + kvm_request_l1tf_flush_l1d(); + + if (vcpu->scheduled_out && pmu->version && pmu->event_count) { + pmu->need_cleanup = true; + kvm_make_request(KVM_REQ_PMU, vcpu); + } + /* Address WBINVD may be executed by guest */ if (need_emulate_wbinvd(vcpu)) { - if (static_call(kvm_x86_has_wbinvd_exit)()) + if (kvm_x86_call(has_wbinvd_exit)()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) - smp_call_function_single(vcpu->cpu, - wbinvd_ipi, NULL, 1); + wbinvd_on_cpu(vcpu->cpu); } - static_call(kvm_x86_vcpu_load)(vcpu, cpu); + kvm_x86_call(vcpu_load)(vcpu, cpu); + + if (vcpu != per_cpu(last_vcpu, cpu)) { + /* + * Flush the branch predictor when switching vCPUs on the same + * physical CPU, as each vCPU needs its own branch prediction + * domain. No IBPB is needed when switching between L1 and L2 + * on the same vCPU unless IBRS is advertised to the vCPU; that + * is handled on the nested VM-Exit path. + */ + if (static_branch_likely(&switch_vcpu_ibpb)) + indirect_branch_prediction_barrier(); + per_cpu(last_vcpu, cpu) = vcpu; + } /* Save host pkru register if supported */ vcpu->arch.host_pkru = read_pkru(); @@ -4247,7 +5199,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u64 offset = kvm_compute_l1_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); - vcpu->arch.tsc_catchup = 1; + if (!vcpu->arch.guest_tsc_protected) + vcpu->arch.tsc_catchup = 1; } if (kvm_lapic_hv_timer_in_use(vcpu)) @@ -4269,60 +5222,87 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + static const u8 preempted = KVM_VCPU_PREEMPTED; + gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; + + /* + * The vCPU can be marked preempted if and only if the VM-Exit was on + * an instruction boundary and will not trigger guest emulation of any + * kind (see vcpu_run). Vendor specific code controls (conservatively) + * when this is true, for example allowing the vCPU to be marked + * preempted if and only if the VM-Exit was due to a host interrupt. + */ + if (!vcpu->arch.at_instruction_boundary) { + vcpu->stat.preemption_other++; + return; + } + vcpu->stat.preemption_reported++; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; if (vcpu->arch.st.preempted) return; - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, - &vcpu->arch.st.cache, true)) + /* This happens on process exit */ + if (unlikely(current->mm != vcpu->kvm->mm)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + slots = kvm_memslots(vcpu->kvm); + + if (unlikely(slots->generation != ghc->generation || + gpa != ghc->gpa || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) + return; - st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + st = (struct kvm_steal_time __user *)ghc->hva; + BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted)); - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); + if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted))) + vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { int idx; - if (vcpu->preempted && !vcpu->arch.guest_state_protected) - vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); + if (vcpu->preempted) { + /* + * Assume protected guests are in-kernel. Inefficient yielding + * due to false positives is preferable to never yielding due + * to false negatives. + */ + vcpu->arch.preempted_in_kernel = vcpu->arch.guest_state_protected || + !kvm_x86_call(get_cpl_no_cache)(vcpu); - /* - * Take the srcu lock as memslots will be accessed to check the gfn - * cache generation against the memslots generation. - */ - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvm_xen_msr_enabled(vcpu->kvm)) - kvm_xen_runstate_set_preempted(vcpu); - else - kvm_steal_time_set_preempted(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (kvm_xen_msr_enabled(vcpu->kvm)) + kvm_xen_runstate_set_preempted(vcpu); + else + kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } - static_call(kvm_x86_vcpu_put)(vcpu); + kvm_x86_call(vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); - /* - * If userspace has set any breakpoints or watchpoints, dr6 is restored - * on every vmexit, but if not, we might have a stale dr6 from the - * guest. do_debug expects dr6 to be cleared after it runs, do the same. - */ - set_debugreg(0, 6); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + if (vcpu->arch.apic->guest_apic_protected) + return -EINVAL; + + kvm_x86_call(sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -4332,6 +5312,9 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, { int r; + if (vcpu->arch.apic->guest_apic_protected) + return -EINVAL; + r = kvm_apic_set_state(vcpu, s); if (r) return r; @@ -4358,8 +5341,17 @@ static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) { - return kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_accept_dm_intr(vcpu); + /* + * Do not cause an interrupt window exit if an exception + * is pending or an event needs reinjection; userspace + * might want to inject the interrupt manually using KVM_SET_REGS + * or KVM_SET_SREGS. For that to work, we must be at an + * instruction boundary and with no events half-injected. + */ + return (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_accept_dm_intr(vcpu) && + !kvm_event_needs_reinjection(vcpu) && + !kvm_is_exception_pending(vcpu)); } static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, @@ -4396,13 +5388,6 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) return 0; } -static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_SMI, vcpu); - - return 0; -} - static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { @@ -4421,22 +5406,63 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; /* Init IA32_MCG_CTL to all 1s */ if (mcg_cap & MCG_CTL_P) vcpu->arch.mcg_ctl = ~(u64)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) + /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */ + for (bank = 0; bank < bank_num; bank++) { vcpu->arch.mce_banks[bank*4] = ~(u64)0; + if (mcg_cap & MCG_CMCI_P) + vcpu->arch.mci_ctl2_banks[bank] = 0; + } + + kvm_apic_after_set_mcg_cap(vcpu); - static_call(kvm_x86_setup_mce)(vcpu); + kvm_x86_call(setup_mce)(vcpu); out: return r; } +/* + * Validate this is an UCNA (uncorrectable no action) error by checking the + * MCG_STATUS and MCi_STATUS registers: + * - none of the bits for Machine Check Exceptions are set + * - both the VAL (valid) and UC (uncorrectable) bits are set + * MCI_STATUS_PCC - Processor Context Corrupted + * MCI_STATUS_S - Signaled as a Machine Check Exception + * MCI_STATUS_AR - Software recoverable Action Required + */ +static bool is_ucna(struct kvm_x86_mce *mce) +{ + return !mce->mcg_status && + !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) && + (mce->status & MCI_STATUS_VAL) && + (mce->status & MCI_STATUS_UC); +} + +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + + banks[1] = mce->status; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + + if (!(mcg_cap & MCG_CMCI_P) || + !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN)) + return 0; + + if (lapic_in_kernel(vcpu)) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI); + + return 0; +} + static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce) { @@ -4446,6 +5472,12 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) return -EINVAL; + + banks += array_index_nospec(4 * mce->bank, 4 * bank_num); + + if (is_ucna(mce)) + return kvm_vcpu_x86_set_ucna(vcpu, mce, banks); + /* * if IA32_MCG_CTL is not all 1s, the uncorrected error * reporting is disabled @@ -4453,7 +5485,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && vcpu->arch.mcg_ctl != ~(u64)0) return 0; - banks += 4 * mce->bank; /* * if IA32_MCi_CTL is not all 1s, the uncorrected error * reporting is disabled for the bank @@ -4462,7 +5493,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { + !kvm_is_cr4_bit_set(vcpu, X86_CR4_MCE)) { kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } @@ -4488,25 +5519,42 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { + struct kvm_queued_exception *ex; + process_nmi(vcpu); +#ifdef CONFIG_KVM_SMM if (kvm_check_request(KVM_REQ_SMI, vcpu)) process_smi(vcpu); +#endif /* - * In guest mode, payload delivery should be deferred, - * so that the L1 hypervisor can intercept #PF before - * CR2 is modified (or intercept #DB before DR6 is - * modified under nVMX). Unless the per-VM capability, - * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of - * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we - * opportunistically defer the exception payload, deliver it if the - * capability hasn't been requested before processing a - * KVM_GET_VCPU_EVENTS. + * KVM's ABI only allows for one exception to be migrated. Luckily, + * the only time there can be two queued exceptions is if there's a + * non-exiting _injected_ exception, and a pending exiting exception. + * In that case, ignore the VM-Exiting exception as it's an extension + * of the injected exception. + */ + if (vcpu->arch.exception_vmexit.pending && + !vcpu->arch.exception.pending && + !vcpu->arch.exception.injected) + ex = &vcpu->arch.exception_vmexit; + else + ex = &vcpu->arch.exception; + + /* + * In guest mode, payload delivery should be deferred if the exception + * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1 + * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability, + * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not + * propagate the payload and so it cannot be safely deferred. Deliver + * the payload if the capability hasn't been requested. */ if (!vcpu->kvm->arch.exception_payload_enabled && - vcpu->arch.exception.pending && vcpu->arch.exception.has_payload) - kvm_deliver_exception_payload(vcpu); + ex->pending && ex->has_payload) + kvm_deliver_exception_payload(vcpu, ex); + + memset(events, 0, sizeof(*events)); /* * The API doesn't provide the instruction length for software @@ -4514,44 +5562,40 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, * isn't advanced, we should expect to encounter the exception * again. */ - if (kvm_exception_is_soft(vcpu->arch.exception.nr)) { - events->exception.injected = 0; - events->exception.pending = 0; - } else { - events->exception.injected = vcpu->arch.exception.injected; - events->exception.pending = vcpu->arch.exception.pending; + if (!kvm_exception_is_soft(ex->vector)) { + events->exception.injected = ex->injected; + events->exception.pending = ex->pending; /* * For ABI compatibility, deliberately conflate * pending and injected exceptions when * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled. */ if (!vcpu->kvm->arch.exception_payload_enabled) - events->exception.injected |= - vcpu->arch.exception.pending; + events->exception.injected |= ex->pending; } - events->exception.nr = vcpu->arch.exception.nr; - events->exception.has_error_code = vcpu->arch.exception.has_error_code; - events->exception.error_code = vcpu->arch.exception.error_code; - events->exception_has_payload = vcpu->arch.exception.has_payload; - events->exception_payload = vcpu->arch.exception.payload; + events->exception.nr = ex->vector; + events->exception.has_error_code = ex->has_error_code; + events->exception.error_code = ex->error_code; + events->exception_has_payload = ex->has_payload; + events->exception_payload = ex->payload; events->interrupt.injected = vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; - events->interrupt.soft = 0; - events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); + events->interrupt.shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; - events->nmi.pending = vcpu->arch.nmi_pending != 0; - events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); - events->nmi.pad = 0; + events->nmi.pending = kvm_get_nr_pending_nmis(vcpu); + events->nmi.masked = kvm_x86_call(get_nmi_mask)(vcpu); - events->sipi_vector = 0; /* never valid when reporting to user space */ + /* events->sipi_vector is never valid when reporting to user space */ +#ifdef CONFIG_KVM_SMM events->smi.smm = is_smm(vcpu); events->smi.pending = vcpu->arch.smi_pending; events->smi.smm_inside_nmi = !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); +#endif events->smi.latched_init = kvm_lapic_latched_init(vcpu); events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING @@ -4559,12 +5603,12 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SMM); if (vcpu->kvm->arch.exception_payload_enabled) events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; - - memset(&events->reserved, 0, sizeof(events->reserved)); + if (vcpu->kvm->arch.triple_fault_event) { + events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); + events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + } } -static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm); - static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { @@ -4572,7 +5616,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM - | KVM_VCPUEVENT_VALID_PAYLOAD)) + | KVM_VCPUEVENT_VALID_PAYLOAD + | KVM_VCPUEVENT_VALID_TRIPLE_FAULT)) return -EINVAL; if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { @@ -4591,16 +5636,23 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) return -EINVAL; - /* INITs are latched while in SMM */ - if (events->flags & KVM_VCPUEVENT_VALID_SMM && - (events->smi.smm || events->smi.pending) && - vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) - return -EINVAL; - process_nmi(vcpu); + + /* + * Flag that userspace is stuffing an exception, the next KVM_RUN will + * morph the exception to a VM-Exit if appropriate. Do this only for + * pending exceptions, already-injected exceptions are not subject to + * intercpetion. Note, userspace that conflates pending and injected + * is hosed, and will incorrectly convert an injected exception into a + * pending exception, which in turn may cause a spurious VM-Exit. + */ + vcpu->arch.exception_from_userspace = events->exception.pending; + + vcpu->arch.exception_vmexit.pending = false; + vcpu->arch.exception.injected = events->exception.injected; vcpu->arch.exception.pending = events->exception.pending; - vcpu->arch.exception.nr = events->exception.nr; + vcpu->arch.exception.vector = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; vcpu->arch.exception.error_code = events->exception.error_code; vcpu->arch.exception.has_payload = events->exception_has_payload; @@ -4610,21 +5662,28 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) - static_call(kvm_x86_set_interrupt_shadow)(vcpu, - events->interrupt.shadow); + kvm_x86_call(set_interrupt_shadow)(vcpu, + events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; - if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) - vcpu->arch.nmi_pending = events->nmi.pending; - static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); + if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) { + vcpu->arch.nmi_pending = 0; + atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending); + if (events->nmi.pending) + kvm_make_request(KVM_REQ_NMI, vcpu); + } + kvm_x86_call(set_nmi_mask)(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { - if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) +#ifdef CONFIG_KVM_SMM + if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { + kvm_leave_nested(vcpu); kvm_smm_changed(vcpu, events->smi.smm); + } vcpu->arch.smi_pending = events->smi.pending; @@ -4635,6 +5694,12 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; } +#else + if (events->smi.smm || events->smi.pending || + events->smi.smm_inside_nmi) + return -EINVAL; +#endif + if (lapic_in_kernel(vcpu)) { if (events->smi.latched_init) set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); @@ -4643,27 +5708,49 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, } } + if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { + if (!vcpu->kvm->arch.triple_fault_event) + return -EINVAL; + if (events->triple_fault.pending) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + else + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } -static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, - struct kvm_debugregs *dbgregs) +static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs) { - unsigned long val; + unsigned int i; - memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); - kvm_get_dr(vcpu, 6, &val); - dbgregs->dr6 = val; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + + memset(dbgregs, 0, sizeof(*dbgregs)); + + BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db)); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++) + dbgregs->db[i] = vcpu->arch.db[i]; + + dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; - dbgregs->flags = 0; - memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); + return 0; } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { + unsigned int i; + + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + if (dbgregs->flags) return -EINVAL; @@ -4672,7 +5759,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, if (!kvm_dr7_valid(dbgregs->dr7)) return -EINVAL; - memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++) + vcpu->arch.db[i] = dbgregs->db[i]; + kvm_update_dr0123(vcpu); vcpu->arch.dr6 = dbgregs->dr6; vcpu->arch.dr7 = dbgregs->dr7; @@ -4681,158 +5770,69 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } -#define XSTATE_COMPACTION_ENABLED (1ULL << 63) -static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) +static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, + u8 *state, unsigned int size) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = xsave->header.xfeatures; - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(dest, xsave, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV */ - xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; - *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; - /* - * Copy each region from the possibly compacted offset to the - * non-compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u32 size, offset, ecx, edx; - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - void *src; - - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - - if (xfeature_nr == XFEATURE_PKRU) { - memcpy(dest + offset, &vcpu->arch.pkru, - sizeof(vcpu->arch.pkru)); - } else { - src = get_xsave_addr(xsave, xfeature_nr); - if (src) - memcpy(dest + offset, src, size); - } - - valid -= xfeature_mask; - } -} - -static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) -{ - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(xsave, src, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV and possibly XCOMP_BV. */ - xsave->header.xfeatures = xstate_bv; - if (boot_cpu_has(X86_FEATURE_XSAVES)) - xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Copy each region from the non-compacted offset to the - * possibly compacted offset. + * Only copy state for features that are enabled for the guest. The + * state itself isn't problematic, but setting bits in the header for + * features that are supported in *this* host but not exposed to the + * guest can result in KVM_SET_XSAVE failing when live migrating to a + * compatible host without the features that are NOT exposed to the + * guest. + * + * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if + * XSAVE/XCRO are not exposed to the guest, and even if XSAVE isn't + * supported by the host. */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u32 size, offset, ecx, edx; - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); + u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 | + XFEATURE_MASK_FPSSE; - if (xfeature_nr == XFEATURE_PKRU) { - memcpy(&vcpu->arch.pkru, src + offset, - sizeof(vcpu->arch.pkru)); - } else { - void *dest = get_xsave_addr(xsave, xfeature_nr); + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; - if (dest) - memcpy(dest, src + offset, size); - } - - valid -= xfeature_mask; - } + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size, + supported_xcr0, vcpu->arch.pkru); + return 0; } -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) +static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) { - if (!vcpu->arch.guest_fpu) - return; - - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - memset(guest_xsave, 0, sizeof(struct kvm_xsave)); - fill_xsave((u8 *) guest_xsave->region, vcpu); - } else { - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu->state.fxsave, - sizeof(struct fxregs_state)); - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XFEATURE_MASK_FPSSE; - } + return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, + sizeof(guest_xsave->region)); } -#define XSAVE_MXCSR_OFFSET 24 - static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - u64 xstate_bv; - u32 mxcsr; - - if (!vcpu->arch.guest_fpu) - return 0; - - xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - /* - * Here we allow setting states that are not present in - * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility - * with old userspace. - */ - if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - load_xsave(vcpu, (u8 *)guest_xsave->region); - } else { - if (xstate_bv & ~XFEATURE_MASK_FPSSE || - mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - memcpy(&vcpu->arch.guest_fpu->state.fxsave, - guest_xsave->region, sizeof(struct fxregs_state)); - } - return 0; + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, + guest_xsave->region, + kvm_caps.supported_xcr0, + &vcpu->arch.pkru); } -static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, - struct kvm_xcrs *guest_xcrs) +static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) { + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + if (!boot_cpu_has(X86_FEATURE_XSAVE)) { guest_xcrs->nr_xcrs = 0; - return; + return 0; } guest_xcrs->nr_xcrs = 1; guest_xcrs->flags = 0; guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; + return 0; } static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, @@ -4840,6 +5840,10 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, { int i, r = 0; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -EINVAL; @@ -4866,24 +5870,124 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, */ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.pv_time_enabled) + if (!vcpu->arch.pv_time.active) return -EINVAL; vcpu->arch.pvclock_set_guest_stopped_request = true; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); return 0; } -static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, - struct kvm_enable_cap *cap) +static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) { int r; - uint16_t vmcs_version; - void __user *user_ptr; + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: + r = 0; + break; + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *uaddr = u64_to_user_ptr(attr->addr); + int r; + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: + r = -EFAULT; + if (put_user(vcpu->arch.l1_tsc_offset, uaddr)) + break; + r = 0; + break; + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *uaddr = u64_to_user_ptr(attr->addr); + struct kvm *kvm = vcpu->kvm; + int r; + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: { + u64 offset, tsc, ns; + unsigned long flags; + bool matched; + + r = -EFAULT; + if (get_user(offset, uaddr)) + break; + + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + + matched = (vcpu->arch.virtual_tsc_khz && + kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz && + kvm->arch.last_tsc_offset == offset); + + tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; + ns = get_kvmclock_base_ns(); + + __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched, true); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + r = 0; + break; + } + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu, + unsigned int ioctl, + void __user *argp) +{ + struct kvm_device_attr attr; + int r; + + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + if (attr.group != KVM_VCPU_TSC_CTRL) + return -ENXIO; + + switch (ioctl) { + case KVM_HAS_DEVICE_ATTR: + r = kvm_arch_tsc_has_attr(vcpu, &attr); + break; + case KVM_GET_DEVICE_ATTR: + r = kvm_arch_tsc_get_attr(vcpu, &attr); + break; + case KVM_SET_DEVICE_ATTR: + r = kvm_arch_tsc_set_attr(vcpu, &attr); + break; + } + + return r; +} + +static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, + struct kvm_enable_cap *cap) +{ if (cap->flags) return -EINVAL; switch (cap->cap) { +#ifdef CONFIG_KVM_HYPERV case KVM_CAP_HYPERV_SYNIC2: if (cap->args[0]) return -EINVAL; @@ -4895,36 +5999,168 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - if (!kvm_x86_ops.nested_ops->enable_evmcs) - return -ENOTTY; - r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); - if (!r) { - user_ptr = (void __user *)(uintptr_t)cap->args[0]; - if (copy_to_user(user_ptr, &vmcs_version, - sizeof(vmcs_version))) - r = -EFAULT; + { + int r; + uint16_t vmcs_version; + void __user *user_ptr; + + if (!kvm_x86_ops.nested_ops->enable_evmcs) + return -ENOTTY; + r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); + if (!r) { + user_ptr = (void __user *)(uintptr_t)cap->args[0]; + if (copy_to_user(user_ptr, &vmcs_version, + sizeof(vmcs_version))) + r = -EFAULT; + } + return r; } - return r; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - if (!kvm_x86_ops.enable_direct_tlbflush) + if (!kvm_x86_ops.enable_l2_tlb_flush) return -ENOTTY; - return static_call(kvm_x86_enable_direct_tlbflush)(vcpu); + return kvm_x86_call(enable_l2_tlb_flush)(vcpu); case KVM_CAP_HYPERV_ENFORCE_CPUID: return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]); +#endif case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: vcpu->arch.pv_cpuid.enforce = cap->args[0]; - if (vcpu->arch.pv_cpuid.enforce) - kvm_update_pv_runtime(vcpu); - return 0; default: return -EINVAL; } } +struct kvm_x86_reg_id { + __u32 index; + __u8 type; + __u8 rsvd1; + __u8 rsvd2:4; + __u8 size:4; + __u8 x86; +}; + +static int kvm_translate_kvm_reg(struct kvm_vcpu *vcpu, + struct kvm_x86_reg_id *reg) +{ + switch (reg->index) { + case KVM_REG_GUEST_SSP: + /* + * FIXME: If host-initiated accesses are ever exempted from + * ignore_msrs (in kvm_do_msr_access()), drop this manual check + * and rely on KVM's standard checks to reject accesses to regs + * that don't exist. + */ + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return -EINVAL; + + reg->type = KVM_X86_REG_TYPE_MSR; + reg->index = MSR_KVM_INTERNAL_GUEST_SSP; + break; + default: + return -EINVAL; + } + return 0; +} + +static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) +{ + u64 val; + + if (do_get_msr(vcpu, msr, &val)) + return -EINVAL; + + if (put_user(val, user_val)) + return -EFAULT; + + return 0; +} + +static int kvm_set_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) +{ + u64 val; + + if (get_user(val, user_val)) + return -EFAULT; + + if (do_set_msr(vcpu, msr, &val)) + return -EINVAL; + + return 0; +} + +static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, + void __user *argp) +{ + struct kvm_one_reg one_reg; + struct kvm_x86_reg_id *reg; + u64 __user *user_val; + bool load_fpu; + int r; + + if (copy_from_user(&one_reg, argp, sizeof(one_reg))) + return -EFAULT; + + if ((one_reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86) + return -EINVAL; + + reg = (struct kvm_x86_reg_id *)&one_reg.id; + if (reg->rsvd1 || reg->rsvd2) + return -EINVAL; + + if (reg->type == KVM_X86_REG_TYPE_KVM) { + r = kvm_translate_kvm_reg(vcpu, reg); + if (r) + return r; + } + + if (reg->type != KVM_X86_REG_TYPE_MSR) + return -EINVAL; + + if ((one_reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64) + return -EINVAL; + + guard(srcu)(&vcpu->kvm->srcu); + + load_fpu = is_xstate_managed_msr(vcpu, reg->index); + if (load_fpu) + kvm_load_guest_fpu(vcpu); + + user_val = u64_to_user_ptr(one_reg.addr); + if (ioctl == KVM_GET_ONE_REG) + r = kvm_get_one_msr(vcpu, reg->index, user_val); + else + r = kvm_set_one_msr(vcpu, reg->index, user_val); + + if (load_fpu) + kvm_put_guest_fpu(vcpu); + return r; +} + +static int kvm_get_reg_list(struct kvm_vcpu *vcpu, + struct kvm_reg_list __user *user_list) +{ + u64 nr_regs = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ? 1 : 0; + u64 user_nr_regs; + + if (get_user(user_nr_regs, &user_list->n)) + return -EFAULT; + + if (put_user(nr_regs, &user_list->n)) + return -EFAULT; + + if (user_nr_regs < nr_regs) + return -E2BIG; + + if (nr_regs && + put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0])) + return -EFAULT; + + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4947,8 +6183,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; - u.lapic = kzalloc(sizeof(struct kvm_lapic_state), - GFP_KERNEL_ACCOUNT); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; if (!u.lapic) @@ -4989,7 +6224,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SMI: { - r = kvm_vcpu_ioctl_smi(vcpu); + r = kvm_inject_smi(vcpu); break; } case KVM_SET_CPUID: { @@ -5042,6 +6277,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp, srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } + case KVM_GET_ONE_REG: + case KVM_SET_ONE_REG: + r = kvm_get_set_one_reg(vcpu, ioctl, argp); + break; + case KVM_GET_REG_LIST: + r = kvm_get_reg_list(vcpu, argp); + break; case KVM_TPR_ACCESS_REPORTING: { struct kvm_tpr_access_ctl tac; @@ -5108,13 +6350,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) break; + kvm_vcpu_srcu_read_lock(vcpu); r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); + kvm_vcpu_srcu_read_unlock(vcpu); break; } case KVM_GET_DEBUGREGS: { struct kvm_debugregs dbgregs; - kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); + r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); + if (r < 0) + break; r = -EFAULT; if (copy_to_user(argp, &dbgregs, @@ -5135,12 +6381,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { - u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); + r = -EINVAL; + if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave)) + break; + + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); r = -ENOMEM; if (!u.xsave) break; - kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); + r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); + if (r < 0) + break; r = -EFAULT; if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) @@ -5149,7 +6401,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XSAVE: { - u.xsave = memdup_user(argp, sizeof(*u.xsave)); + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = memdup_user(argp, size); if (IS_ERR(u.xsave)) { r = PTR_ERR(u.xsave); goto out_nofree; @@ -5158,13 +6412,36 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } + + case KVM_GET_XSAVE2: { + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = kzalloc(size, GFP_KERNEL); + r = -ENOMEM; + if (!u.xsave) + break; + + r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size); + if (r < 0) + break; + + r = -EFAULT; + if (copy_to_user(argp, u.xsave, size)) + break; + + r = 0; + break; + } + case KVM_GET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); r = -ENOMEM; if (!u.xcrs) break; - kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); + r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); + if (r < 0) + break; r = -EFAULT; if (copy_to_user(argp, u.xcrs, @@ -5187,10 +6464,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_tsc_khz; r = -EINVAL; + + if (vcpu->arch.guest_tsc_protected) + goto out; + user_tsc_khz = (u32)arg; - if (kvm_has_tsc_control && - user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -5280,9 +6561,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } +#ifdef CONFIG_KVM_HYPERV case KVM_GET_SUPPORTED_HV_CPUID: r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp); break; +#endif #ifdef CONFIG_KVM_XEN case KVM_XEN_VCPU_GET_ATTR: { struct kvm_xen_vcpu_attr xva; @@ -5306,6 +6589,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } #endif case KVM_GET_SREGS2: { + r = -EINVAL; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + goto out; + u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL); r = -ENOMEM; if (!u.sregs2) @@ -5318,6 +6606,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_SREGS2: { + r = -EINVAL; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + goto out; + u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2)); if (IS_ERR(u.sregs2)) { r = PTR_ERR(u.sregs2); @@ -5327,6 +6620,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = __set_sregs2(vcpu, u.sregs2); break; } + case KVM_HAS_DEVICE_ATTR: + case KVM_GET_DEVICE_ATTR: + case KVM_SET_DEVICE_ATTR: + r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); + break; + case KVM_MEMORY_ENCRYPT_OP: + r = -ENOTTY; + if (!kvm_x86_ops.vcpu_mem_enc_ioctl) + goto out; + r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp); + break; default: r = -EINVAL; } @@ -5348,14 +6652,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) if (addr > (unsigned int)(-3 * PAGE_SIZE)) return -EINVAL; - ret = static_call(kvm_x86_set_tss_addr)(kvm, addr); + ret = kvm_x86_call(set_tss_addr)(kvm, addr); return ret; } static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { - return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr); + return kvm_x86_call(set_identity_map_addr)(kvm, ident_addr); } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, @@ -5373,140 +6677,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return 0; } -static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) -{ - return kvm->arch.n_max_mmu_pages; -} - -static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - struct kvm_pic *pic = kvm->arch.vpic; - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, &pic->pics[0], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, &pic->pics[1], - sizeof(struct kvm_pic_state)); - break; - case KVM_IRQCHIP_IOAPIC: - kvm_get_ioapic(kvm, &chip->chip.ioapic); - break; - default: - r = -EINVAL; - break; - } - return r; -} - -static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) -{ - struct kvm_pic *pic = kvm->arch.vpic; - int r; - - r = 0; - switch (chip->chip_id) { - case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic->lock); - memcpy(&pic->pics[0], &chip->chip.pic, - sizeof(struct kvm_pic_state)); - spin_unlock(&pic->lock); - break; - case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic->lock); - memcpy(&pic->pics[1], &chip->chip.pic, - sizeof(struct kvm_pic_state)); - spin_unlock(&pic->lock); - break; - case KVM_IRQCHIP_IOAPIC: - kvm_set_ioapic(kvm, &chip->chip.ioapic); - break; - default: - r = -EINVAL; - break; - } - kvm_pic_update_irq(pic); - return r; -} - -static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; - - BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); - - mutex_lock(&kps->lock); - memcpy(ps, &kps->channels, sizeof(*ps)); - mutex_unlock(&kps->lock); - return 0; -} - -static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) -{ - int i; - struct kvm_pit *pit = kvm->arch.vpit; - - mutex_lock(&pit->pit_state.lock); - memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); - for (i = 0; i < 3; i++) - kvm_pit_load_count(pit, i, ps->channels[i].count, 0); - mutex_unlock(&pit->pit_state.lock); - return 0; -} - -static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, - sizeof(ps->channels)); - ps->flags = kvm->arch.vpit->pit_state.flags; - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - memset(&ps->reserved, 0, sizeof(ps->reserved)); - return 0; -} - -static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) -{ - int start = 0; - int i; - u32 prev_legacy, cur_legacy; - struct kvm_pit *pit = kvm->arch.vpit; - - mutex_lock(&pit->pit_state.lock); - prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; - cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; - if (!prev_legacy && cur_legacy) - start = 1; - memcpy(&pit->pit_state.channels, &ps->channels, - sizeof(pit->pit_state.channels)); - pit->pit_state.flags = ps->flags; - for (i = 0; i < 3; i++) - kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, - start && i == 0); - mutex_unlock(&pit->pit_state.lock); - return 0; -} - -static int kvm_vm_ioctl_reinject(struct kvm *kvm, - struct kvm_reinject_control *control) -{ - struct kvm_pit *pit = kvm->arch.vpit; - - /* pit->pit_state.lock was overloaded to prevent userspace from getting - * an inconsistent state after running multiple KVM_REINJECT_CONTROL - * ioctls in parallel. Use a separate lock if that ioctl isn't rare. - */ - mutex_lock(&pit->pit_state.lock); - kvm_pit_set_reinject(pit, control->pit_reinject); - mutex_unlock(&pit->pit_state.lock); - - return 0; -} - void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -5517,24 +6687,15 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) * VM-Exit. */ struct kvm_vcpu *vcpu; - int i; + unsigned long i; + + if (!kvm->arch.cpu_dirty_log_size) + return; kvm_for_each_vcpu(i, vcpu, kvm) kvm_vcpu_kick(vcpu); } -int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, - bool line_status) -{ - if (!irqchip_in_kernel(kvm)) - return -ENXIO; - - irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event->irq, irq_event->level, - line_status); - return 0; -} - int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -5544,8 +6705,13 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, return -EINVAL; switch (cap->cap) { + case KVM_CAP_DISABLE_QUIRKS2: + r = -EINVAL; + if (cap->args[0] & ~kvm_caps.supported_quirks) + break; + fallthrough; case KVM_CAP_DISABLE_QUIRKS: - kvm->arch.disabled_quirks = cap->args[0]; + kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks; r = 0; break; case KVM_CAP_SPLIT_IRQCHIP: { @@ -5558,13 +6724,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; if (kvm->created_vcpus) goto split_irqchip_unlock; - r = kvm_setup_empty_irq_routing(kvm); - if (r) - goto split_irqchip_unlock; /* Pairs with irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT); r = 0; split_irqchip_unlock: mutex_unlock(&kvm->lock); @@ -5584,19 +6748,26 @@ split_irqchip_unlock: break; case KVM_CAP_X86_DISABLE_EXITS: r = -EINVAL; - if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) + if (cap->args[0] & ~kvm_get_allowed_disable_exits()) break; - if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && - kvm_can_mwait_in_guest()) - kvm->arch.mwait_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) - kvm->arch.hlt_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) - kvm->arch.pause_in_guest = true; - if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) - kvm->arch.cstate_in_guest = true; + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) + goto disable_exits_unlock; + +#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \ + "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests." + + if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) && + cpu_smt_possible() && + (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE | + KVM_X86_DISABLE_EXITS_APERFMPERF))) + pr_warn_once(SMT_RSB_MSG); + + kvm_disable_exits(kvm, cap->args[0]); r = 0; +disable_exits_unlock: + mutex_unlock(&kvm->lock); break; case KVM_CAP_MSR_PLATFORM_INFO: kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; @@ -5606,7 +6777,14 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: + kvm->arch.triple_fault_event = cap->args[0]; + r = 0; + break; case KVM_CAP_X86_USER_SPACE_MSR: + r = -EINVAL; + if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK) + break; kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; break; @@ -5619,7 +6797,7 @@ split_irqchip_unlock: (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) break; - if (kvm_has_bus_lock_exit && + if (kvm_caps.has_bus_lock_exit && cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) kvm->arch.bus_lock_detection_enabled = true; r = 0; @@ -5643,9 +6821,18 @@ split_irqchip_unlock: #endif case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: r = -EINVAL; - if (kvm_x86_ops.vm_copy_enc_context_from) - r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]); - return r; + if (!kvm_x86_ops.vm_copy_enc_context_from) + break; + + r = kvm_x86_call(vm_copy_enc_context_from)(kvm, cap->args[0]); + break; + case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: + r = -EINVAL; + if (!kvm_x86_ops.vm_move_enc_context_from) + break; + + r = kvm_x86_call(vm_move_enc_context_from)(kvm, cap->args[0]); + break; case KVM_CAP_EXIT_HYPERCALL: if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { r = -EINVAL; @@ -5661,6 +6848,103 @@ split_irqchip_unlock: kvm->arch.exit_on_emulation_error = cap->args[0]; r = 0; break; + case KVM_CAP_PMU_CAPABILITY: + r = -EINVAL; + if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK)) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE); + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_MAX_VCPU_ID: + r = -EINVAL; + if (cap->args[0] > KVM_MAX_VCPU_IDS) + break; + + mutex_lock(&kvm->lock); + if (kvm->arch.bsp_vcpu_id > cap->args[0]) { + ; + } else if (kvm->arch.max_vcpu_ids == cap->args[0]) { + r = 0; + } else if (!kvm->arch.max_vcpu_ids) { + kvm->arch.max_vcpu_ids = cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = -EINVAL; + if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS) + break; + if (!kvm_caps.has_notify_vmexit) + break; + if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED)) + break; + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.notify_window = cap->args[0] >> 32; + kvm->arch.notify_vmexit_flags = (u32)cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + r = -EINVAL; + + /* + * Since the risk of disabling NX hugepages is a guest crashing + * the system, ensure the userspace process has permission to + * reboot the system. + * + * Note that unlike the reboot() syscall, the process must have + * this capability in the root namespace because exposing + * /dev/kvm into a container does not limit the scope of the + * iTLB multihit bug to that container. In other words, + * this must use capable(), not ns_capable(). + */ + if (!capable(CAP_SYS_BOOT)) { + r = -EPERM; + break; + } + + if (cap->args[0]) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.disable_nx_huge_pages = true; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_X86_APIC_BUS_CYCLES_NS: { + u64 bus_cycle_ns = cap->args[0]; + u64 unused; + + /* + * Guard against overflow in tmict_to_ns(). 128 is the highest + * divide value that can be programmed in APIC_TDCR. + */ + r = -EINVAL; + if (!bus_cycle_ns || + check_mul_overflow((u64)U32_MAX * 128, bus_cycle_ns, &unused)) + break; + + r = 0; + mutex_lock(&kvm->lock); + if (!irqchip_in_kernel(kvm)) + r = -ENXIO; + else if (kvm->created_vcpus) + r = -EINVAL; + else + kvm->arch.apic_bus_cycle_ns = bus_cycle_ns; + mutex_unlock(&kvm->lock); + break; + } default: r = -EINVAL; break; @@ -5696,13 +6980,13 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, struct kvm_msr_filter_range *user_range) { - unsigned long *bitmap = NULL; + unsigned long *bitmap; size_t bitmap_size; if (!user_range->nmsrs) return 0; - if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) + if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK) return -EINVAL; if (!user_range->flags) @@ -5727,23 +7011,22 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, return 0; } -static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, + struct kvm_msr_filter *filter) { - struct kvm_msr_filter __user *user_msr_filter = argp; struct kvm_x86_msr_filter *new_filter, *old_filter; - struct kvm_msr_filter filter; bool default_allow; bool empty = true; - int r = 0; + int r; u32 i; - if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) - return -EFAULT; + if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK) + return -EINVAL; - for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) - empty &= !filter.ranges[i].nmsrs; + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) + empty &= !filter->ranges[i].nmsrs; - default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); + default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY); if (empty && !default_allow) return -EINVAL; @@ -5751,8 +7034,8 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) if (!new_filter) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { - r = kvm_add_msr_filter(new_filter, &filter.ranges[i]); + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) { + r = kvm_add_msr_filter(new_filter, &filter->ranges[i]); if (r) { kvm_free_msr_filter(new_filter); return r; @@ -5760,42 +7043,92 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) } mutex_lock(&kvm->lock); - - /* The per-VM filter is protected by kvm->lock... */ - old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); - - rcu_assign_pointer(kvm->arch.msr_filter, new_filter); + old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter, + mutex_is_locked(&kvm->lock)); + mutex_unlock(&kvm->lock); synchronize_srcu(&kvm->srcu); kvm_free_msr_filter(old_filter); - kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); - mutex_unlock(&kvm->lock); + /* + * Recalc MSR intercepts as userspace may want to intercept accesses to + * MSRs that KVM would otherwise pass through to the guest. + */ + kvm_make_all_cpus_request(kvm, KVM_REQ_RECALC_INTERCEPTS); return 0; } +#ifdef CONFIG_KVM_COMPAT +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range_compat { + __u32 flags; + __u32 nmsrs; + __u32 base; + __u32 bitmap; +}; + +struct kvm_msr_filter_compat { + __u32 flags; + struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; + +#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat) + +long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct kvm *kvm = filp->private_data; + long r = -ENOTTY; + + switch (ioctl) { + case KVM_X86_SET_MSR_FILTER_COMPAT: { + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter_compat filter_compat; + struct kvm_msr_filter filter; + int i; + + if (copy_from_user(&filter_compat, user_msr_filter, + sizeof(filter_compat))) + return -EFAULT; + + filter.flags = filter_compat.flags; + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { + struct kvm_msr_filter_range_compat *cr; + + cr = &filter_compat.ranges[i]; + filter.ranges[i] = (struct kvm_msr_filter_range) { + .flags = cr->flags, + .nmsrs = cr->nmsrs, + .base = cr->base, + .bitmap = (__u8 *)(ulong)cr->bitmap, + }; + } + + r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); + break; + } + } + + return r; +} +#endif + #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER static int kvm_arch_suspend_notifier(struct kvm *kvm) { struct kvm_vcpu *vcpu; - int i, ret = 0; + unsigned long i; - mutex_lock(&kvm->lock); - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!vcpu->arch.pv_time_enabled) - continue; - - ret = kvm_set_guest_paused(vcpu); - if (ret) { - kvm_err("Failed to pause guest VCPU%d: %d\n", - vcpu->vcpu_id, ret); - break; - } - } - mutex_unlock(&kvm->lock); + /* + * Ignore the return, marking the guest paused only "fails" if the vCPU + * isn't using kvmclock; continuing on is correct and desirable. + */ + kvm_for_each_vcpu(i, vcpu, kvm) + (void)kvm_set_guest_paused(vcpu); - return ret ? NOTIFY_BAD : NOTIFY_DONE; + return NOTIFY_DONE; } int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) @@ -5810,15 +7143,86 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) } #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ -long kvm_arch_vm_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp) +{ + struct kvm_clock_data data = { 0 }; + + get_kvmclock(kvm, &data); + if (copy_to_user(argp, &data, sizeof(data))) + return -EFAULT; + + return 0; +} + +static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) +{ + struct kvm_arch *ka = &kvm->arch; + struct kvm_clock_data data; + u64 now_raw_ns; + + if (copy_from_user(&data, argp, sizeof(data))) + return -EFAULT; + + /* + * Only KVM_CLOCK_REALTIME is used, but allow passing the + * result of KVM_GET_CLOCK back to KVM_SET_CLOCK. + */ + if (data.flags & ~KVM_CLOCK_VALID_FLAGS) + return -EINVAL; + + kvm_hv_request_tsc_page_update(kvm); + kvm_start_pvclock_update(kvm); + pvclock_update_vm_gtod_copy(kvm); + + /* + * This pairs with kvm_guest_time_update(): when masterclock is + * in use, we use master_kernel_ns + kvmclock_offset to set + * unsigned 'system_time' so if we use get_kvmclock_ns() (which + * is slightly ahead) here we risk going negative on unsigned + * 'system_time' when 'data.clock' is very small. + */ + if (data.flags & KVM_CLOCK_REALTIME) { + u64 now_real_ns = ktime_get_real_ns(); + + /* + * Avoid stepping the kvmclock backwards. + */ + if (now_real_ns > data.realtime) + data.clock += now_real_ns - data.realtime; + } + + if (ka->use_master_clock) + now_raw_ns = ka->master_kernel_ns; + else + now_raw_ns = get_kvmclock_base_ns(); + ka->kvmclock_offset = data.clock - now_raw_ns; + kvm_end_pvclock_update(kvm); + return 0; +} + +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + + if (ioctl == KVM_MEMORY_ENCRYPT_OP && + kvm_x86_ops.vcpu_mem_enc_unlocked_ioctl) + return kvm_x86_call(vcpu_mem_enc_unlocked_ioctl)(vcpu, argp); + + return -ENOIOCTLCMD; +} + +int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r = -ENOTTY; + +#ifdef CONFIG_KVM_IOAPIC /* * This union makes it completely explicit to gcc-3.x - * that these two variables' stack usage should be + * that these three variables' stack usage should be * combined, not added together. */ union { @@ -5826,6 +7230,7 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm_pit_state2 ps2; struct kvm_pit_config pit_config; } u; +#endif switch (ioctl) { case KVM_SET_TSS_ADDR: @@ -5849,9 +7254,7 @@ set_identity_unlock: case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); break; - case KVM_GET_NR_MMU_PAGES: - r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); - break; +#ifdef CONFIG_KVM_IOAPIC case KVM_CREATE_IRQCHIP: { mutex_lock(&kvm->lock); @@ -5859,6 +7262,15 @@ set_identity_unlock: if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + /* + * Disallow an in-kernel I/O APIC if the VM has protected EOIs, + * i.e. if KVM can't intercept EOIs and thus can't properly + * emulate level-triggered interrupts. + */ + r = -ENOTTY; + if (kvm->arch.has_protected_eoi) + goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; @@ -5873,7 +7285,7 @@ set_identity_unlock: goto create_irqchip_unlock; } - r = kvm_setup_default_irq_routing(kvm); + r = kvm_setup_default_ioapic_and_pic_routing(kvm); if (r) { kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); @@ -5882,6 +7294,7 @@ set_identity_unlock: /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT); create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -5899,6 +7312,9 @@ set_identity_unlock: r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; + r = -ENOENT; + if (!pic_in_kernel(kvm)) + goto create_pit_unlock; r = -ENOMEM; kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags); if (kvm->arch.vpit) @@ -5917,7 +7333,7 @@ set_identity_unlock: } r = -ENXIO; - if (!irqchip_kernel(kvm)) + if (!irqchip_full(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -5941,7 +7357,7 @@ set_identity_unlock: } r = -ENXIO; - if (!irqchip_kernel(kvm)) + if (!irqchip_full(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); set_irqchip_out: @@ -6014,11 +7430,15 @@ set_pit2_out: r = kvm_vm_ioctl_reinject(kvm, &control); break; } +#endif case KVM_SET_BOOT_CPU_ID: r = 0; mutex_lock(&kvm->lock); if (kvm->created_vcpus) r = -EBUSY; + else if (arg > KVM_MAX_VCPU_IDS || + (kvm->arch.max_vcpu_ids && arg > kvm->arch.max_vcpu_ids)) + r = -EINVAL; else kvm->arch.bsp_vcpu_id = arg; mutex_unlock(&kvm->lock); @@ -6052,67 +7472,54 @@ set_pit2_out: r = kvm_xen_hvm_set_attr(kvm, &xha); break; } -#endif - case KVM_SET_CLOCK: { - struct kvm_arch *ka = &kvm->arch; - struct kvm_clock_data user_ns; - u64 now_ns; + case KVM_XEN_HVM_EVTCHN_SEND: { + struct kvm_irq_routing_xen_evtchn uxe; r = -EFAULT; - if (copy_from_user(&user_ns, argp, sizeof(user_ns))) + if (copy_from_user(&uxe, argp, sizeof(uxe))) goto out; + r = kvm_xen_hvm_evtchn_send(kvm, &uxe); + break; + } +#endif + case KVM_SET_CLOCK: + r = kvm_vm_ioctl_set_clock(kvm, argp); + break; + case KVM_GET_CLOCK: + r = kvm_vm_ioctl_get_clock(kvm, argp); + break; + case KVM_SET_TSC_KHZ: { + u32 user_tsc_khz; r = -EINVAL; - if (user_ns.flags) - goto out; + user_tsc_khz = (u32)arg; - r = 0; - /* - * TODO: userspace has to take care of races with VCPU_RUN, so - * kvm_gen_update_masterclock() can be cut down to locked - * pvclock_update_vm_gtod_copy(). - */ - kvm_gen_update_masterclock(kvm); + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) + goto out; - /* - * This pairs with kvm_guest_time_update(): when masterclock is - * in use, we use master_kernel_ns + kvmclock_offset to set - * unsigned 'system_time' so if we use get_kvmclock_ns() (which - * is slightly ahead) here we risk going negative on unsigned - * 'system_time' when 'user_ns.clock' is very small. - */ - spin_lock_irq(&ka->pvclock_gtod_sync_lock); - if (kvm->arch.use_master_clock) - now_ns = ka->master_kernel_ns; - else - now_ns = get_kvmclock_base_ns(); - ka->kvmclock_offset = user_ns.clock - now_ns; - spin_unlock_irq(&ka->pvclock_gtod_sync_lock); + if (user_tsc_khz == 0) + user_tsc_khz = tsc_khz; - kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); - break; + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz); + r = 0; + } + mutex_unlock(&kvm->lock); + goto out; } - case KVM_GET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; - - now_ns = get_kvmclock_ns(kvm); - user_ns.clock = now_ns; - user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0; - memset(&user_ns.pad, 0, sizeof(user_ns.pad)); - - r = -EFAULT; - if (copy_to_user(argp, &user_ns, sizeof(user_ns))) - goto out; - r = 0; - break; + case KVM_GET_TSC_KHZ: { + r = READ_ONCE(kvm->arch.default_tsc_khz); + goto out; } - case KVM_MEMORY_ENCRYPT_OP: { + case KVM_MEMORY_ENCRYPT_OP: r = -ENOTTY; - if (kvm_x86_ops.mem_enc_op) - r = static_call(kvm_x86_mem_enc_op)(kvm, argp); + if (!kvm_x86_ops.mem_enc_ioctl) + goto out; + + r = kvm_x86_call(mem_enc_ioctl)(kvm, argp); break; - } case KVM_MEMORY_ENCRYPT_REG_REGION: { struct kvm_enc_region region; @@ -6121,8 +7528,10 @@ set_pit2_out: goto out; r = -ENOTTY; - if (kvm_x86_ops.mem_enc_reg_region) - r = static_call(kvm_x86_mem_enc_reg_region)(kvm, ®ion); + if (!kvm_x86_ops.mem_enc_register_region) + goto out; + + r = kvm_x86_call(mem_enc_register_region)(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -6133,10 +7542,13 @@ set_pit2_out: goto out; r = -ENOTTY; - if (kvm_x86_ops.mem_enc_unreg_region) - r = static_call(kvm_x86_mem_enc_unreg_region)(kvm, ®ion); + if (!kvm_x86_ops.mem_enc_unregister_region) + goto out; + + r = kvm_x86_call(mem_enc_unregister_region)(kvm, ®ion); break; } +#ifdef CONFIG_KVM_HYPERV case KVM_HYPERV_EVENTFD: { struct kvm_hyperv_eventfd hvevfd; @@ -6146,12 +7558,20 @@ set_pit2_out: r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd); break; } +#endif case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; - case KVM_X86_SET_MSR_FILTER: - r = kvm_vm_ioctl_set_msr_filter(kvm, argp); + case KVM_X86_SET_MSR_FILTER: { + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter filter; + + if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) + return -EFAULT; + + r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); break; + } default: r = -ENOTTY; } @@ -6159,99 +7579,155 @@ out: return r; } -static void kvm_init_msr_list(void) +static void kvm_probe_feature_msr(u32 msr_index) +{ + u64 data; + + if (kvm_get_feature_msr(NULL, msr_index, &data, true)) + return; + + msr_based_features[num_msr_based_features++] = msr_index; +} + +static void kvm_probe_msr_to_save(u32 msr_index) { - struct x86_pmu_capability x86_pmu; u32 dummy[2]; - unsigned i; - BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, - "Please update the fixed PMCs in msrs_to_saved_all[]"); + if (rdmsr_safe(msr_index, &dummy[0], &dummy[1])) + return; + + /* + * Even MSRs that are valid in the host may not be exposed to guests in + * some cases. + */ + switch (msr_index) { + case MSR_IA32_BNDCFGS: + if (!kvm_mpx_supported()) + return; + break; + case MSR_TSC_AUX: + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && + !kvm_cpu_cap_has(X86_FEATURE_RDPID)) + return; + break; + case MSR_IA32_UMWAIT_CONTROL: + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) + return; + break; + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) + return; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) + return; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && + !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) + return; + break; + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (msr_index - MSR_IA32_RTIT_ADDR0_A >= + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)) + return; + break; + case MSR_ARCH_PERFMON_PERFCTR0 ... + MSR_ARCH_PERFMON_PERFCTR0 + KVM_MAX_NR_GP_COUNTERS - 1: + if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... + MSR_ARCH_PERFMON_EVENTSEL0 + KVM_MAX_NR_GP_COUNTERS - 1: + if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= + kvm_pmu_cap.num_counters_gp) + return; + break; + case MSR_ARCH_PERFMON_FIXED_CTR0 ... + MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_MAX_NR_FIXED_COUNTERS - 1: + if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >= + kvm_pmu_cap.num_counters_fixed) + return; + break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET: + if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) + return; + break; + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + return; + break; + case MSR_IA32_TSX_CTRL: + if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)) + return; + break; + case MSR_IA32_XSS: + if (!kvm_caps.supported_xss) + return; + break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + return; + break; + case MSR_IA32_INT_SSP_TAB: + if (!kvm_cpu_cap_has(X86_FEATURE_LM)) + return; + fallthrough; + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK)) + return; + break; + default: + break; + } + + msrs_to_save[num_msrs_to_save++] = msr_index; +} - perf_get_x86_pmu_capability(&x86_pmu); +static void kvm_init_msr_lists(void) +{ + unsigned i; + + BUILD_BUG_ON_MSG(KVM_MAX_NR_FIXED_COUNTERS != 3, + "Please update the fixed PMCs in msrs_to_save_pmu[]"); num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; - for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { - if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) - continue; - - /* - * Even MSRs that are valid in the host may not be exposed - * to the guests in some cases. - */ - switch (msrs_to_save_all[i]) { - case MSR_IA32_BNDCFGS: - if (!kvm_mpx_supported()) - continue; - break; - case MSR_TSC_AUX: - if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && - !kvm_cpu_cap_has(X86_FEATURE_RDPID)) - continue; - break; - case MSR_IA32_UMWAIT_CONTROL: - if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) - continue; - break; - case MSR_IA32_RTIT_CTL: - case MSR_IA32_RTIT_STATUS: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) - continue; - break; - case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) - continue; - break; - case MSR_IA32_RTIT_OUTPUT_BASE: - case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && - !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) - continue; - break; - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: - if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || - msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= - intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) - continue; - break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) - continue; - break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: - if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) - continue; - break; - default: - break; - } + for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) + kvm_probe_msr_to_save(msrs_to_save_base[i]); - msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; + if (enable_pmu) { + for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) + kvm_probe_msr_to_save(msrs_to_save_pmu[i]); } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { - if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) + if (!kvm_x86_call(has_emulated_msr)(NULL, + emulated_msrs_all[i])) continue; emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; } - for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) { - struct kvm_msr_entry msr; - - msr.index = msr_based_features_all[i]; - if (kvm_get_msr_feature(&msr)) - continue; + for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++) + kvm_probe_feature_msr(i); - msr_based_features[num_msr_based_features++] = msr_based_features_all[i]; - } + for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) + kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]); } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -6297,28 +7773,29 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) return handled; } -static void kvm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) +void kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) { - static_call(kvm_x86_set_segment)(vcpu, var, seg); + kvm_x86_call(set_segment)(vcpu, var, seg); } void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - static_call(kvm_x86_get_segment)(vcpu, var, seg); + kvm_x86_call(get_segment)(vcpu, var, seg); } -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.mmu; gpa_t t_gpa; BUG_ON(!mmu_is_nested(vcpu)); /* NPT walks are always user-walks */ access |= PFERR_USER_MASK; - t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception); + t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); return t_gpa; } @@ -6326,50 +7803,48 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); -} -EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, - struct x86_exception *exception) -{ - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; - access |= PFERR_FETCH_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } -EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_write); /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 access, + struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, - exception); + gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, offset, toread); @@ -6392,14 +7867,15 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; /* Inline kvm_read_guest_virt_helper for speed. */ - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK, - exception); - if (unlikely(gpa == UNMAPPED_GVA)) + gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK, + exception); + if (unlikely(gpa == INVALID_GPA)) return X86EMUL_PROPAGATE_FAULT; offset = addr & (PAGE_SIZE-1); @@ -6417,7 +7893,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { - u32 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED @@ -6429,46 +7905,38 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -EXPORT_SYMBOL_GPL(kvm_read_guest_virt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_virt); static int emulator_read_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception, bool system) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = 0; + u64 access = 0; - if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) + if (system) + access |= PFERR_IMPLICIT_ACCESS; + else if (kvm_x86_call(get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, - unsigned long addr, void *val, unsigned int bytes) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); - - return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE; -} - static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 access, + struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, - access, - exception); + gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); if (ret < 0) { @@ -6489,9 +7957,11 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v bool system) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = PFERR_WRITE_MASK; + u64 access = PFERR_WRITE_MASK; - if (!system && static_call(kvm_x86_get_cpl)(vcpu) == 3) + if (system) + access |= PFERR_IMPLICIT_ACCESS; + else if (kvm_x86_call(get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, @@ -6502,34 +7972,46 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { /* kvm_write_guest_virt_system can pull in tons of pages. */ - vcpu->arch.l1tf_flush_l1d = true; + kvm_request_l1tf_flush_l1d(); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } -EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_virt_system); + +static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) +{ + return kvm_x86_call(check_emulate_instruction)(vcpu, emul_type, + insn, insn_len); +} int handle_ud(struct kvm_vcpu *vcpu) { static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX }; + int fep_flags = READ_ONCE(force_emulation_prefix); int emul_type = EMULTYPE_TRAP_UD; char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; + int r; - if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0))) + r = kvm_check_emulate_insn(vcpu, emul_type, NULL, 0); + if (r != X86EMUL_CONTINUE) return 1; - if (force_emulation_prefix && + if (fep_flags && kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) { + if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF) + kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF); kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig)); emul_type = EMULTYPE_TRAP_UD_FORCED; } return kvm_emulate_instruction(vcpu, emul_type); } -EXPORT_SYMBOL_GPL(handle_ud); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_ud); static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t gpa, bool write) @@ -6550,26 +8032,27 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { - u32 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) - | (write ? PFERR_WRITE_MASK : 0); + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + u64 access = ((kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) + | (write ? PFERR_WRITE_MASK : 0); /* * currently PKRU is only applied to ept enabled guest so * there is no pkey in EPT page table for L1 guest or EPT * shadow page table for L2 guest. */ - if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.mmio_access, 0, access)) { + if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) || + !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.mmio_access, 0, access))) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); return 1; } - *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); - if (*gpa == UNMAPPED_GVA) + if (*gpa == INVALID_GPA) return -1; return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); @@ -6749,7 +8232,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, return rc; if (!vcpu->mmio_nr_fragments) - return rc; + return X86EMUL_CONTINUE; gpa = vcpu->mmio_fragments[0].gpa; @@ -6784,15 +8267,8 @@ static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, exception, &write_emultor); } -#define CMPXCHG_TYPE(t, ptr, old, new) \ - (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) - -#ifdef CONFIG_X86_64 -# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) -#else -# define CMPXCHG64(ptr, old, new) \ - (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) -#endif +#define emulator_try_cmpxchg_user(t, ptr, old, new) \ + (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t)) static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, @@ -6801,12 +8277,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, unsigned int bytes, struct x86_exception *exception) { - struct kvm_host_map map; struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u64 page_line_mask; + unsigned long hva; gpa_t gpa; - char *kaddr; - bool exchanged; + int r; /* guests cmpxchg8b have to be emulated atomically */ if (bytes > 8 || (bytes & (bytes - 1))) @@ -6814,7 +8289,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - if (gpa == UNMAPPED_GVA || + if (gpa == INVALID_GPA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; @@ -6830,31 +8305,42 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask)) goto emul_write; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map)) + hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) goto emul_write; - kaddr = map.hva + offset_in_page(gpa); + hva += offset_in_page(gpa); switch (bytes) { case 1: - exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); + r = emulator_try_cmpxchg_user(u8, hva, old, new); break; case 2: - exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); + r = emulator_try_cmpxchg_user(u16, hva, old, new); break; case 4: - exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); + r = emulator_try_cmpxchg_user(u32, hva, old, new); break; case 8: - exchanged = CMPXCHG64(kaddr, old, new); + r = emulator_try_cmpxchg_user(u64, hva, old, new); break; default: BUG(); } - kvm_vcpu_unmap(vcpu, &map, true); + if (r < 0) + return X86EMUL_UNHANDLEABLE; - if (!exchanged) + /* + * Mark the page dirty _before_ checking whether or not the CMPXCHG was + * successful, as the old value is written back on failure. Note, for + * live migration, this is unnecessarily conservative as CMPXCHG writes + * back the original value and the access is atomic, but KVM's ABI is + * that all writes are dirty logged, regardless of the value written. + */ + kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa)); + + if (r) return X86EMUL_CMPXCHG_FAILED; kvm_page_track_write(vcpu, gpa, new, bytes); @@ -6862,43 +8348,52 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; emul_write: - printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); + pr_warn_once("emulating exchange as write\n"); return emulator_write_emulated(ctxt, addr, new, bytes, exception); } -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) +static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *data, + unsigned int count, bool in) { - int r = 0, i; + unsigned i; + int r; - for (i = 0; i < vcpu->arch.pio.count; i++) { - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + WARN_ON_ONCE(vcpu->arch.pio.count); + for (i = 0; i < count; i++) { + if (in) + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data); else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); - if (r) + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data); + + if (r) { + if (i == 0) + goto userspace_io; + + /* + * Userspace must have unregistered the device while PIO + * was running. Drop writes / read as 0. + */ + if (in) + memset(data, 0, size * (count - i)); break; - pd += vcpu->arch.pio.size; + } + + data += size; } - return r; -} + return 1; -static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, - unsigned int count, bool in) -{ +userspace_io: vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; - vcpu->arch.pio.count = count; + vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - vcpu->arch.pio.count = 0; - return 1; - } + if (in) + memset(vcpu->arch.pio_data, 0, size * count); + else + memcpy(vcpu->arch.pio_data, data, size * count); vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -6906,46 +8401,53 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port; - return 0; } static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, unsigned int count) + unsigned short port, void *val, unsigned int count) { - int ret; - - if (vcpu->arch.pio.count) - goto data_avail; - - memset(vcpu->arch.pio_data, 0, size * count); + int r = emulator_pio_in_out(vcpu, size, port, val, count, true); + if (r) + trace_kvm_pio(KVM_PIO_IN, port, size, count, val); - ret = emulator_pio_in_out(vcpu, size, port, val, count, true); - if (ret) { -data_avail: - memcpy(val, vcpu->arch.pio_data, size * count); - trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); - vcpu->arch.pio.count = 0; - return 1; - } + return r; +} - return 0; +static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) +{ + int size = vcpu->arch.pio.size; + unsigned int count = vcpu->arch.pio.count; + memcpy(val, vcpu->arch.pio_data, size * count); + trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data); + vcpu->arch.pio.count = 0; } static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, int size, unsigned short port, void *val, unsigned int count) { - return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + if (vcpu->arch.pio.count) { + /* + * Complete a previous iteration that required userspace I/O. + * Note, @count isn't guaranteed to match pio.count as userspace + * can modify ECX before rerunning the vCPU. Ignore any such + * shenanigans as KVM doesn't support modifying the rep count, + * and the emulator ensures @count doesn't overflow the buffer. + */ + complete_emulator_pio_in(vcpu, val); + return 1; + } + return emulator_pio_in(vcpu, size, port, val, count); } static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port, const void *val, unsigned int count) { - memcpy(vcpu->arch.pio_data, val, size * count); - trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); + trace_kvm_pio(KVM_PIO_OUT, port, size, count, val); return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } @@ -6958,7 +8460,7 @@ static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { - return static_call(kvm_x86_get_segment_base)(vcpu, seg); + return kvm_x86_call(get_segment_base)(vcpu, seg); } static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) @@ -6971,12 +8473,11 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; - if (static_call(kvm_x86_has_wbinvd_exit)()) { + if (kvm_x86_call(has_wbinvd_exit)()) { int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, - wbinvd_ipi, NULL, 1); + wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask); put_cpu(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); } else @@ -6989,7 +8490,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) kvm_emulate_wbinvd_noskip(vcpu); return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wbinvd); @@ -6998,10 +8499,9 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt)); } -static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest) +static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr) { - kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); + return kvm_get_dr(emul_to_vcpu(ctxt), dr); } static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, @@ -7076,27 +8576,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { - return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt)); + return kvm_x86_call(get_cpl)(emul_to_vcpu(ctxt)); } static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt); + kvm_x86_call(get_gdt)(emul_to_vcpu(ctxt), dt); } static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt); + kvm_x86_call(get_idt)(emul_to_vcpu(ctxt), dt); } static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt); + kvm_x86_call(set_gdt)(emul_to_vcpu(ctxt), dt); } static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt); + kvm_x86_call(set_idt)(emul_to_vcpu(ctxt), dt); } static unsigned long emulator_get_cached_segment_base( @@ -7171,56 +8671,70 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, return; } -static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, - u32 msr_index, u64 *pdata) +static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 *pdata) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_get_msr(vcpu, msr_index, pdata); + r = kvm_emulate_msr_read(vcpu, msr_index, pdata); + if (r < 0) + return X86EMUL_UNHANDLEABLE; - if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) { - /* Bounce to user space */ - return X86EMUL_IO_NEEDED; + if (r) { + if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r)) + return X86EMUL_IO_NEEDED; + + trace_kvm_msr_read_ex(msr_index); + return X86EMUL_PROPAGATE_FAULT; } - return r; + trace_kvm_msr_read(msr_index, *pdata); + return X86EMUL_CONTINUE; } -static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, - u32 msr_index, u64 data) +static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 data) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - r = kvm_set_msr(vcpu, msr_index, data); - - if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) { - /* Bounce to user space */ - return X86EMUL_IO_NEEDED; - } + r = kvm_emulate_msr_write(vcpu, msr_index, data); + if (r < 0) + return X86EMUL_UNHANDLEABLE; - return r; -} + if (r) { + if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_msr_access, r)) + return X86EMUL_IO_NEEDED; -static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + trace_kvm_msr_write_ex(msr_index, data); + return X86EMUL_PROPAGATE_FAULT; + } - return vcpu->arch.smbase; + trace_kvm_msr_write(msr_index, data); + return X86EMUL_CONTINUE; } -static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) +static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 *pdata) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + /* + * Treat emulator accesses to the current shadow stack pointer as host- + * initiated, as they aren't true MSR accesses (SSP is a "just a reg"), + * and this API is used only for implicit accesses, i.e. not RDMSR, and + * so the index is fully KVM-controlled. + */ + if (unlikely(msr_index == MSR_KVM_INTERNAL_GUEST_SSP)) + return kvm_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); - vcpu->arch.smbase = smbase; + return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); } -static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, - u32 pmc) +static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc); + return kvm_pmu_check_rdpmc_early(emul_to_vcpu(ctxt), pmc); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -7238,8 +8752,8 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage, - &ctxt->exception); + return kvm_x86_call(check_intercept)(emul_to_vcpu(ctxt), info, stage, + &ctxt->exception); } static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, @@ -7249,19 +8763,24 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only); } -static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) +static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) { - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM); + return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); } -static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) +static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) { - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); + return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); } -static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) +static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); +} + +static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt) { - return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); + return guest_cpuid_is_intel_compatible(emul_to_vcpu(ctxt)); } static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) @@ -7276,30 +8795,33 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) { - static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked); + kvm_x86_call(set_nmi_mask)(emul_to_vcpu(ctxt), masked); } -static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) +static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt) { - return emul_to_vcpu(ctxt)->arch.hflags; + return is_smm(emul_to_vcpu(ctxt)); } -static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt) +#ifndef CONFIG_KVM_SMM +static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - kvm_smm_changed(vcpu, false); + WARN_ON_ONCE(1); + return X86EMUL_UNHANDLEABLE; } +#endif -static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt, - const char *smstate) +static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) { - return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); } -static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) +static int emulator_get_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr) { - kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); + if (index != XCR_XFEATURE_ENABLED_MASK) + return 1; + *xcr = emul_to_vcpu(ctxt)->arch.xcr0; + return 0; } static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) @@ -7307,12 +8829,36 @@ static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); } +static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt) +{ + struct kvm *kvm = emul_to_vcpu(ctxt)->kvm; + + if (!kvm->vm_bugged) + kvm_vm_bugged(kvm); +} + +static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt, + gva_t addr, unsigned int flags) +{ + if (!kvm_x86_ops.get_untagged_addr) + return addr; + + return kvm_x86_call(get_untagged_addr)(emul_to_vcpu(ctxt), + addr, flags); +} + +static bool emulator_is_canonical_addr(struct x86_emulate_ctxt *ctxt, + gva_t addr, unsigned int flags) +{ + return !is_noncanonical_address(addr, emul_to_vcpu(ctxt), flags); +} + static const struct x86_emulate_ops emulate_ops = { + .vm_bugged = emulator_vm_bugged, .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, .read_std = emulator_read_std, .write_std = emulator_write_std, - .read_phys = kvm_read_guest_phys_system, .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, @@ -7332,31 +8878,33 @@ static const struct x86_emulate_ops emulate_ops = { .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, - .get_smbase = emulator_get_smbase, - .set_smbase = emulator_set_smbase, - .set_msr = emulator_set_msr, + .set_msr_with_filter = emulator_set_msr_with_filter, + .get_msr_with_filter = emulator_get_msr_with_filter, .get_msr = emulator_get_msr, - .check_pmc = emulator_check_pmc, + .check_rdpmc_early = emulator_check_rdpmc_early, .read_pmc = emulator_read_pmc, .halt = emulator_halt, .wbinvd = emulator_wbinvd, .fix_hypercall = emulator_fix_hypercall, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, - .guest_has_long_mode = emulator_guest_has_long_mode, .guest_has_movbe = emulator_guest_has_movbe, .guest_has_fxsr = emulator_guest_has_fxsr, + .guest_has_rdpid = emulator_guest_has_rdpid, + .guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible, .set_nmi_mask = emulator_set_nmi_mask, - .get_hflags = emulator_get_hflags, - .exiting_smm = emulator_exiting_smm, + .is_smm = emulator_is_smm, .leave_smm = emulator_leave_smm, .triple_fault = emulator_triple_fault, + .get_xcr = emulator_get_xcr, .set_xcr = emulator_set_xcr, + .get_untagged_addr = emulator_get_untagged_addr, + .is_canonical_addr = emulator_is_canonical_addr, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); + u32 int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. So, if the last instruction, be it emulated or @@ -7367,24 +8915,23 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) if (int_shadow & mask) mask = 0; if (unlikely(int_shadow || mask)) { - static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask); + kvm_x86_call(set_interrupt_shadow)(vcpu, mask); if (!mask) kvm_make_request(KVM_REQ_EVENT, vcpu); } } -static bool inject_emulated_exception(struct kvm_vcpu *vcpu) +static void inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; - if (ctxt->exception.vector == PF_VECTOR) - return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); - if (ctxt->exception.error_code_valid) + if (ctxt->exception.vector == PF_VECTOR) + kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); + else if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, ctxt->exception.error_code); else kvm_queue_exception(vcpu, ctxt->exception.vector); - return false; } static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) @@ -7393,7 +8940,7 @@ static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT); if (!ctxt) { - pr_err("kvm: failed to allocate vcpu's emulator\n"); + pr_err("failed to allocate vcpu's emulator\n"); return NULL; } @@ -7409,7 +8956,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int cs_db, cs_l; - static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); + kvm_x86_call(get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); ctxt->gpa_available = false; ctxt->eflags = kvm_get_rflags(vcpu); @@ -7421,10 +8968,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); - BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); - BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); - ctxt->interruptibility = 0; ctxt->have_exception = false; ctxt->exception.vector = -1; @@ -7454,31 +8997,113 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) kvm_set_rflags(vcpu, ctxt->eflags); } } -EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_realmode_interrupt); -static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) +static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, + u8 ndata, u8 *insn_bytes, u8 insn_size) { - struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; - u32 insn_size = ctxt->fetch.end - ctxt->fetch.data; struct kvm_run *run = vcpu->run; + u64 info[5]; + u8 info_start; + + /* + * Zero the whole array used to retrieve the exit info, as casting to + * u32 for select entries will leave some chunks uninitialized. + */ + memset(&info, 0, sizeof(info)); + + kvm_x86_call(get_exit_info)(vcpu, (u32 *)&info[0], &info[1], &info[2], + (u32 *)&info[3], (u32 *)&info[4]); run->exit_reason = KVM_EXIT_INTERNAL_ERROR; run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION; - run->emulation_failure.ndata = 0; + + /* + * There's currently space for 13 entries, but 5 are used for the exit + * reason and info. Restrict to 4 to reduce the maintenance burden + * when expanding kvm_run.emulation_failure in the future. + */ + if (WARN_ON_ONCE(ndata > 4)) + ndata = 4; + + /* Always include the flags as a 'data' entry. */ + info_start = 1; run->emulation_failure.flags = 0; if (insn_size) { - run->emulation_failure.ndata = 3; + BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) + + sizeof(run->emulation_failure.insn_bytes) != 16)); + info_start += 2; run->emulation_failure.flags |= KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES; run->emulation_failure.insn_size = insn_size; memset(run->emulation_failure.insn_bytes, 0x90, sizeof(run->emulation_failure.insn_bytes)); - memcpy(run->emulation_failure.insn_bytes, - ctxt->fetch.data, insn_size); + memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size); } + + memcpy(&run->internal.data[info_start], info, sizeof(info)); + memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data, + ndata * sizeof(data[0])); + + run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata; } +static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + + prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data, + ctxt->fetch.end - ctxt->fetch.data); +} + +void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, + u8 ndata) +{ + prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_prepare_emulation_failure_exit); + +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) +{ + __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_emulation_failure_exit); + +void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + u32 reason, intr_info, error_code; + struct kvm_run *run = vcpu->run; + u64 info1, info2; + int ndata = 0; + + kvm_x86_call(get_exit_info)(vcpu, &reason, &info1, &info2, + &intr_info, &error_code); + + run->internal.data[ndata++] = info2; + run->internal.data[ndata++] = reason; + run->internal.data[ndata++] = info1; + run->internal.data[ndata++] = gpa; + run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu; + + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + run->internal.ndata = ndata; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_event_vectoring_exit); + +void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason) +{ + vcpu_unimpl(vcpu, "unexpected exit reason 0x%llx\n", exit_reason); + + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_unexpected_reason_exit); + static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { struct kvm *kvm = vcpu->kvm; @@ -7493,164 +9118,61 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) if (kvm->arch.exit_on_emulation_error || (emulation_type & EMULTYPE_SKIP)) { - prepare_emulation_failure_exit(vcpu); + prepare_emulation_ctxt_failure_exit(vcpu); return 0; } kvm_queue_exception(vcpu, UD_VECTOR); - if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + if (!is_guest_mode(vcpu) && kvm_x86_call(get_cpl)(vcpu) == 0) { + prepare_emulation_ctxt_failure_exit(vcpu); return 0; } return 1; } -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - bool write_fault_to_shadow_pgtable, - int emulation_type) +static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu, + gpa_t cr2_or_gpa, + int emulation_type) { - gpa_t gpa = cr2_or_gpa; - kvm_pfn_t pfn; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) - return false; - - if (!vcpu->arch.mmu->direct_map) { - /* - * Write permission should be allowed since only - * write access need to be emulated. - */ - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - - /* - * If the mapping is invalid in guest, let cpu retry - * it to generate fault. - */ - if (gpa == UNMAPPED_GVA) - return true; - } - /* - * Do not retry the unhandleable instruction if it faults on the - * readonly host memory, otherwise it will goto a infinite loop: - * retry instruction -> write #PF -> emulation fail -> retry - * instruction -> ... + * If the failed instruction faulted on an access to page tables that + * are used to translate any part of the instruction, KVM can't resolve + * the issue by unprotecting the gfn, as zapping the shadow page will + * result in the instruction taking a !PRESENT page fault and thus put + * the vCPU into an infinite loop of page faults. E.g. KVM will create + * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and + * then zap the SPTE to unprotect the gfn, and then do it all over + * again. Report the error to userspace. */ - pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); - - /* - * If the instruction failed on the error pfn, it can not be fixed, - * report the error to userspace. - */ - if (is_error_noslot_pfn(pfn)) + if (emulation_type & EMULTYPE_WRITE_PF_TO_SP) return false; - kvm_release_pfn_clean(pfn); - - /* The instructions are well-emulated on direct mmu. */ - if (vcpu->arch.mmu->direct_map) { - unsigned int indirect_shadow_pages; - - write_lock(&vcpu->kvm->mmu_lock); - indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; - write_unlock(&vcpu->kvm->mmu_lock); - - if (indirect_shadow_pages) - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); - - return true; - } - /* - * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-enter the - * guest to let CPU execute the instruction. + * If emulation may have been triggered by a write to a shadowed page + * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the + * guest to let the CPU re-execute the instruction in the hope that the + * CPU can cleanly execute the instruction that KVM failed to emulate. */ - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true); /* - * If the access faults on its page table, it can not - * be fixed by unprotecting shadow page and it should - * be reported to userspace. + * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible + * all SPTEs were already zapped by a different task. The alternative + * is to report the error to userspace and likely terminate the guest, + * and the last_retry_{eip,addr} checks will prevent retrying the page + * fault indefinitely, i.e. there's nothing to lose by retrying. */ - return !write_fault_to_shadow_pgtable; -} - -static bool retry_instruction(struct x86_emulate_ctxt *ctxt, - gpa_t cr2_or_gpa, int emulation_type) -{ - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; - - last_retry_eip = vcpu->arch.last_retry_eip; - last_retry_addr = vcpu->arch.last_retry_addr; - - /* - * If the emulation is caused by #PF and it is non-page_table - * writing instruction, it means the VM-EXIT is caused by shadow - * page protected, we can zap the shadow page and retry this - * instruction directly. - * - * Note: if the guest uses a non-page-table modifying instruction - * on the PDE that points to the instruction, then we will unmap - * the instruction and go to an infinite loop. So, we cache the - * last retried eip and the last fault address, if we meet the eip - * and the address again, we can break out of the potential infinite - * loop. - */ - vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - - if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) - return false; - - if (WARN_ON_ONCE(is_guest_mode(vcpu)) || - WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) - return false; - - if (x86_page_table_writing_insn(ctxt)) - return false; - - if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) - return false; - - vcpu->arch.last_retry_eip = ctxt->eip; - vcpu->arch.last_retry_addr = cr2_or_gpa; - - if (!vcpu->arch.mmu->direct_map) - gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); - - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); - return true; } static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); -static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) -{ - trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); - - if (entering_smm) { - vcpu->arch.hflags |= HF_SMM_MASK; - } else { - vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); - - /* Process a latched INIT or SMI, if any. */ - kvm_make_request(KVM_REQ_EVENT, vcpu); - } - - kvm_mmu_reset_context(vcpu); -} - static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db) { @@ -7683,13 +9205,15 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); + unsigned long rflags = kvm_x86_call(get_rflags)(vcpu); int r; - r = static_call(kvm_x86_skip_emulated_instruction)(vcpu); + r = kvm_x86_call(skip_emulated_instruction)(vcpu); if (unlikely(!r)) return 0; + kvm_pmu_instruction_retired(vcpu); + /* * rflags is the old, "raw" value of the flags. The new value has * not been saved yet. @@ -7702,10 +9226,46 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) r = kvm_vcpu_do_singlestep(vcpu); return r; } -EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_skip_emulated_instruction); -static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) +static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu) { + if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF) + return true; + + /* + * Intel compatible CPUs inhibit code #DBs when MOV/POP SS blocking is + * active, but AMD compatible CPUs do not. + */ + if (!guest_cpuid_is_intel_compatible(vcpu)) + return false; + + return kvm_x86_call(get_interrupt_shadow)(vcpu) & KVM_X86_SHADOW_INT_MOV_SS; +} + +static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, + int emulation_type, int *r) +{ + WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE); + + /* + * Do not check for code breakpoints if hardware has already done the + * checks, as inferred from the emulation type. On NO_DECODE and SKIP, + * the instruction has passed all exception checks, and all intercepted + * exceptions that trigger emulation have lower priority than code + * breakpoints, i.e. the fact that the intercepted exception occurred + * means any code breakpoints have already been serviced. + * + * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as + * hardware has checked the RIP of the magic prefix, but not the RIP of + * the instruction being emulated. The intent of forced emulation is + * to behave as if KVM intercepted the instruction without an exception + * and without a prefix. + */ + if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP | + EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF)) + return false; + if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { struct kvm_run *kvm_run = vcpu->run; @@ -7725,7 +9285,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) } if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && - !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { + !kvm_is_code_breakpoint_inhibited(vcpu)) { unsigned long eip = kvm_get_linear_rip(vcpu); u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.dr7, @@ -7772,26 +9332,41 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) return false; } +static bool is_soft_int_instruction(struct x86_emulate_ctxt *ctxt, + int emulation_type) +{ + u8 vector = EMULTYPE_GET_SOFT_INT_VECTOR(emulation_type); + + switch (ctxt->b) { + case 0xcc: + return vector == BP_VECTOR; + case 0xcd: + return vector == ctxt->src.val; + case 0xce: + return vector == OF_VECTOR; + default: + return false; + } +} + /* - * Decode to be emulated instruction. Return EMULATION_OK if success. + * Decode an instruction for emulation. The caller is responsible for handling + * code breakpoints. Note, manually detecting code breakpoints is unnecessary + * (and wrong) when emulating on an intercepted fault-like exception[*], as + * code breakpoints have higher priority and thus have already been done by + * hardware. + * + * [*] Except #MC, which is higher priority, but KVM should never emulate in + * response to a machine check. */ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, void *insn, int insn_len) { - int r = EMULATION_OK; struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + int r; init_emulate_ctxt(vcpu); - /* - * We will reenter on the same instruction since we do not set - * complete_userspace_io. This does not handle watchpoints yet, - * those would be handled in the emulate_ops. - */ - if (!(emulation_type & EMULTYPE_SKIP) && - kvm_vcpu_check_breakpoint(vcpu, &r)) - return r; - r = x86_decode_insn(ctxt, insn, insn_len, emulation_type); trace_kvm_emulate_insn_start(vcpu); @@ -7799,7 +9374,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, return r; } -EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(x86_decode_emulated_instruction); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len) @@ -7807,23 +9382,43 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int r; struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; - bool write_fault_to_spt; - if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len))) - return 1; + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && + (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))) + emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF; - vcpu->arch.l1tf_flush_l1d = true; + r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len); + if (r != X86EMUL_CONTINUE) { + if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) + return 1; - /* - * Clear write_fault_to_shadow_pgtable here to ensure it is - * never reused. - */ - write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; - vcpu->arch.write_fault_to_shadow_pgtable = false; + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) + return 1; + + if (r == X86EMUL_UNHANDLEABLE_VECTORING) { + kvm_prepare_event_vectoring_exit(vcpu, cr2_or_gpa); + return 0; + } + + WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE); + return handle_emulation_failure(vcpu, emulation_type); + } + + kvm_request_l1tf_flush_l1d(); if (!(emulation_type & EMULTYPE_NO_DECODE)) { kvm_clear_exception_queue(vcpu); + /* + * Return immediately if RIP hits a code breakpoint, such #DBs + * are fault-like and are higher priority than any faults on + * the code fetch itself. + */ + if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r)) + return r; + r = x86_decode_emulated_instruction(vcpu, emulation_type, insn, insn_len); if (r != EMULATION_OK) { @@ -7832,11 +9427,12 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - if (reexecute_instruction(vcpu, cr2_or_gpa, - write_fault_to_spt, - emulation_type)) + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) return 1; - if (ctxt->have_exception) { + + if (ctxt->have_exception && + !(emulation_type & EMULTYPE_SKIP)) { /* * #UD should result in just EMULATION_FAILED, and trap-like * exception should not be encountered during decode. @@ -7857,18 +9453,41 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } /* - * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks - * for kvm_skip_emulated_instruction(). The caller is responsible for - * updating interruptibility state and injecting single-step #DBs. + * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for + * use *only* by vendor callbacks for kvm_skip_emulated_instruction(). + * The caller is responsible for updating interruptibility state and + * injecting single-step #DBs. */ if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, ctxt->_eip); + if (emulation_type & EMULTYPE_SKIP_SOFT_INT && + !is_soft_int_instruction(ctxt, emulation_type)) + return 0; + + if (ctxt->mode != X86EMUL_MODE_PROT64) + ctxt->eip = (u32)ctxt->_eip; + else + ctxt->eip = ctxt->_eip; + + if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) { + r = 1; + goto writeback; + } + + kvm_rip_write(vcpu, ctxt->eip); if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); return 1; } - if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) + /* + * If emulation was caused by a write-protection #PF on a non-page_table + * writing instruction, try to unprotect the gfn, i.e. zap shadow pages, + * and retry the instruction, as the vCPU is likely no longer using the + * gfn as a page table. + */ + if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) && + !x86_page_table_writing_insn(ctxt) && + kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa)) return 1; /* this is needed for vmware backdoor interface to work since it @@ -7884,7 +9503,7 @@ restart: ctxt->exception.address = cr2_or_gpa; /* With shadow page tables, cr2 contains a GVA or nGPA. */ - if (vcpu->arch.mmu->direct_map) { + if (vcpu->arch.mmu->root_role.direct) { ctxt->gpa_available = true; ctxt->gpa_val = cr2_or_gpa; } @@ -7893,23 +9512,31 @@ restart: ctxt->exception.address = 0; } - r = x86_emulate_insn(ctxt); + /* + * Check L1's instruction intercepts when emulating instructions for + * L2, unless KVM is re-emulating a previously decoded instruction, + * e.g. to complete userspace I/O, in which case KVM has already + * checked the intercepts. + */ + r = x86_emulate_insn(ctxt, is_guest_mode(vcpu) && + !(emulation_type & EMULTYPE_NO_DECODE)); if (r == EMULATION_INTERCEPTED) return 1; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt, - emulation_type)) + if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa, + emulation_type)) return 1; return handle_emulation_failure(vcpu, emulation_type); } if (ctxt->have_exception) { + WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write); + vcpu->mmio_needed = false; r = 1; - if (inject_emulated_exception(vcpu)) - return r; + inject_emulated_exception(vcpu); } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) { /* FIXME: return into emulator if single-stepping. */ @@ -7926,22 +9553,34 @@ restart: writeback = false; r = 0; vcpu->arch.complete_userspace_io = complete_emulated_mmio; + } else if (vcpu->arch.complete_userspace_io) { + writeback = false; + r = 0; } else if (r == EMULATION_RESTART) goto restart; else r = 1; +writeback: if (writeback) { - unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); + unsigned long rflags = kvm_x86_call(get_rflags)(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + + /* + * Note, EXCPT_DB is assumed to be fault-like as the emulator + * only supports code breakpoints and general detect #DB, both + * of which are fault-like. + */ if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { + kvm_pmu_instruction_retired(vcpu); + if (ctxt->is_branch) + kvm_pmu_branch_retired(vcpu); kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); - if (kvm_x86_ops.update_emulated_instruction) - static_call(kvm_x86_update_emulated_instruction)(vcpu); + kvm_x86_call(update_emulated_instruction)(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -7963,14 +9602,14 @@ int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); } -EXPORT_SYMBOL_GPL(kvm_emulate_instruction); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_instruction); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void *insn, int insn_len) { return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len); } -EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_instruction_from_buffer); static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu) { @@ -7982,7 +9621,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu) { vcpu->arch.pio.count = 0; - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) return 1; return kvm_skip_emulated_instruction(vcpu); @@ -8007,7 +9646,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, complete_fast_pio_out_port_0x7e; kvm_skip_emulated_instruction(vcpu); } else { - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_out; } return 0; @@ -8020,7 +9659,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* We should only ever be called with arch.pio.count equal to 1 */ BUG_ON(vcpu->arch.pio.count != 1); - if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) { + if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) { vcpu->arch.pio.count = 0; return 1; } @@ -8028,11 +9667,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* For size less than 4 we merge, else we zero extend */ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; - /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform - * the copy and tracing - */ - emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); + complete_emulator_pio_in(vcpu, &val); kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); @@ -8053,7 +9688,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, return ret; } - vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); + vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_in; return 0; @@ -8069,7 +9704,7 @@ int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in) ret = kvm_fast_pio_out(vcpu, size, port); return ret && kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_fast_pio); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fast_pio); static int kvmclock_cpu_down_prep(unsigned int cpu) { @@ -8080,11 +9715,13 @@ static int kvmclock_cpu_down_prep(unsigned int cpu) static void tsc_khz_changed(void *data) { struct cpufreq_freqs *freq = data; - unsigned long khz = 0; + unsigned long khz; + + WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)); if (data) khz = freq->new; - else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + else khz = cpufreq_quick_get(raw_smp_processor_id()); if (!khz) khz = tsc_khz; @@ -8095,34 +9732,28 @@ static void tsc_khz_changed(void *data) static void kvm_hyperv_tsc_notifier(void) { struct kvm *kvm; - struct kvm_vcpu *vcpu; int cpu; - unsigned long flags; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ hyperv_stop_tsc_emulation(); /* TSC frequency always matches when on Hyper-V */ - for_each_present_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; - kvm_max_guest_tsc_khz = tsc_khz; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + for_each_present_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + } + kvm_caps.max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { - struct kvm_arch *ka = &kvm->arch; - - spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); + __kvm_start_pvclock_update(kvm); pvclock_update_vm_gtod_copy(kvm); - spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); - - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); + kvm_end_pvclock_update(kvm); } + mutex_unlock(&kvm_lock); } #endif @@ -8131,7 +9762,8 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu) { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i, send_ipi = 0; + int send_ipi = 0; + unsigned long i; /* * We allow guests to temporarily run on slowing clocks, @@ -8232,81 +9864,36 @@ static int kvmclock_cpu_online(unsigned int cpu) static void kvm_timer_init(void) { - max_tsc_khz = tsc_khz; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { -#ifdef CONFIG_CPU_FREQ - struct cpufreq_policy *policy; - int cpu; - - cpu = get_cpu(); - policy = cpufreq_cpu_get(cpu); - if (policy) { - if (policy->cpuinfo.max_freq) - max_tsc_khz = policy->cpuinfo.max_freq; - cpufreq_cpu_put(policy); + max_tsc_khz = tsc_khz; + + if (IS_ENABLED(CONFIG_CPU_FREQ)) { + struct cpufreq_policy *policy; + int cpu; + + cpu = get_cpu(); + policy = cpufreq_cpu_get(cpu); + if (policy) { + if (policy->cpuinfo.max_freq) + max_tsc_khz = policy->cpuinfo.max_freq; + cpufreq_cpu_put(policy); + } + put_cpu(); } - put_cpu(); -#endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - } - - cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", - kvmclock_cpu_online, kvmclock_cpu_down_prep); -} - -DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); -EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu); - -int kvm_is_in_guest(void) -{ - return __this_cpu_read(current_vcpu) != NULL; -} - -static int kvm_is_user_mode(void) -{ - int user_mode = 3; - - if (__this_cpu_read(current_vcpu)) - user_mode = static_call(kvm_x86_get_cpl)(__this_cpu_read(current_vcpu)); - return user_mode != 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - unsigned long ip = 0; - - if (__this_cpu_read(current_vcpu)) - ip = kvm_rip_read(__this_cpu_read(current_vcpu)); - - return ip; -} - -static void kvm_handle_intel_pt_intr(void) -{ - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); - - kvm_make_request(KVM_REQ_PMI, vcpu); - __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, - (unsigned long *)&vcpu->arch.pmu.global_status); + cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", + kvmclock_cpu_online, kvmclock_cpu_down_prep); + } } -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, - .handle_intel_pt_intr = kvm_handle_intel_pt_intr, -}; - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i; + unsigned long i; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) @@ -8357,26 +9944,59 @@ static struct notifier_block pvclock_gtod_notifier = { }; #endif -int kvm_arch_init(void *opaque) +static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) { - struct kvm_x86_init_ops *ops = opaque; - int r; + memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); - if (kvm_x86_ops.hardware_enable) { - printk(KERN_ERR "kvm: already loaded the other module\n"); - r = -EEXIST; - goto out; - } +#define __KVM_X86_OP(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func); +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ + (void *)__static_call_return0); +#include <asm/kvm-x86-ops.h> +#undef __KVM_X86_OP - if (!ops->cpu_has_kvm_support()) { - pr_err_ratelimited("kvm: no hardware support\n"); - r = -EOPNOTSUPP; - goto out; - } - if (ops->disabled_by_bios()) { - pr_err_ratelimited("kvm: disabled by bios\n"); - r = -EOPNOTSUPP; - goto out; + kvm_pmu_ops_update(ops->pmu_ops); +} + +static int kvm_x86_check_processor_compatibility(void) +{ + int cpu = smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* + * Compatibility checks are done when loading KVM and when enabling + * hardware, e.g. during CPU hotplug, to ensure all online CPUs are + * compatible, i.e. KVM should never perform a compatibility check on + * an offline CPU. + */ + WARN_ON(!cpu_online(cpu)); + + if (__cr4_reserved_bits(cpu_has, c) != + __cr4_reserved_bits(cpu_has, &boot_cpu_data)) + return -EIO; + + return kvm_x86_call(check_processor_compatibility)(); +} + +static void kvm_x86_check_cpu_compat(void *ret) +{ + *(int *)ret = kvm_x86_check_processor_compatibility(); +} + +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +{ + u64 host_pat; + int r, cpu; + + guard(mutex)(&vendor_module_lock); + + if (kvm_x86_ops.enable_virtualization_cpu) { + pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); + return -EEXIST; } /* @@ -8385,48 +10005,101 @@ int kvm_arch_init(void *opaque) * vCPU's FPU state as a fxregs_state struct. */ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { - printk(KERN_ERR "kvm: inadequate fpu\n"); - r = -EOPNOTSUPP; - goto out; + pr_err("inadequate fpu\n"); + return -EOPNOTSUPP; } - r = -ENOMEM; - x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), - __alignof__(struct fpu), SLAB_ACCOUNT, - NULL); - if (!x86_fpu_cache) { - printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n"); - goto out; + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); + return -EOPNOTSUPP; } + /* + * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes + * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something + * other than WB. Note, EPT doesn't utilize the PAT, but don't bother + * with an exception. PAT[0] is set to WB on RESET and also by the + * kernel, i.e. failure indicates a kernel bug or broken firmware. + */ + if (rdmsrq_safe(MSR_IA32_CR_PAT, &host_pat) || + (host_pat & GENMASK(2, 0)) != 6) { + pr_err("host PAT[0] is not WB\n"); + return -EIO; + } + + if (boot_cpu_has(X86_FEATURE_SHSTK) || boot_cpu_has(X86_FEATURE_IBT)) { + rdmsrq(MSR_IA32_S_CET, kvm_host.s_cet); + /* + * Linux doesn't yet support supervisor shadow stacks (SSS), so + * KVM doesn't save/restore the associated MSRs, i.e. KVM may + * clobber the host values. Yell and refuse to load if SSS is + * unexpectedly enabled, e.g. to avoid crashing the host. + */ + if (WARN_ON_ONCE(kvm_host.s_cet & CET_SHSTK_EN)) + return -EIO; + } + + memset(&kvm_caps, 0, sizeof(kvm_caps)); + x86_emulator_cache = kvm_alloc_emulator_cache(); if (!x86_emulator_cache) { - pr_err("kvm: failed to allocate cache for x86 emulator\n"); - goto out_free_x86_fpu_cache; + pr_err("failed to allocate cache for x86 emulator\n"); + return -ENOMEM; } - user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); - if (!user_return_msrs) { - printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); + r = kvm_mmu_vendor_module_init(); + if (r) goto out_free_x86_emulator_cache; + + kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM); + kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P; + + if (boot_cpu_has(X86_FEATURE_XSAVE)) { + kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } - kvm_nr_uret_msrs = 0; - r = kvm_mmu_module_init(); - if (r) - goto out_free_percpu; + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + rdmsrq(MSR_IA32_XSS, kvm_host.xss); + kvm_caps.supported_xss = kvm_host.xss & KVM_SUPPORTED_XSS; + } - kvm_timer_init(); + kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS; + kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; - perf_register_guest_info_callbacks(&kvm_guest_cbs); + rdmsrq_safe(MSR_EFER, &kvm_host.efer); - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); - supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; + kvm_init_pmu_capability(ops->pmu_ops); + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities); + + WARN_ON_ONCE(kvm_nr_uret_msrs); + + r = ops->hardware_setup(); + if (r != 0) + goto out_mmu_exit; + + enable_device_posted_irqs &= enable_apicv && + irq_remapping_cap(IRQ_POSTING_CAP); + + kvm_ops_update(ops); + + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1); + if (r < 0) + goto out_unwind_ops; } + /* + * Point of no return! DO NOT add error paths below this point unless + * absolutely necessary, as most operations from this point forward + * require unwinding. + */ + kvm_timer_init(); + if (pi_inject_timer == -1) - pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER); + pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER); #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); @@ -8434,83 +10107,88 @@ int kvm_arch_init(void *opaque) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif + kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + + if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) + kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); + + /* KVM always ignores guest PAT for shadow paging. */ + if (!tdp_enabled) + kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT; + + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + kvm_caps.supported_xss = 0; + + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + + if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + } + + if (kvm_caps.has_tsc_control) { + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated because it will always + * be 1 on all machines. + */ + u64 max = min(0x7fffffffULL, + __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); + kvm_caps.max_guest_tsc_khz = max; + } + kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; + kvm_init_msr_lists(); return 0; -out_free_percpu: - free_percpu(user_return_msrs); +out_unwind_ops: + kvm_x86_ops.enable_virtualization_cpu = NULL; + kvm_x86_call(hardware_unsetup)(); +out_mmu_exit: + kvm_destroy_user_return_msrs(); + kvm_mmu_vendor_module_exit(); out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); -out_free_x86_fpu_cache: - kmem_cache_destroy(x86_fpu_cache); -out: return r; } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_vendor_init); -void kvm_arch_exit(void) +void kvm_x86_vendor_exit(void) { + kvm_unregister_perf_callbacks(); + #ifdef CONFIG_X86_64 if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) clear_hv_tscchange_cb(); #endif kvm_lapic_exit(); - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); + cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); + } #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); irq_work_sync(&pvclock_irq_work); cancel_work_sync(&pvclock_gtod_work); #endif - kvm_x86_ops.hardware_enable = NULL; - kvm_mmu_module_exit(); - free_percpu(user_return_msrs); + kvm_x86_call(hardware_unsetup)(); + kvm_destroy_user_return_msrs(); + kvm_mmu_vendor_module_exit(); kmem_cache_destroy(x86_emulator_cache); - kmem_cache_destroy(x86_fpu_cache); #ifdef CONFIG_KVM_XEN static_key_deferred_flush(&kvm_xen_enabled); WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif + mutex_lock(&vendor_module_lock); + kvm_x86_ops.enable_virtualization_cpu = NULL; + mutex_unlock(&vendor_module_lock); } - -static int __kvm_vcpu_halt(struct kvm_vcpu *vcpu, int state, int reason) -{ - ++vcpu->stat.halt_exits; - if (lapic_in_kernel(vcpu)) { - vcpu->arch.mp_state = state; - return 1; - } else { - vcpu->run->exit_reason = reason; - return 0; - } -} - -int kvm_vcpu_halt(struct kvm_vcpu *vcpu) -{ - return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); -} -EXPORT_SYMBOL_GPL(kvm_vcpu_halt); - -int kvm_emulate_halt(struct kvm_vcpu *vcpu) -{ - int ret = kvm_skip_emulated_instruction(vcpu); - /* - * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered - * KVM_EXIT_DEBUG here. - */ - return kvm_vcpu_halt(vcpu) && ret; -} -EXPORT_SYMBOL_GPL(kvm_emulate_halt); - -int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) -{ - int ret = kvm_skip_emulated_instruction(vcpu); - - return __kvm_vcpu_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, KVM_EXIT_AP_RESET_HOLD) && ret; -} -EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_vendor_exit); #ifdef CONFIG_X86_64 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, @@ -8524,6 +10202,13 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) return -KVM_EOPNOTSUPP; + /* + * When tsc is in permanent catchup mode guests won't be able to use + * pvclock_read_retry loop to get consistent view of pvclock + */ + if (vcpu->arch.tsc_always_catchup) + return -KVM_EOPNOTSUPP; + if (!kvm_get_walltime_and_clockread(&ts, &cycle)) return -KVM_EOPNOTSUPP; @@ -8547,17 +10232,19 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, * * @apicid - apicid of vcpu to be kicked. */ -static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) +static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid) { - struct kvm_lapic_irq lapic_irq; - - lapic_irq.shorthand = APIC_DEST_NOSHORT; - lapic_irq.dest_mode = APIC_DEST_PHYSICAL; - lapic_irq.level = 0; - lapic_irq.dest_id = apicid; - lapic_irq.msi_redir_hint = false; + /* + * All other fields are unused for APIC_DM_REMRD, but may be consumed by + * common code, e.g. for tracing. Defer initialization to the compiler. + */ + struct kvm_lapic_irq lapic_irq = { + .delivery_mode = APIC_DM_REMRD, + .dest_mode = APIC_DEST_PHYSICAL, + .shorthand = APIC_DEST_NOSHORT, + .dest_id = apicid, + }; - lapic_irq.delivery_mode = APIC_DM_REMRD; kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); } @@ -8565,16 +10252,41 @@ bool kvm_apicv_activated(struct kvm *kvm) { return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0); } -EXPORT_SYMBOL_GPL(kvm_apicv_activated); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apicv_activated); -static void kvm_apicv_init(struct kvm *kvm) +bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu) +{ + ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons); + ulong vcpu_reasons = + kvm_x86_call(vcpu_get_apicv_inhibit_reasons)(vcpu); + + return (vm_reasons | vcpu_reasons) == 0; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_apicv_activated); + +static void set_or_clear_apicv_inhibit(unsigned long *inhibits, + enum kvm_apicv_inhibit reason, bool set) { - if (enable_apicv) - clear_bit(APICV_INHIBIT_REASON_DISABLE, - &kvm->arch.apicv_inhibit_reasons); + const struct trace_print_flags apicv_inhibits[] = { APICV_INHIBIT_REASONS }; + + BUILD_BUG_ON(ARRAY_SIZE(apicv_inhibits) != NR_APICV_INHIBIT_REASONS); + + if (set) + __set_bit(reason, inhibits); else - set_bit(APICV_INHIBIT_REASON_DISABLE, - &kvm->arch.apicv_inhibit_reasons); + __clear_bit(reason, inhibits); + + trace_kvm_apicv_inhibit_changed(reason, set, *inhibits); +} + +static void kvm_apicv_init(struct kvm *kvm) +{ + enum kvm_apicv_inhibit reason = enable_apicv ? APICV_INHIBIT_REASON_ABSENT : + APICV_INHIBIT_REASON_DISABLED; + + set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true); + + init_rwsem(&kvm->arch.apicv_update_lock); } static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) @@ -8590,8 +10302,11 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) rcu_read_lock(); map = rcu_dereference(vcpu->kvm->arch.apic_map); - if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) - target = map->phys_map[dest_id]->vcpu; + if (likely(map) && dest_id <= map->max_apic_id) { + dest_id = array_index_nospec(dest_id, map->max_apic_id + 1); + if (map->phys_map[dest_id]) + target = map->phys_map[dest_id]->vcpu; + } rcu_read_unlock(); @@ -8615,33 +10330,27 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu) { u64 ret = vcpu->run->hypercall.ret; - if (!is_64_bit_mode(vcpu)) + if (!is_64_bit_hypercall(vcpu)) ret = (u32)ret; kvm_rax_write(vcpu, ret); - ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); } -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, + int (*complete_hypercall)(struct kvm_vcpu *)) { - unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit; + unsigned long ret; + unsigned long nr = kvm_rax_read(vcpu); + unsigned long a0 = kvm_rbx_read(vcpu); + unsigned long a1 = kvm_rcx_read(vcpu); + unsigned long a2 = kvm_rdx_read(vcpu); + unsigned long a3 = kvm_rsi_read(vcpu); + int op_64_bit = is_64_bit_hypercall(vcpu); - if (kvm_xen_hypercall_enabled(vcpu->kvm)) - return kvm_xen_hypercall(vcpu); - - if (kvm_hv_hypercall_enabled(vcpu)) - return kvm_hv_hypercall(vcpu); - - nr = kvm_rax_read(vcpu); - a0 = kvm_rbx_read(vcpu); - a1 = kvm_rcx_read(vcpu); - a2 = kvm_rdx_read(vcpu); - a3 = kvm_rsi_read(vcpu); + ++vcpu->stat.hypercalls; trace_kvm_hypercall(nr, a0, a1, a2, a3); - op_64_bit = is_64_bit_mode(vcpu); if (!op_64_bit) { nr &= 0xFFFFFFFF; a0 &= 0xFFFFFFFF; @@ -8650,7 +10359,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 &= 0xFFFFFFFF; } - if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { + if (cpl) { ret = -KVM_EPERM; goto out; } @@ -8665,7 +10374,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) break; - kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + kvm_pv_kick_cpu_op(vcpu->kvm, a1); kvm_sched_yield(vcpu, a1); ret = 0; break; @@ -8691,7 +10400,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) u64 gpa = a0, npages = a1, attrs = a2; ret = -KVM_ENOSYS; - if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) + if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) break; if (!PAGE_ALIGNED(gpa) || !npages || @@ -8702,26 +10411,47 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + /* + * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2) + * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that + * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting + * vcpu->run->hypercall.ret, ensuring that it is zero to not break QEMU. + */ + vcpu->run->hypercall.ret = 0; vcpu->run->hypercall.args[0] = gpa; vcpu->run->hypercall.args[1] = npages; vcpu->run->hypercall.args[2] = attrs; - vcpu->run->hypercall.longmode = op_64_bit; - vcpu->arch.complete_userspace_io = complete_hypercall_exit; + vcpu->run->hypercall.flags = 0; + if (op_64_bit) + vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE; + + WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ); + vcpu->arch.complete_userspace_io = complete_hypercall; return 0; } default: ret = -KVM_ENOSYS; break; } + out: - if (!op_64_bit) - ret = (u32)ret; - kvm_rax_write(vcpu, ret); + vcpu->run->hypercall.ret = ret; + return 1; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(____kvm_emulate_hypercall); - ++vcpu->stat.hypercalls; - return kvm_skip_emulated_instruction(vcpu); +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +{ + if (kvm_xen_hypercall_enabled(vcpu->kvm)) + return kvm_xen_hypercall(vcpu); + + if (kvm_hv_hypercall_enabled(vcpu)) + return kvm_hv_hypercall(vcpu); + + return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu), + complete_hypercall_exit); } -EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_hypercall); static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) { @@ -8729,7 +10459,18 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); - static_call(kvm_x86_patch_hypercall)(vcpu, instruction); + /* + * If the quirk is disabled, synthesize a #UD and let the guest pick up + * the pieces. + */ + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) { + ctxt->exception.error_code_valid = false; + ctxt->exception.vector = UD_VECTOR; + ctxt->have_exception = true; + return X86EMUL_PROPAGATE_FAULT; + } + + kvm_x86_call(patch_hypercall)(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, &ctxt->exception); @@ -8741,26 +10482,23 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) likely(!pic_in_kernel(vcpu->kvm)); } +/* Called within kvm->srcu read side. */ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; - /* - * if_flag is obsolete and useless, so do not bother - * setting it for SEV-ES guests. Userspace can just - * use kvm_run->ready_for_interrupt_injection. - */ - kvm_run->if_flag = !vcpu->arch.guest_state_protected - && (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - + kvm_run->if_flag = kvm_x86_call(get_if_flag)(vcpu); kvm_run->cr8 = kvm_get_cr8(vcpu); - kvm_run->apic_base = kvm_get_apic_base(vcpu); + kvm_run->apic_base = vcpu->arch.apic_base; + kvm_run->ready_for_interrupt_injection = pic_in_kernel(vcpu->kvm) || kvm_vcpu_ready_for_interrupt_injection(vcpu); if (is_smm(vcpu)) kvm_run->flags |= KVM_RUN_X86_SMM; + if (is_guest_mode(vcpu)) + kvm_run->flags |= KVM_RUN_X86_GUEST_MODE; } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -8773,7 +10511,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; - if (vcpu->arch.apicv_active) + if (vcpu->arch.apic->apicv_active) return; if (!vcpu->arch.apic->vapic_addr) @@ -8786,13 +10524,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) tpr = kvm_lapic_get_cr8(vcpu); - static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr); + kvm_x86_call(update_cr8_intercept)(vcpu, tpr, max_irr); } int kvm_check_nested_events(struct kvm_vcpu *vcpu) { - if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { + if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { kvm_x86_ops.nested_ops->triple_fault(vcpu); return 1; } @@ -8802,76 +10540,162 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { - if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) - vcpu->arch.exception.error_code = false; - static_call(kvm_x86_queue_exception)(vcpu); + /* + * Suppress the error code if the vCPU is in Real Mode, as Real Mode + * exceptions don't report error codes. The presence of an error code + * is carried with the exception and only stripped when the exception + * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do + * report an error code despite the CPU being in Real Mode. + */ + vcpu->arch.exception.has_error_code &= is_protmode(vcpu); + + trace_kvm_inj_exception(vcpu->arch.exception.vector, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code, + vcpu->arch.exception.injected); + + kvm_x86_call(inject_exception)(vcpu); } -static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) +/* + * Check for any event (interrupt or exception) that is ready to be injected, + * and if there is at least one event, inject the event with the highest + * priority. This handles both "pending" events, i.e. events that have never + * been injected into the guest, and "injected" events, i.e. events that were + * injected as part of a previous VM-Enter, but weren't successfully delivered + * and need to be re-injected. + * + * Note, this is not guaranteed to be invoked on a guest instruction boundary, + * i.e. doesn't guarantee that there's an event window in the guest. KVM must + * be able to inject exceptions in the "middle" of an instruction, and so must + * also be able to re-inject NMIs and IRQs in the middle of an instruction. + * I.e. for exceptions and re-injected events, NOT invoking this on instruction + * boundaries is necessary and correct. + * + * For simplicity, KVM uses a single path to inject all events (except events + * that are injected directly from L1 to L2) and doesn't explicitly track + * instruction boundaries for asynchronous events. However, because VM-Exits + * that can occur during instruction execution typically result in KVM skipping + * the instruction or injecting an exception, e.g. instruction and exception + * intercepts, and because pending exceptions have higher priority than pending + * interrupts, KVM still honors instruction boundaries in most scenarios. + * + * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip + * the instruction or inject an exception, then KVM can incorrecty inject a new + * asynchronous event if the event became pending after the CPU fetched the + * instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation) + * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be + * injected on the restarted instruction instead of being deferred until the + * instruction completes. + * + * In practice, this virtualization hole is unlikely to be observed by the + * guest, and even less likely to cause functional problems. To detect the + * hole, the guest would have to trigger an event on a side effect of an early + * phase of instruction execution, e.g. on the instruction fetch from memory. + * And for it to be a functional problem, the guest would need to depend on the + * ordering between that side effect, the instruction completing, _and_ the + * delivery of the asynchronous event. + */ +static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, + bool *req_immediate_exit) { + bool can_inject; int r; - bool can_inject = true; - /* try to reinject previous events if any */ + /* + * Process nested events first, as nested VM-Exit supersedes event + * re-injection. If there's an event queued for re-injection, it will + * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit. + */ + if (is_guest_mode(vcpu)) + r = kvm_check_nested_events(vcpu); + else + r = 0; - if (vcpu->arch.exception.injected) { - kvm_inject_exception(vcpu); - can_inject = false; - } /* - * Do not inject an NMI or interrupt if there is a pending - * exception. Exceptions and interrupts are recognized at - * instruction boundaries, i.e. the start of an instruction. - * Trap-like exceptions, e.g. #DB, have higher priority than - * NMIs and interrupts, i.e. traps are recognized before an - * NMI/interrupt that's pending on the same instruction. - * Fault-like exceptions, e.g. #GP and #PF, are the lowest - * priority, but are only generated (pended) during instruction - * execution, i.e. a pending fault-like exception means the - * fault occurred on the *previous* instruction and must be - * serviced prior to recognizing any new events in order to - * fully complete the previous instruction. - */ - else if (!vcpu->arch.exception.pending) { - if (vcpu->arch.nmi_injected) { - static_call(kvm_x86_set_nmi)(vcpu); - can_inject = false; - } else if (vcpu->arch.interrupt.injected) { - static_call(kvm_x86_set_irq)(vcpu); - can_inject = false; - } - } + * Re-inject exceptions and events *especially* if immediate entry+exit + * to/from L2 is needed, as any event that has already been injected + * into L2 needs to complete its lifecycle before injecting a new event. + * + * Don't re-inject an NMI or interrupt if there is a pending exception. + * This collision arises if an exception occurred while vectoring the + * injected event, KVM intercepted said exception, and KVM ultimately + * determined the fault belongs to the guest and queues the exception + * for injection back into the guest. + * + * "Injected" interrupts can also collide with pending exceptions if + * userspace ignores the "ready for injection" flag and blindly queues + * an interrupt. In that case, prioritizing the exception is correct, + * as the exception "occurred" before the exit to userspace. Trap-like + * exceptions, e.g. most #DBs, have higher priority than interrupts. + * And while fault-like exceptions, e.g. #GP and #PF, are the lowest + * priority, they're only generated (pended) during instruction + * execution, and interrupts are recognized at instruction boundaries. + * Thus a pending fault-like exception means the fault occurred on the + * *previous* instruction and must be serviced prior to recognizing any + * new events in order to fully complete the previous instruction. + */ + if (vcpu->arch.exception.injected) + kvm_inject_exception(vcpu); + else if (kvm_is_exception_pending(vcpu)) + ; /* see above */ + else if (vcpu->arch.nmi_injected) + kvm_x86_call(inject_nmi)(vcpu); + else if (vcpu->arch.interrupt.injected) + kvm_x86_call(inject_irq)(vcpu, true); + /* + * Exceptions that morph to VM-Exits are handled above, and pending + * exceptions on top of injected exceptions that do not VM-Exit should + * either morph to #DF or, sadly, override the injected exception. + */ WARN_ON_ONCE(vcpu->arch.exception.injected && vcpu->arch.exception.pending); /* - * Call check_nested_events() even if we reinjected a previous event - * in order for caller to determine if it should require immediate-exit - * from L2 to L1 due to pending L1 events which require exit - * from L2 to L1. + * Bail if immediate entry+exit to/from the guest is needed to complete + * nested VM-Enter or event re-injection so that a different pending + * event can be serviced (or if KVM needs to exit to userspace). + * + * Otherwise, continue processing events even if VM-Exit occurred. The + * VM-Exit will have cleared exceptions that were meant for L2, but + * there may now be events that can be injected into L1. */ - if (is_guest_mode(vcpu)) { - r = kvm_check_nested_events(vcpu); - if (r < 0) - goto out; - } + if (r < 0) + goto out; - /* try to inject new event if pending */ - if (vcpu->arch.exception.pending) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); + /* + * A pending exception VM-Exit should either result in nested VM-Exit + * or force an immediate re-entry and exit to/from L2, and exception + * VM-Exits cannot be injected (flag should _never_ be set). + */ + WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected || + vcpu->arch.exception_vmexit.pending); - vcpu->arch.exception.pending = false; - vcpu->arch.exception.injected = true; + /* + * New events, other than exceptions, cannot be injected if KVM needs + * to re-inject a previous event. See above comments on re-injecting + * for why pending exceptions get priority. + */ + can_inject = !kvm_event_needs_reinjection(vcpu); - if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) + if (vcpu->arch.exception.pending) { + /* + * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS + * value pushed on the stack. Trap-like exception and all #DBs + * leave RF as-is (KVM follows Intel's behavior in this regard; + * AMD states that code breakpoint #DBs excplitly clear RF=0). + * + * Note, most versions of Intel's SDM and AMD's APM incorrectly + * describe the behavior of General Detect #DBs, which are + * fault-like. They do _not_ set RF, a la code breakpoints. + */ + if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); - if (vcpu->arch.exception.nr == DB_VECTOR) { - kvm_deliver_exception_payload(vcpu); + if (vcpu->arch.exception.vector == DB_VECTOR) { + kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception); if (vcpu->arch.dr7 & DR7_GD) { vcpu->arch.dr7 &= ~DR7_GD; kvm_update_dr7(vcpu); @@ -8879,9 +10703,17 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) } kvm_inject_exception(vcpu); + + vcpu->arch.exception.pending = false; + vcpu->arch.exception.injected = true; + can_inject = false; } + /* Don't inject interrupts if the user asked to avoid doing so */ + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) + return 0; + /* * Finally, inject interrupt events. If an event cannot be injected * due to architectural conditions (e.g. IF=0) a window-open exit @@ -8893,8 +10725,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) * in order to make progress and get back here for another iteration. * The kvm_x86_ops hooks communicate this by returning -EBUSY. */ +#ifdef CONFIG_KVM_SMM if (vcpu->arch.smi_pending) { - r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; + r = can_inject ? kvm_x86_call(smi_allowed)(vcpu, true) : + -EBUSY; if (r < 0) goto out; if (r) { @@ -8903,43 +10737,63 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) enter_smm(vcpu); can_inject = false; } else - static_call(kvm_x86_enable_smi_window)(vcpu); + kvm_x86_call(enable_smi_window)(vcpu); } +#endif if (vcpu->arch.nmi_pending) { - r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; + r = can_inject ? kvm_x86_call(nmi_allowed)(vcpu, true) : + -EBUSY; if (r < 0) goto out; if (r) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; - static_call(kvm_x86_set_nmi)(vcpu); + kvm_x86_call(inject_nmi)(vcpu); can_inject = false; - WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0); + WARN_ON(kvm_x86_call(nmi_allowed)(vcpu, true) < 0); } if (vcpu->arch.nmi_pending) - static_call(kvm_x86_enable_nmi_window)(vcpu); + kvm_x86_call(enable_nmi_window)(vcpu); } if (kvm_cpu_has_injectable_intr(vcpu)) { - r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY; + r = can_inject ? kvm_x86_call(interrupt_allowed)(vcpu, true) : + -EBUSY; if (r < 0) goto out; if (r) { - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - static_call(kvm_x86_set_irq)(vcpu); - WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); + int irq = kvm_cpu_get_interrupt(vcpu); + + if (!WARN_ON_ONCE(irq == -1)) { + kvm_queue_interrupt(vcpu, irq, false); + kvm_x86_call(inject_irq)(vcpu, false); + WARN_ON(kvm_x86_call(interrupt_allowed)(vcpu, true) < 0); + } } if (kvm_cpu_has_injectable_intr(vcpu)) - static_call(kvm_x86_enable_irq_window)(vcpu); + kvm_x86_call(enable_irq_window)(vcpu); } if (is_guest_mode(vcpu) && - kvm_x86_ops.nested_ops->hv_timer_pending && - kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + kvm_x86_ops.nested_ops->has_events && + kvm_x86_ops.nested_ops->has_events(vcpu, true)) *req_immediate_exit = true; - WARN_ON(vcpu->arch.exception.pending); + /* + * KVM must never queue a new exception while injecting an event; KVM + * is done emulating and should only propagate the to-be-injected event + * to the VMCS/VMCB. Queueing a new exception can put the vCPU into an + * infinite loop as KVM will bail from VM-Enter to inject the pending + * exception and start the cycle all over. + * + * Exempt triple faults as they have special handling and won't put the + * vCPU into an infinite loop. Triple fault can be queued when running + * VMX without unrestricted guest, as that requires KVM to emulate Real + * Mode events (see kvm_inject_realmode_interrupt()). + */ + WARN_ON_ONCE(vcpu->arch.exception.pending || + vcpu->arch.exception_vmexit.pending); return 0; out: @@ -8952,272 +10806,52 @@ out: static void process_nmi(struct kvm_vcpu *vcpu) { - unsigned limit = 2; + unsigned int limit; /* - * x86 is limited to one NMI running, and one NMI pending after it. - * If an NMI is already in progress, limit further NMIs to just one. - * Otherwise, allow two (and we'll inject the first one immediately). + * x86 is limited to one NMI pending, but because KVM can't react to + * incoming NMIs as quickly as bare metal, e.g. if the vCPU is + * scheduled out, KVM needs to play nice with two queued NMIs showing + * up at the same time. To handle this scenario, allow two NMIs to be + * (temporarily) pending so long as NMIs are not blocked and KVM is not + * waiting for a previous NMI injection to complete (which effectively + * blocks NMIs). KVM will immediately inject one of the two NMIs, and + * will request an NMI window to handle the second NMI. */ - if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) + if (kvm_x86_call(get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) limit = 1; - - vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); - vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); - kvm_make_request(KVM_REQ_EVENT, vcpu); -} - -static u32 enter_smm_get_segment_flags(struct kvm_segment *seg) -{ - u32 flags = 0; - flags |= seg->g << 23; - flags |= seg->db << 22; - flags |= seg->l << 21; - flags |= seg->avl << 20; - flags |= seg->present << 15; - flags |= seg->dpl << 13; - flags |= seg->s << 12; - flags |= seg->type << 8; - return flags; -} - -static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) -{ - struct kvm_segment seg; - int offset; - - kvm_get_segment(vcpu, &seg, n); - put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector); - - if (n < 3) - offset = 0x7f84 + n * 12; else - offset = 0x7f2c + (n - 3) * 12; - - put_smstate(u32, buf, offset + 8, seg.base); - put_smstate(u32, buf, offset + 4, seg.limit); - put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg)); -} - -#ifdef CONFIG_X86_64 -static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) -{ - struct kvm_segment seg; - int offset; - u16 flags; - - kvm_get_segment(vcpu, &seg, n); - offset = 0x7e00 + n * 16; - - flags = enter_smm_get_segment_flags(&seg) >> 8; - put_smstate(u16, buf, offset, seg.selector); - put_smstate(u16, buf, offset + 2, flags); - put_smstate(u32, buf, offset + 4, seg.limit); - put_smstate(u64, buf, offset + 8, seg.base); -} -#endif - -static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) -{ - struct desc_ptr dt; - struct kvm_segment seg; - unsigned long val; - int i; - - put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); - put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); - put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); - put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); - - for (i = 0; i < 8; i++) - put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i)); - - kvm_get_dr(vcpu, 6, &val); - put_smstate(u32, buf, 0x7fcc, (u32)val); - kvm_get_dr(vcpu, 7, &val); - put_smstate(u32, buf, 0x7fc8, (u32)val); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); - put_smstate(u32, buf, 0x7fc4, seg.selector); - put_smstate(u32, buf, 0x7f64, seg.base); - put_smstate(u32, buf, 0x7f60, seg.limit); - put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg)); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); - put_smstate(u32, buf, 0x7fc0, seg.selector); - put_smstate(u32, buf, 0x7f80, seg.base); - put_smstate(u32, buf, 0x7f7c, seg.limit); - put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); - - static_call(kvm_x86_get_gdt)(vcpu, &dt); - put_smstate(u32, buf, 0x7f74, dt.address); - put_smstate(u32, buf, 0x7f70, dt.size); - - static_call(kvm_x86_get_idt)(vcpu, &dt); - put_smstate(u32, buf, 0x7f58, dt.address); - put_smstate(u32, buf, 0x7f54, dt.size); - - for (i = 0; i < 6; i++) - enter_smm_save_seg_32(vcpu, buf, i); - - put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); - - /* revision id */ - put_smstate(u32, buf, 0x7efc, 0x00020000); - put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); -} - -#ifdef CONFIG_X86_64 -static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) -{ - struct desc_ptr dt; - struct kvm_segment seg; - unsigned long val; - int i; - - for (i = 0; i < 16; i++) - put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i)); - - put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu)); - put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); - - kvm_get_dr(vcpu, 6, &val); - put_smstate(u64, buf, 0x7f68, val); - kvm_get_dr(vcpu, 7, &val); - put_smstate(u64, buf, 0x7f60, val); - - put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); - put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); - put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); - - put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase); - - /* revision id */ - put_smstate(u32, buf, 0x7efc, 0x00020064); - - put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); - put_smstate(u16, buf, 0x7e90, seg.selector); - put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8); - put_smstate(u32, buf, 0x7e94, seg.limit); - put_smstate(u64, buf, 0x7e98, seg.base); - - static_call(kvm_x86_get_idt)(vcpu, &dt); - put_smstate(u32, buf, 0x7e84, dt.size); - put_smstate(u64, buf, 0x7e88, dt.address); - - kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); - put_smstate(u16, buf, 0x7e70, seg.selector); - put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8); - put_smstate(u32, buf, 0x7e74, seg.limit); - put_smstate(u64, buf, 0x7e78, seg.base); - - static_call(kvm_x86_get_gdt)(vcpu, &dt); - put_smstate(u32, buf, 0x7e64, dt.size); - put_smstate(u64, buf, 0x7e68, dt.address); - - for (i = 0; i < 6; i++) - enter_smm_save_seg_64(vcpu, buf, i); -} -#endif - -static void enter_smm(struct kvm_vcpu *vcpu) -{ - struct kvm_segment cs, ds; - struct desc_ptr dt; - unsigned long cr0; - char buf[512]; - - memset(buf, 0, 512); -#ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - enter_smm_save_state_64(vcpu, buf); - else -#endif - enter_smm_save_state_32(vcpu, buf); + limit = 2; /* - * Give enter_smm() a chance to make ISA-specific changes to the vCPU - * state (e.g. leave guest mode) after we've saved the state into the - * SMM state-save area. + * Adjust the limit to account for pending virtual NMIs, which aren't + * tracked in vcpu->arch.nmi_pending. */ - static_call(kvm_x86_enter_smm)(vcpu, buf); - - kvm_smm_changed(vcpu, true); - kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); - - if (static_call(kvm_x86_get_nmi_mask)(vcpu)) - vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; - else - static_call(kvm_x86_set_nmi_mask)(vcpu, true); - - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - kvm_rip_write(vcpu, 0x8000); - - cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); - static_call(kvm_x86_set_cr0)(vcpu, cr0); - vcpu->arch.cr0 = cr0; - - static_call(kvm_x86_set_cr4)(vcpu, 0); - - /* Undocumented: IDT limit is set to zero on entry to SMM. */ - dt.address = dt.size = 0; - static_call(kvm_x86_set_idt)(vcpu, &dt); - - kvm_set_dr(vcpu, 7, DR7_FIXED_1); - - cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; - cs.base = vcpu->arch.smbase; + if (kvm_x86_call(is_vnmi_pending)(vcpu)) + limit--; - ds.selector = 0; - ds.base = 0; - - cs.limit = ds.limit = 0xffffffff; - cs.type = ds.type = 0x3; - cs.dpl = ds.dpl = 0; - cs.db = ds.db = 0; - cs.s = ds.s = 1; - cs.l = ds.l = 0; - cs.g = ds.g = 1; - cs.avl = ds.avl = 0; - cs.present = ds.present = 1; - cs.unusable = ds.unusable = 0; - cs.padding = ds.padding = 0; - - kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_DS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_ES); - kvm_set_segment(vcpu, &ds, VCPU_SREG_FS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); - kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); + vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); + vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); -#ifdef CONFIG_X86_64 - if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - static_call(kvm_x86_set_efer)(vcpu, 0); -#endif + if (vcpu->arch.nmi_pending && + (kvm_x86_call(set_vnmi_pending)(vcpu))) + vcpu->arch.nmi_pending--; - kvm_update_cpuid_runtime(vcpu); - kvm_mmu_reset_context(vcpu); + if (vcpu->arch.nmi_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); } -static void process_smi(struct kvm_vcpu *vcpu) +/* Return total number of NMIs pending injection to the VM */ +int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu) { - vcpu->arch.smi_pending = true; - kvm_make_request(KVM_REQ_EVENT, vcpu); + return vcpu->arch.nmi_pending + + kvm_x86_call(is_vnmi_pending)(vcpu); } void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap) { - cpumask_var_t cpus; - - zalloc_cpumask_var(&cpus, GFP_ATOMIC); - - kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, - NULL, vcpu_bitmap, cpus); - - free_cpumask_var(cpus); + kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap); } void kvm_make_scan_ioapic_request(struct kvm *kvm) @@ -9225,73 +10859,118 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); } -void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) +void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { + struct kvm_lapic *apic = vcpu->arch.apic; + bool activate; + if (!lapic_in_kernel(vcpu)) return; - vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); + down_read(&vcpu->kvm->arch.apicv_update_lock); + preempt_disable(); + + /* Do not activate APICV when APIC is disabled */ + activate = kvm_vcpu_apicv_activated(vcpu) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED); + + if (apic->apicv_active == activate) + goto out; + + apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); - static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); + kvm_x86_call(refresh_apicv_exec_ctrl)(vcpu); /* * When APICv gets disabled, we may still have injected interrupts * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was * still active when the interrupt got accepted. Make sure - * inject_pending_event() is called to check for that. + * kvm_check_and_inject_events() is called to check for that. */ - if (!vcpu->arch.apicv_active) + if (!apic->apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); + +out: + preempt_enable(); + up_read(&vcpu->kvm->arch.apicv_update_lock); } -EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_update_apicv); -/* - * NOTE: Do not hold any lock prior to calling this. - * - * In particular, kvm_request_apicv_update() expects kvm->srcu not to be - * locked, because it calls __x86_set_memory_region() which does - * synchronize_srcu(&kvm->srcu). - */ -void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *except; - unsigned long old, new, expected; - - if (!kvm_x86_ops.check_apicv_inhibit_reasons || - !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) + if (!lapic_in_kernel(vcpu)) return; - old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); - do { - expected = new = old; - if (activate) - __clear_bit(bit, &new); - else - __set_bit(bit, &new); - if (new == old) - break; - old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new); - } while (old != expected); + /* + * Due to sharing page tables across vCPUs, the xAPIC memslot must be + * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but + * and hardware doesn't support x2APIC virtualization. E.g. some AMD + * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in + * this case so that KVM can use the AVIC doorbell to inject interrupts + * to running vCPUs, but KVM must not create SPTEs for the APIC base as + * the vCPU would incorrectly be able to access the vAPIC page via MMIO + * despite being in x2APIC mode. For simplicity, inhibiting the APIC + * access page is sticky. + */ + if (apic_x2apic_mode(vcpu->arch.apic) && + kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization) + kvm_inhibit_apic_access_page(vcpu); + + __kvm_vcpu_update_apicv(vcpu); +} + +void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set) +{ + unsigned long old, new; + + lockdep_assert_held_write(&kvm->arch.apicv_update_lock); - if (!!old == !!new) + if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason))) return; - trace_kvm_apicv_update_request(activate, bit); - if (kvm_x86_ops.pre_update_apicv_exec_ctrl) - static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate); + old = new = kvm->arch.apicv_inhibit_reasons; - /* - * Sending request to update APICV for all other vcpus, - * while update the calling vcpu immediately instead of - * waiting for another #VMEXIT to handle the request. - */ - except = kvm_get_running_vcpu(); - kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE, - except); - if (except) - kvm_vcpu_update_apicv(except); + set_or_clear_apicv_inhibit(&new, reason, set); + + if (!!old != !!new) { + /* + * Kick all vCPUs before setting apicv_inhibit_reasons to avoid + * false positives in the sanity check WARN in vcpu_enter_guest(). + * This task will wait for all vCPUs to ack the kick IRQ before + * updating apicv_inhibit_reasons, and all other vCPUs will + * block on acquiring apicv_update_lock so that vCPUs can't + * redo vcpu_enter_guest() without seeing the new inhibit state. + * + * Note, holding apicv_update_lock and taking it in the read + * side (handling the request) also prevents other vCPUs from + * servicing the request with a stale apicv_inhibit_reasons. + */ + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); + kvm->arch.apicv_inhibit_reasons = new; + if (new) { + unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE); + int idx = srcu_read_lock(&kvm->srcu); + + kvm_zap_gfn_range(kvm, gfn, gfn+1); + srcu_read_unlock(&kvm->srcu, idx); + } + } else { + kvm->arch.apicv_inhibit_reasons = new; + } } -EXPORT_SYMBOL_GPL(kvm_request_apicv_update); + +void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set) +{ + if (!enable_apicv) + return; + + down_write(&kvm->arch.apicv_update_lock); + __kvm_set_or_clear_apicv_inhibit(kvm, reason, set); + up_write(&kvm->arch.apicv_update_lock); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_or_clear_apicv_inhibit); static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { @@ -9299,15 +10978,16 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) return; bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256); + vcpu->arch.highest_stale_pending_ioapic_eoi = -1; + + kvm_x86_call(sync_pir_to_irr)(vcpu); if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); - else { - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); - if (ioapic_in_kernel(vcpu->kvm)) - kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); - } +#ifdef CONFIG_KVM_IOAPIC + else if (ioapic_in_kernel(vcpu->kvm)) + kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); +#endif if (is_guest_mode(vcpu)) vcpu->arch.load_eoi_exitmap_pending = true; @@ -9317,51 +10997,39 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) { - u64 eoi_exit_bitmap[4]; - if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - if (to_hv_vcpu(vcpu)) +#ifdef CONFIG_KVM_HYPERV + if (to_hv_vcpu(vcpu)) { + u64 eoi_exit_bitmap[4]; + bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, to_hv_synic(vcpu)->vec_bitmap, 256); - - static_call(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + kvm_x86_call(load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + return; + } +#endif + kvm_x86_call(load_eoi_exitmap)( + vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) +void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { - unsigned long apic_address; - - /* - * The physical address of apic access page is stored in the VMCS. - * Update it when it becomes invalid. - */ - apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (start <= apic_address && apic_address < end) - kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + kvm_x86_call(guest_memory_reclaimed)(kvm); } -void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) +static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { if (!lapic_in_kernel(vcpu)) return; - if (!kvm_x86_ops.set_apic_access_page_addr) - return; - - static_call(kvm_x86_set_apic_access_page_addr)(vcpu); + kvm_x86_call(set_apic_access_page_addr)(vcpu); } -void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) -{ - smp_send_reschedule(vcpu->cpu); -} -EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); - /* + * Called within kvm->srcu read side. * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the * userspace. @@ -9373,31 +11041,33 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); fastpath_t exit_fastpath; + u64 run_flags, debug_ctl; bool req_immediate_exit = false; - /* Forbid vmenter if vcpu dirty ring is soft-full */ - if (unlikely(vcpu->kvm->dirty_ring_size && - kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) { - vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; - trace_kvm_dirty_ring_exit(vcpu); - r = 0; - goto out; - } - if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { + r = -EIO; + goto out; + } + + if (kvm_dirty_ring_check_request(vcpu)) { + r = 0; + goto out; + } + if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; goto out; } } - if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) - kvm_mmu_unload(vcpu); + if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) + kvm_mmu_free_obsolete_roots(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) - kvm_gen_update_masterclock(vcpu->kvm); + kvm_update_masterclock(vcpu->kvm); if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) kvm_gen_kvmclock_update(vcpu); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { @@ -9409,26 +11079,39 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) kvm_mmu_load_pgd(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + + /* + * Note, the order matters here, as flushing "all" TLB entries + * also flushes the "current" TLB entries, i.e. servicing the + * flush "all" will clear any request to flush "current". + */ + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb_all(vcpu); - /* Flushing all ASIDs flushes the current ASID... */ - kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) + kvm_service_local_tlb_flush_requests(vcpu); + + /* + * Fall back to a "full" guest flush if Hyper-V's precise + * flushing fails. Note, Hyper-V's flushing is per-vCPU, but + * the flushes are considered "remote" and not "local" because + * the requests can be initiated from other vCPUs. + */ +#ifdef CONFIG_KVM_HYPERV + if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) && + kvm_hv_vcpu_flush_tlb(vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); +#endif if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; goto out; } - if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { - if (is_guest_mode(vcpu)) { + if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { + if (is_guest_mode(vcpu)) kvm_x86_ops.nested_ops->triple_fault(vcpu); - } else { + + if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; vcpu->mmio_needed = 0; r = 0; @@ -9443,14 +11126,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); - if (kvm_check_request(KVM_REQ_SMI, vcpu)) - process_smi(vcpu); - if (kvm_check_request(KVM_REQ_NMI, vcpu)) - process_nmi(vcpu); if (kvm_check_request(KVM_REQ_PMU, vcpu)) kvm_pmu_handle_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_pmu_deliver_pmi(vcpu); +#ifdef CONFIG_KVM_SMM + if (kvm_check_request(KVM_REQ_SMI, vcpu)) + process_smi(vcpu); +#endif + if (kvm_check_request(KVM_REQ_NMI, vcpu)) + process_nmi(vcpu); if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); if (test_bit(vcpu->arch.pending_ioapic_eoi, @@ -9468,15 +11153,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu_load_eoi_exitmap(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) kvm_vcpu_reload_apic_access_page(vcpu); +#ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; + vcpu->run->system_event.ndata = 0; r = 0; goto out; } if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; + vcpu->run->system_event.ndata = 0; r = 0; goto out; } @@ -9496,15 +11184,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu)) kvm_hv_process_stimers(vcpu); +#endif if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) kvm_vcpu_update_apicv(vcpu); if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) kvm_check_async_pf_completion(vcpu); - if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) - static_call(kvm_x86_msr_filter_changed)(vcpu); + + if (kvm_check_request(KVM_REQ_RECALC_INTERCEPTS, vcpu)) + kvm_x86_call(recalc_intercepts)(vcpu); if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) - static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); + kvm_x86_call(update_cpu_dirty_logging)(vcpu); + + if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) { + kvm_vcpu_reset(vcpu, true); + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) { + r = 1; + goto out; + } + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || @@ -9520,13 +11218,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - r = inject_pending_event(vcpu, &req_immediate_exit); + r = kvm_check_and_inject_events(vcpu, &req_immediate_exit); if (r < 0) { r = 0; goto out; } if (req_int_win) - static_call(kvm_x86_enable_irq_window)(vcpu); + kvm_x86_call(enable_irq_window)(vcpu); if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); @@ -9541,7 +11239,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); - static_call(kvm_x86_prepare_guest_switch)(vcpu); + kvm_x86_call(prepare_switch_to_guest)(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -9549,9 +11247,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * result in virtual interrupt delivery. */ local_irq_disable(); - vcpu->mode = IN_GUEST_MODE; - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + /* Store vcpu->apicv_active before vcpu->mode. */ + smp_store_release(&vcpu->mode, IN_GUEST_MODE); + + kvm_vcpu_srcu_read_unlock(vcpu); /* * 1) We should set ->mode before checking ->requests. Please see @@ -9568,54 +11268,103 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_mb__after_srcu_read_unlock(); /* - * This handles the case where a posted interrupt was - * notified with kvm_vcpu_kick. + * Process pending posted interrupts to handle the case where the + * notification IRQ arrived in the host, or was never sent (because the + * target vCPU wasn't running). Do this regardless of the vCPU's APICv + * status, KVM doesn't update assigned devices when APICv is inhibited, + * i.e. they can post interrupts even if APICv is temporarily disabled. */ - if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + if (kvm_lapic_enabled(vcpu)) + kvm_x86_call(sync_pir_to_irr)(vcpu); if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); preempt_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); r = 1; goto cancel_injection; } + run_flags = 0; if (req_immediate_exit) { + run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT; kvm_make_request(KVM_REQ_EVENT, vcpu); - static_call(kvm_x86_request_immediate_exit)(vcpu); } fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_return(); - if (unlikely(vcpu->arch.switch_db_regs)) { - set_debugreg(0, 7); + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + + kvm_load_xfeatures(vcpu, true); + + if (unlikely(vcpu->arch.switch_db_regs && + !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) { + set_debugreg(DR7_FIXED_1, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); - set_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + run_flags |= KVM_RUN_LOAD_GUEST_DR6; + } else if (unlikely(hw_breakpoint_active())) { + set_debugreg(DR7_FIXED_1, 7); } + /* + * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL + * can be modified in IRQ context, e.g. via SMP function calls. Inform + * vendor code if any host-owned bits were changed, e.g. so that the + * value loaded into hardware while running the guest can be updated. + */ + debug_ctl = get_debugctlmsr(); + if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL && + !vcpu->arch.guest_state_protected) + run_flags |= KVM_RUN_LOAD_DEBUGCTL; + vcpu->arch.host_debugctl = debug_ctl; + + guest_timing_enter_irqoff(); + + /* + * Swap PKRU with hardware breakpoints disabled to minimize the number + * of flows where non-KVM code can run with guest state loaded. + */ + kvm_load_guest_pkru(vcpu); + for (;;) { - exit_fastpath = static_call(kvm_x86_run)(vcpu); + /* + * Assert that vCPU vs. VM APICv state is consistent. An APICv + * update must kick and wait for all vCPUs before toggling the + * per-VM state, and responding vCPUs must wait for the update + * to complete before servicing KVM_REQ_APICV_UPDATE. + */ + WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); + + exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; - if (unlikely(kvm_vcpu_exit_request(vcpu))) { + if (kvm_lapic_enabled(vcpu)) + kvm_x86_call(sync_pir_to_irr)(vcpu); + + if (unlikely(kvm_vcpu_exit_request(vcpu))) { exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; break; } - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); - } + run_flags = 0; + + /* Note, VM-Exits that go down the "slow" path are accounted below. */ + ++vcpu->stat.exits; + } + + kvm_load_host_pkru(vcpu); /* * Do this here before restoring debug registers on the host. And @@ -9625,10 +11374,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); - static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); + WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH); + kvm_x86_call(sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } /* @@ -9647,7 +11396,29 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - static_call(kvm_x86_handle_exit_irqoff)(vcpu); + kvm_load_xfeatures(vcpu, false); + + /* + * Sync xfd before calling handle_exit_irqoff() which may + * rely on the fact that guest_fpu::xfd is up-to-date (e.g. + * in #NM irqoff handler). + */ + if (vcpu->arch.xfd_no_write_intercept) + fpu_sync_guest_vmexit_xfd_state(); + + kvm_x86_call(handle_exit_irqoff)(vcpu); + + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrq(MSR_IA32_XFD_ERR, 0); + + /* + * Mark this CPU as needing a branch predictor flush before running + * userspace. Must be done before enabling preemption to ensure it gets + * set for the CPU that actually ran the guest, and not the CPU that it + * may migrate to. + */ + if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER)) + this_cpu_write(x86_ibpb_exit_to_user, true); /* * Consume any pending interrupts, including the possible source of @@ -9656,7 +11427,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * interrupts on processors that implement an interrupt shadow, the * stat.exits increment will do nicely. */ - kvm_before_interrupt(vcpu); + kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); local_irq_enable(); ++vcpu->stat.exits; local_irq_disable(); @@ -9669,25 +11440,24 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * of accounting via context tracking, but the loss of accuracy is * acceptable for all known use cases. */ - vtime_account_guest_exit(); - - if (lapic_in_kernel(vcpu)) { - s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; - if (delta != S64_MIN) { - trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta); - vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN; - } - } + guest_timing_exit_irqoff(); local_irq_enable(); preempt_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); + + /* + * Call this to ensure WC buffers in guest are evicted after each VM + * Exit, so that the evicted WC writes can be snooped across all cpus + */ + smp_mb__after_srcu_read_lock(); /* * Profile KVM exit RIPs: */ - if (unlikely(prof_on == KVM_PROFILING)) { + if (unlikely(prof_on == KVM_PROFILING && + !vcpu->arch.guest_state_protected)) { unsigned long rip = kvm_rip_read(vcpu); profile_hit(KVM_PROFILING, (void *)rip); } @@ -9698,42 +11468,138 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); - r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath); + if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE)) + return 0; + + r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath); return r; cancel_injection: if (req_immediate_exit) kvm_make_request(KVM_REQ_EVENT, vcpu); - static_call(kvm_x86_cancel_injection)(vcpu); + kvm_x86_call(cancel_injection)(vcpu); if (unlikely(vcpu->arch.apic_attention)) kvm_lapic_sync_from_vapic(vcpu); out: return r; } -static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) +static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { - if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - kvm_vcpu_block(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted); +} + +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +{ + if (!list_empty_careful(&vcpu->async_pf.done)) + return true; + + if (kvm_apic_has_pending_init_or_sipi(vcpu) && + kvm_apic_init_sipi_allowed(vcpu)) + return true; - if (kvm_x86_ops.post_block) - static_call(kvm_x86_post_block)(vcpu); + if (kvm_is_exception_pending(vcpu)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || + (vcpu->arch.nmi_pending && + kvm_x86_call(nmi_allowed)(vcpu, false))) + return true; + +#ifdef CONFIG_KVM_SMM + if (kvm_test_request(KVM_REQ_SMI, vcpu) || + (vcpu->arch.smi_pending && + kvm_x86_call(smi_allowed)(vcpu, false))) + return true; +#endif + + if (kvm_test_request(KVM_REQ_PMI, vcpu)) + return true; + + if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) + return true; + + if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)) + return true; - if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) + if (kvm_hv_has_stimer_pending(vcpu)) + return true; + + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->has_events && + kvm_x86_ops.nested_ops->has_events(vcpu, false)) + return true; + + if (kvm_xen_has_pending_events(vcpu)) + return true; + + return false; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_has_events); + +int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted || + kvm_vcpu_has_events(vcpu); +} + +/* Called within kvm->srcu read side. */ +static inline int vcpu_block(struct kvm_vcpu *vcpu) +{ + bool hv_timer; + + if (!kvm_arch_vcpu_runnable(vcpu)) { + /* + * Switch to the software timer before halt-polling/blocking as + * the guest's timer may be a break event for the vCPU, and the + * hypervisor timer runs only when the CPU is in guest mode. + * Switch before halt-polling so that KVM recognizes an expired + * timer before blocking. + */ + hv_timer = kvm_lapic_hv_timer_in_use(vcpu); + if (hv_timer) + kvm_lapic_switch_to_sw_timer(vcpu); + + kvm_vcpu_srcu_read_unlock(vcpu); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + kvm_vcpu_halt(vcpu); + else + kvm_vcpu_block(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + + if (hv_timer) + kvm_lapic_switch_to_hv_timer(vcpu); + + /* + * If the vCPU is not runnable, a signal or another host event + * of some kind is pending; service it without changing the + * vCPU's activity state. + */ + if (!kvm_arch_vcpu_runnable(vcpu)) return 1; } + /* + * Evaluate nested events before exiting the halted state. This allows + * the halt state to be recorded properly in the VMCS12's activity + * state field (AMD does not have a similar field and a VM-Exit always + * causes a spurious wakeup from HLT). + */ + if (is_guest_mode(vcpu)) { + int r = kvm_check_nested_events(vcpu); + + WARN_ON_ONCE(r == -EBUSY); + if (r < 0) + return 0; + } + if (kvm_apic_accept_events(vcpu) < 0) return 0; switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: case KVM_MP_STATE_AP_RESET_HOLD: - vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); fallthrough; case KVM_MP_STATE_RUNNABLE: vcpu->arch.apf.halted = false; @@ -9741,39 +11607,40 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) case KVM_MP_STATE_INIT_RECEIVED: break; default: - return -EINTR; + WARN_ON_ONCE(1); + break; } return 1; } -static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu)) - kvm_check_nested_events(vcpu); - - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted); -} - +/* Called within kvm->srcu read side. */ static int vcpu_run(struct kvm_vcpu *vcpu) { int r; - struct kvm *kvm = vcpu->kvm; - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - vcpu->arch.l1tf_flush_l1d = true; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; for (;;) { + /* + * If another guest vCPU requests a PV TLB flush in the middle + * of instruction emulation, the rest of the emulation could + * use a stale page translation. Assume that any code after + * this point can start executing an instruction. + */ + vcpu->arch.at_instruction_boundary = false; if (kvm_vcpu_running(vcpu)) { r = vcpu_enter_guest(vcpu); } else { - r = vcpu_block(kvm, vcpu); + r = vcpu_block(vcpu); } if (r <= 0) break; kvm_clear_request(KVM_REQ_UNBLOCK, vcpu); + if (kvm_xen_has_pending_events(vcpu)) + kvm_xen_inject_pending_events(vcpu); + if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -9786,27 +11653,105 @@ static int vcpu_run(struct kvm_vcpu *vcpu) } if (__xfer_to_guest_mode_work_pending()) { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - r = xfer_to_guest_mode_handle_work(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); + r = kvm_xfer_to_guest_mode_handle_work(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); if (r) return r; - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - return r; } -static inline int complete_emulated_io(struct kvm_vcpu *vcpu) +static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) { - int r; + /* + * The vCPU has halted, e.g. executed HLT. Update the run state if the + * local APIC is in-kernel, the run loop will detect the non-runnable + * state and halt the vCPU. Exit to userspace if the local APIC is + * managed by userspace, in which case userspace is responsible for + * handling wake events. + */ + ++vcpu->stat.halt_exits; + if (lapic_in_kernel(vcpu)) { + if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted) + state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, state); + return 1; + } else { + vcpu->run->exit_reason = reason; + return 0; + } +} - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - return r; +int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt_noskip); + +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + /* + * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + return kvm_emulate_halt_noskip(vcpu) && ret; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt); + +fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) +{ + if (!kvm_emulate_halt(vcpu)) + return EXIT_FASTPATH_EXIT_USERSPACE; + + if (kvm_vcpu_running(vcpu)) + return EXIT_FASTPATH_REENTER_GUEST; + + return EXIT_FASTPATH_EXIT_HANDLED; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_hlt); + +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, + KVM_EXIT_AP_RESET_HOLD) && ret; +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_ap_reset_hold); + +bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_apicv_active(vcpu) && + kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu); +} + +bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.preempted_in_kernel; +} + +bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +{ + if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) + return true; + + if (kvm_test_request(KVM_REQ_NMI, vcpu) || +#ifdef CONFIG_KVM_SMM + kvm_test_request(KVM_REQ_SMI, vcpu) || +#endif + kvm_test_request(KVM_REQ_EVENT, vcpu)) + return true; + + return kvm_arch_dy_has_pending_interrupt(vcpu); +} + +static inline int complete_emulated_io(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); } static int complete_emulated_pio(struct kvm_vcpu *vcpu) @@ -9879,83 +11824,91 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } -static void kvm_save_current_fpu(struct fpu *fpu) -{ - /* - * If the target FPU state is not resident in the CPU registers, just - * memcpy() from current, else save CPU state directly to the target. - */ - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&fpu->state, ¤t->thread.fpu.state, - fpu_kernel_xstate_size); - else - save_fpregs_to_fpstate(fpu); -} - /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - kvm_save_current_fpu(vcpu->arch.user_fpu); - - /* - * Guests with protected state can't have it set by the hypervisor, - * so skip trying to set it. - */ - if (vcpu->arch.guest_fpu) - /* PKRU is separately restored in kvm_x86_ops.run. */ - __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state, - ~XFEATURE_MASK_PKRU); - - fpregs_mark_activate(); - fpregs_unlock(); + if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) + return; + /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); + if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm)) + return; + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); + ++vcpu->stat.fpu_reload; + trace_kvm_fpu(0); +} + +static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ /* - * Guests with protected state can't have it read by the hypervisor, - * so skip trying to save it. + * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and + * tracks the pending SIPI separately. SIPI_RECEIVED is still accepted + * by KVM_SET_VCPU_EVENTS for backwards compatibility, but should be + * converted to INIT_RECEIVED. */ - if (vcpu->arch.guest_fpu) - kvm_save_current_fpu(vcpu->arch.guest_fpu); - - restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state); + if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) + return -EINVAL; - fpregs_mark_activate(); - fpregs_unlock(); + /* + * Disallow running the vCPU if userspace forced it into an impossible + * MP_STATE, e.g. if the vCPU is in WFS but SIPI is blocked. + */ + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED && + !kvm_apic_init_sipi_allowed(vcpu)) + return -EINVAL; - ++vcpu->stat.fpu_reload; - trace_kvm_fpu(0); + return kvm_x86_call(vcpu_pre_run)(vcpu); } int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; + u64 sync_valid_fields; int r; + r = kvm_mmu_post_init_vm(vcpu->kvm); + if (r) + return r; + vcpu_load(vcpu); kvm_sigset_activate(vcpu); kvm_run->flags = 0; kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { - if (kvm_run->immediate_exit) { + if (!vcpu->wants_to_run) { r = -EINTR; goto out; } + + /* + * Don't bother switching APIC timer emulation from the + * hypervisor timer to the software timer, the only way for the + * APIC timer to be active is if userspace stuffed vCPU state, + * i.e. put the vCPU into a nonsensical state. Only an INIT + * will transition the vCPU out of UNINITIALIZED (without more + * state stuffing from userspace), which will reset the local + * APIC and thus cancel the timer or drop the IRQ (if the timer + * already expired). + */ + kvm_vcpu_srcu_read_unlock(vcpu); kvm_vcpu_block(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + if (kvm_apic_accept_events(vcpu) < 0) { r = 0; goto out; } - kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; if (signal_pending(current)) { r = -EINTR; @@ -9965,7 +11918,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm); + if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) || + (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) { r = -EINVAL; goto out; } @@ -9984,27 +11939,51 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) } } + /* + * If userspace set a pending exception and L2 is active, convert it to + * a pending VM-Exit if L1 wants to intercept the exception. + */ + if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector, + ex->error_code)) { + kvm_queue_exception_vmexit(vcpu, ex->vector, + ex->has_error_code, ex->error_code, + ex->has_payload, ex->payload); + ex->injected = false; + ex->pending = false; + } + vcpu->arch.exception_from_userspace = false; + if (unlikely(vcpu->arch.complete_userspace_io)) { int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; vcpu->arch.complete_userspace_io = NULL; r = cui(vcpu); if (r <= 0) goto out; - } else - WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); + } else { + WARN_ON_ONCE(vcpu->arch.pio.count); + WARN_ON_ONCE(vcpu->mmio_needed); + } - if (kvm_run->immediate_exit) + if (!vcpu->wants_to_run) { r = -EINTR; - else - r = vcpu_run(vcpu); + goto out; + } + + r = kvm_x86_vcpu_pre_run(vcpu); + if (r <= 0) + goto out; + + r = vcpu_run(vcpu); out: kvm_put_guest_fpu(vcpu); - if (kvm_run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected)) store_regs(vcpu); post_kvm_run_save(vcpu); - kvm_sigset_deactivate(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); + kvm_sigset_deactivate(vcpu); vcpu_put(vcpu); return r; } @@ -10047,6 +12026,10 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); __get_regs(vcpu, regs); vcpu_put(vcpu); @@ -10081,28 +12064,23 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); vcpu->arch.exception.pending = false; + vcpu->arch.exception_vmexit.pending = false; kvm_make_request(KVM_REQ_EVENT, vcpu); } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); __set_regs(vcpu, regs); vcpu_put(vcpu); return 0; } -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - struct kvm_segment cs; - - kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); - *db = cs.db; - *l = cs.l; -} -EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); - static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct desc_ptr dt; @@ -10120,10 +12098,10 @@ static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - static_call(kvm_x86_get_idt)(vcpu, &dt); + kvm_x86_call(get_idt)(vcpu, &dt); sregs->idt.limit = dt.size; sregs->idt.base = dt.address; - static_call(kvm_x86_get_gdt)(vcpu, &dt); + kvm_x86_call(get_gdt)(vcpu, &dt); sregs->gdt.limit = dt.size; sregs->gdt.base = dt.address; @@ -10135,7 +12113,7 @@ skip_protected_regs: sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.efer; - sregs->apic_base = kvm_get_apic_base(vcpu); + sregs->apic_base = vcpu->arch.apic_base; } static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -10169,6 +12147,10 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); __get_sregs(vcpu, sregs); vcpu_put(vcpu); @@ -10181,8 +12163,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int r; vcpu_load(vcpu); - if (kvm_mpx_supported()) - kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); r = kvm_apic_accept_events(vcpu); if (r < 0) @@ -10197,8 +12178,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state = vcpu->arch.mp_state; out: - if (kvm_mpx_supported()) - kvm_put_guest_fpu(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); vcpu_put(vcpu); return r; } @@ -10210,25 +12190,34 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, vcpu_load(vcpu); - if (!lapic_in_kernel(vcpu) && - mp_state->mp_state != KVM_MP_STATE_RUNNABLE) + switch (mp_state->mp_state) { + case KVM_MP_STATE_UNINITIALIZED: + case KVM_MP_STATE_HALTED: + case KVM_MP_STATE_AP_RESET_HOLD: + case KVM_MP_STATE_INIT_RECEIVED: + case KVM_MP_STATE_SIPI_RECEIVED: + if (!lapic_in_kernel(vcpu)) + goto out; + break; + + case KVM_MP_STATE_RUNNABLE: + break; + + default: goto out; + } /* - * KVM_MP_STATE_INIT_RECEIVED means the processor is in - * INIT state; latched init should be reported using - * KVM_SET_VCPU_EVENTS, so reject it here. + * SIPI_RECEIVED is obsolete and no longer used internally; KVM instead + * leaves the vCPU in INIT_RECIEVED (Wait-For-SIPI) and pends the SIPI. + * Translate SIPI_RECEIVED as appropriate for backwards compatibility. */ - if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) && - (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || - mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) - goto out; - if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { - vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED; set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); - } else - vcpu->arch.mp_state = mp_state->mp_state; + } + + kvm_set_mp_state(vcpu, mp_state->mp_state); kvm_make_request(KVM_REQ_EVENT, vcpu); ret = 0; @@ -10243,22 +12232,49 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) { + u64 u_cet, s_cet; + + /* + * Check both User and Supervisor on task switches as inter- + * privilege level task switches are impacted by CET at both + * the current privilege level and the new privilege level, and + * that information is not known at this time. The expectation + * is that the guest won't require emulation of task switches + * while using IBT or Shadow Stacks. + */ + if (__kvm_emulate_msr_read(vcpu, MSR_IA32_U_CET, &u_cet) || + __kvm_emulate_msr_read(vcpu, MSR_IA32_S_CET, &s_cet)) + goto unhandled_task_switch; + + if ((u_cet | s_cet) & (CET_ENDBR_EN | CET_SHSTK_EN)) + goto unhandled_task_switch; + } + init_emulate_ctxt(vcpu); ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); - if (ret) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } + + /* + * Report an error userspace if MMIO is needed, as KVM doesn't support + * MMIO during a task switch (or any other complex operation). + */ + if (ret || vcpu->mmio_needed) + goto unhandled_task_switch; kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); return 1; + +unhandled_task_switch: + vcpu->mmio_needed = false; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; } -EXPORT_SYMBOL_GPL(kvm_task_switch); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_task_switch); static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -10270,7 +12286,7 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) */ if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) return false; - if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3)) + if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3)) return false; } else { /* @@ -10281,22 +12297,20 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return false; } - return kvm_is_valid_cr4(vcpu, sregs->cr4); + return kvm_is_valid_cr4(vcpu, sregs->cr4) && + kvm_is_valid_cr0(vcpu, sregs->cr0); } static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, int *mmu_reset_needed, bool update_pdptrs) { - struct msr_data apic_base_msr; int idx; struct desc_ptr dt; if (!kvm_is_valid_sregs(vcpu, sregs)) return -EINVAL; - apic_base_msr.data = sregs->apic_base; - apic_base_msr.host_initiated = true; - if (kvm_set_apic_base(vcpu, &apic_base_msr)) + if (kvm_apic_set_base(vcpu, sregs->apic_base, true)) return -EINVAL; if (vcpu->arch.guest_state_protected) @@ -10304,32 +12318,32 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, dt.size = sregs->idt.limit; dt.address = sregs->idt.base; - static_call(kvm_x86_set_idt)(vcpu, &dt); + kvm_x86_call(set_idt)(vcpu, &dt); dt.size = sregs->gdt.limit; dt.address = sregs->gdt.base; - static_call(kvm_x86_set_gdt)(vcpu, &dt); + kvm_x86_call(set_gdt)(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + kvm_x86_call(post_set_cr3)(vcpu, sregs->cr3); kvm_set_cr8(vcpu, sregs->cr8); *mmu_reset_needed |= vcpu->arch.efer != sregs->efer; - static_call(kvm_x86_set_efer)(vcpu, sregs->efer); + kvm_x86_call(set_efer)(vcpu, sregs->efer); *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; - static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); - vcpu->arch.cr0 = sregs->cr0; + kvm_x86_call(set_cr0)(vcpu, sregs->cr0); *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; - static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); + kvm_x86_call(set_cr4)(vcpu, sregs->cr4); if (update_pdptrs) { idx = srcu_read_lock(&vcpu->kvm->srcu); if (is_pae_paging(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); + load_pdptrs(vcpu, kvm_read_cr3(vcpu)); *mmu_reset_needed = 1; } srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -10351,7 +12365,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && !is_protmode(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); return 0; } @@ -10365,8 +12379,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) if (ret) return ret; - if (mmu_reset_needed) + if (mmu_reset_needed) { kvm_mmu_reset_context(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + } max_bits = KVM_NR_INTERRUPTS; pending_vec = find_first_bit( @@ -10407,8 +12423,10 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) mmu_reset_needed = 1; vcpu->arch.pdptrs_from_userspace = true; } - if (mmu_reset_needed) + if (mmu_reset_needed) { kvm_mmu_reset_context(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + } return 0; } @@ -10417,12 +12435,37 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, { int ret; + if (vcpu->kvm->arch.has_protected_state && + vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); ret = __set_sregs(vcpu, sregs); vcpu_put(vcpu); return ret; } +static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) +{ + bool set = false; + struct kvm_vcpu *vcpu; + unsigned long i; + + if (!enable_apicv) + return; + + down_write(&kvm->arch.apicv_update_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) { + set = true; + break; + } + } + __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set); + up_write(&kvm->arch.apicv_update_lock); +} + int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { @@ -10436,7 +12479,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; - if (vcpu->arch.exception.pending) + if (kvm_is_exception_pending(vcpu)) goto out; if (dbg->control & KVM_GUESTDBG_INJECT_DB) kvm_queue_exception(vcpu, DB_VECTOR); @@ -10473,7 +12516,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - static_call(kvm_x86_update_exception_bitmap)(vcpu); + kvm_x86_call(update_exception_bitmap)(vcpu); + + kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm); r = 0; @@ -10498,7 +12543,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; - tr->valid = gpa != UNMAPPED_GVA; + tr->valid = gpa != INVALID_GPA; tr->writeable = 1; tr->usermode = 0; @@ -10510,12 +12555,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; - if (!vcpu->arch.guest_fpu) - return 0; + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -10533,12 +12578,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; - if (!vcpu->arch.guest_fpu) - return 0; + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0; vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -10570,62 +12615,45 @@ static void store_regs(struct kvm_vcpu *vcpu) static int sync_regs(struct kvm_vcpu *vcpu) { - if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS) - return -EINVAL; - if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { __set_regs(vcpu, &vcpu->run->s.regs.regs); vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; } + if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) { - if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs)) + struct kvm_sregs sregs = vcpu->run->s.regs.sregs; + + if (__set_sregs(vcpu, &sregs)) return -EINVAL; + vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS; } + if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) { - if (kvm_vcpu_ioctl_x86_set_vcpu_events( - vcpu, &vcpu->run->s.regs.events)) + struct kvm_vcpu_events events = vcpu->run->s.regs.events; + + if (kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events)) return -EINVAL; + vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS; } return 0; } -static void fx_init(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.guest_fpu) - return; - - fpstate_init(&vcpu->arch.guest_fpu->state); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = - host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Ensure guest xcr0 is valid for loading - */ - vcpu->arch.xcr0 = XFEATURE_MASK_FP; - - vcpu->arch.cr0 |= X86_CR0_ET; -} - -void kvm_free_guest_fpu(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.guest_fpu) { - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); - vcpu->arch.guest_fpu = NULL; - } -} -EXPORT_SYMBOL_GPL(kvm_free_guest_fpu); - int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) - pr_warn_once("kvm: SMP vm created on host with unstable TSC; " + if (kvm_check_tsc_unstable() && kvm->created_vcpus) + pr_warn_once("SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); - return 0; + if (!kvm->arch.max_vcpu_ids) + kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS; + + if (id >= kvm->arch.max_vcpu_ids) + return -EINVAL; + + return kvm_x86_call(vcpu_precreate)(kvm); } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) @@ -10634,24 +12662,23 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) int r; vcpu->arch.last_vmentry_cpu = -1; + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; + + kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm); if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + kvm_set_mp_state(vcpu, KVM_MP_STATE_UNINITIALIZED); r = kvm_mmu_create(vcpu); if (r < 0) return r; - if (irqchip_in_kernel(vcpu->kvm)) { - r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); - if (r < 0) - goto fail_mmu_destroy; - if (kvm_apicv_activated(vcpu->kvm)) - vcpu->arch.apicv_active = true; - } else - static_branch_inc(&kvm_has_noapic_vcpu); + r = kvm_create_lapic(vcpu); + if (r < 0) + goto fail_mmu_destroy; r = -ENOMEM; @@ -10660,10 +12687,12 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) goto fail_free_lapic; vcpu->arch.pio_data = page_address(page); - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64), GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mce_banks) - goto fail_free_pio_data; + vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64), + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks) + goto fail_free_mce_banks; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, @@ -10673,27 +12702,18 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (!alloc_emulate_ctxt(vcpu)) goto free_wbinvd_dirty_mask; - vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.user_fpu) { - pr_err("kvm: failed to allocate userspace's fpu\n"); + if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) { + pr_err("failed to allocate vcpu's fpu\n"); goto free_emulate_ctxt; } - vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.guest_fpu) { - pr_err("kvm: failed to allocate vcpu's fpu\n"); - goto free_user_fpu; - } - fx_init(vcpu); - - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); - - vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; - kvm_async_pf_hash_reset(vcpu); + + if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) { + vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); + vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; + } kvm_pmu_init(vcpu); vcpu->arch.pending_external_vector = -1; @@ -10703,31 +12723,28 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.hv_root_tdp = INVALID_PAGE; #endif - r = static_call(kvm_x86_vcpu_create)(vcpu); + r = kvm_x86_call(vcpu_create)(vcpu); if (r) goto free_guest_fpu; - vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); - vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; - kvm_vcpu_mtrr_init(vcpu); + kvm_xen_init_vcpu(vcpu); vcpu_load(vcpu); - kvm_set_tsc_khz(vcpu, max_tsc_khz); + kvm_vcpu_after_set_cpuid(vcpu); + kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz); kvm_vcpu_reset(vcpu, false); kvm_init_mmu(vcpu); vcpu_put(vcpu); return 0; free_guest_fpu: - kvm_free_guest_fpu(vcpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); free_emulate_ctxt: kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_wbinvd_dirty_mask: free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); -fail_free_pio_data: + kfree(vcpu->arch.mci_ctl2_banks); free_page((unsigned long)vcpu->arch.pio_data); fail_free_lapic: kvm_free_lapic(vcpu); @@ -10738,59 +12755,124 @@ fail_mmu_destroy: void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vcpu->kvm; - if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); - kvm_synchronize_tsc(vcpu, 0); + kvm_synchronize_tsc(vcpu, NULL); vcpu_put(vcpu); /* poll control enabled by default */ vcpu->arch.msr_kvm_poll_control = 1; mutex_unlock(&vcpu->mutex); - - if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; - int idx; + int idx, cpu; - kvm_release_pfn(cache->pfn, cache->dirty, cache); + kvm_clear_async_pf_completion_queue(vcpu); + kvm_mmu_unload(vcpu); kvmclock_reset(vcpu); - static_call(kvm_x86_vcpu_free)(vcpu); + for_each_possible_cpu(cpu) + cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL); + + kvm_x86_call(vcpu_free)(vcpu); kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kvm_free_guest_fpu(vcpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); + kvm_xen_destroy_vcpu(vcpu); kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); + kfree(vcpu->arch.mci_ctl2_banks); kvm_free_lapic(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); kvfree(vcpu->arch.cpuid_entries); - if (!lapic_in_kernel(vcpu)) - static_branch_dec(&kvm_has_noapic_vcpu); +} + +static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) +{ + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; + u64 xfeatures_mask; + bool fpu_in_use; + int i; + + /* + * Guest FPU state is zero allocated and so doesn't need to be manually + * cleared on RESET, i.e. during vCPU creation. + */ + if (!init_event || !fpstate) + return; + + /* + * On INIT, only select XSTATE components are zeroed, most components + * are unchanged. Currently, the only components that are zeroed and + * supported by KVM are MPX and CET related. + */ + xfeatures_mask = (kvm_caps.supported_xcr0 | kvm_caps.supported_xss) & + (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR | + XFEATURE_MASK_CET_ALL); + if (!xfeatures_mask) + return; + + BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX); + + /* + * Unload guest FPU state (if necessary) before zeroing XSTATE fields + * as the kernel can only modify the state when its resident in memory, + * i.e. when it's not loaded into hardware. + * + * WARN if the vCPU's desire to run, i.e. whether or not its in KVM_RUN, + * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the + * only path that can trigger INIT emulation _and_ loads FPU state, and + * KVM_RUN should _always_ load FPU state. + */ + WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use); + fpu_in_use = fpstate->in_use; + if (fpu_in_use) + kvm_put_guest_fpu(vcpu); + for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX) + fpstate_clear_xstate_component(fpstate, i); + if (fpu_in_use) + kvm_load_guest_fpu(vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + struct kvm_cpuid_entry2 *cpuid_0x1; unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long new_cr0; + + /* + * Several of the "set" flows, e.g. ->set_cr0(), read other registers + * to handle side effects. RESET emulation hits those flows and relies + * on emulated/virtualized registers, including those that are loaded + * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel + * to detect improper or missing initialization. + */ + WARN_ON_ONCE(!init_event && + (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu))); + + /* + * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's + * possible to INIT the vCPU while L2 is active. Force the vCPU back + * into L1 as EFER.SVME is cleared on INIT (along with all other EFER + * bits), i.e. virtualization is disabled. + */ + if (is_guest_mode(vcpu)) + kvm_leave_nested(vcpu); kvm_lapic_reset(vcpu, init_event); + WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu)); vcpu->arch.hflags = 0; vcpu->arch.smi_pending = 0; @@ -10820,55 +12902,85 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (vcpu->arch.guest_fpu && kvm_mpx_supported()) { - void *mpx_state_buffer; - - /* - * To avoid have the INIT path from kvm_apic_has_events() that be - * called with loaded FPU and does not let userspace fix the state. - */ - if (init_event) - kvm_put_guest_fpu(vcpu); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDREGS); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDCSR); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); - if (init_event) - kvm_load_guest_fpu(vcpu); - } + kvm_xstate_reset(vcpu, init_event); if (!init_event) { - kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; + vcpu->arch.msr_misc_features_enables = 0; + vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; - vcpu->arch.xcr0 = XFEATURE_MASK_FP; + __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); + kvm_msr_write(vcpu, MSR_IA32_XSS, 0); } + /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); - vcpu->arch.regs_avail = ~0; - vcpu->arch.regs_dirty = ~0; + kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP); - vcpu->arch.ia32_xss = 0; + /* + * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) + * if no CPUID match is found. Note, it's impossible to get a match at + * RESET since KVM emulates RESET before exposing the vCPU to userspace, + * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry + * on RESET. But, go through the motions in case that's ever remedied. + */ + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1); + kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); - static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + kvm_x86_call(vcpu_reset)(vcpu, init_event); + + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0xfff0); + + vcpu->arch.cr3 = 0; + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); /* - * Reset the MMU context if paging was enabled prior to INIT (which is - * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the - * standard CR0/CR4/EFER modification paths, only CR0.PG needs to be - * checked because it is unconditionally cleared on INIT and all other - * paging related bits are ignored if paging is disabled, i.e. CR0.WP, - * CR4, and EFER changes are all irrelevant if CR0.PG was '0'. + * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions + * of Intel's SDM list CD/NW as being set on INIT, but they contradict + * (or qualify) that with a footnote stating that CD/NW are preserved. */ - if (old_cr0 & X86_CR0_PG) + new_cr0 = X86_CR0_ET; + if (init_event) + new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD)); + else + new_cr0 |= X86_CR0_NW | X86_CR0_CD; + + kvm_x86_call(set_cr0)(vcpu, new_cr0); + kvm_x86_call(set_cr4)(vcpu, 0); + kvm_x86_call(set_efer)(vcpu, 0); + kvm_x86_call(update_exception_bitmap)(vcpu); + + /* + * On the standard CR0/CR4/EFER modification paths, there are several + * complex conditions determining whether the MMU has to be reset and/or + * which PCIDs have to be flushed. However, CR0.WP and the paging-related + * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush + * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as + * CR0 will be '0' prior to RESET). So we only need to check CR0.PG here. + */ + if (old_cr0 & X86_CR0_PG) { + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); kvm_mmu_reset_context(vcpu); + } + + /* + * Intel's SDM states that all TLB entries are flushed on INIT. AMD's + * APM states the TLBs are untouched by INIT, but it also states that + * the TLBs are flushed on "External initialization of the processor." + * Flush the guest TLB regardless of vendor, there is no meaningful + * benefit in relying on the guest to flush the TLB immediately after + * INIT. A spurious TLB flush is benign and likely negligible from a + * performance perspective. + */ + if (init_event) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_reset); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { @@ -10880,20 +12992,35 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); kvm_rip_write(vcpu, 0); } -EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_deliver_sipi_vector); + +void kvm_arch_enable_virtualization(void) +{ + cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); +} -int kvm_arch_hardware_enable(void) +void kvm_arch_disable_virtualization(void) +{ + cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu); +} + +int kvm_arch_enable_virtualization_cpu(void) { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; int ret; u64 local_tsc; u64 max_tsc = 0; bool stable, backwards_tsc = false; kvm_user_return_msr_cpu_online(); - ret = static_call(kvm_x86_hardware_enable)(); + + ret = kvm_x86_check_processor_compatibility(); + if (ret) + return ret; + + ret = kvm_x86_call(enable_virtualization_cpu)(); if (ret != 0) return ret; @@ -10973,196 +13100,117 @@ int kvm_arch_hardware_enable(void) return 0; } -void kvm_arch_hardware_disable(void) -{ - static_call(kvm_x86_hardware_disable)(); - drop_user_return_notifiers(); -} - -int kvm_arch_hardware_setup(void *opaque) +void kvm_arch_disable_virtualization_cpu(void) { - struct kvm_x86_init_ops *ops = opaque; - int r; - - rdmsrl_safe(MSR_EFER, &host_efer); - if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) && - !(host_efer & EFER_NX))) - return -EIO; + kvm_x86_call(disable_virtualization_cpu)(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - - r = ops->hardware_setup(); - if (r != 0) - return r; - - memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); - kvm_ops_static_call_update(); - - if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) - supported_xss = 0; - -#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) - cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); -#undef __kvm_cpu_cap_has - - if (kvm_has_tsc_control) { - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated because it will always - * be 1 on all machines. - */ - u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); - kvm_max_guest_tsc_khz = max; - - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; - } - - kvm_init_msr_list(); - return 0; -} - -void kvm_arch_hardware_unsetup(void) -{ - static_call(kvm_x86_hardware_unsetup)(); -} - -int kvm_arch_check_processor_compat(void *opaque) -{ - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - struct kvm_x86_init_ops *ops = opaque; - - WARN_ON(!irqs_disabled()); - - if (__cr4_reserved_bits(cpu_has, c) != - __cr4_reserved_bits(cpu_has, &boot_cpu_data)) - return -EIO; - - return ops->check_processor_compatibility(); + /* + * Leave the user-return notifiers as-is when disabling virtualization + * for reboot, i.e. when disabling via IPI function call, and instead + * pin kvm.ko (if it's a module) to defend against use-after-free (in + * the *very* unlikely scenario module unload is racing with reboot). + * On a forced reboot, tasks aren't frozen before shutdown, and so KVM + * could be actively modifying user-return MSR state when the IPI to + * disable virtualization arrives. Handle the extreme edge case here + * instead of trying to account for it in the normal flows. + */ + if (in_task() || WARN_ON_ONCE(!kvm_rebooting)) + drop_user_return_notifiers(); + else + __module_get(THIS_MODULE); } bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; } -EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_reset_bsp); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); -EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); - -void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - vcpu->arch.l1tf_flush_l1d = true; - if (pmu->version && unlikely(pmu->event_count)) { - pmu->need_cleanup = true; - kvm_make_request(KVM_REQ_PMU, vcpu); - } - static_call(kvm_x86_sched_in)(vcpu, cpu); -} - void kvm_arch_free_vm(struct kvm *kvm) { - kfree(to_kvm_hv(kvm)->hv_pa_pg); - vfree(kvm); +#if IS_ENABLED(CONFIG_HYPERV) + kfree(kvm->arch.hv_pa_pg); +#endif + __kvm_arch_free_vm(kvm); } int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { - if (type) + int ret; + unsigned long flags; + + if (!kvm_is_vm_type_supported(type)) return -EINVAL; - INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); - INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); - atomic_set(&kvm->arch.noncoherent_dma_count, 0); + kvm->arch.vm_type = type; + kvm->arch.has_private_mem = + (type == KVM_X86_SW_PROTECTED_VM); + /* Decided by the vendor code for other VM types. */ + kvm->arch.pre_fault_allowed = + type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM; + kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks; - /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ - set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ - set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, - &kvm->arch.irq_sources_bitmap); + ret = kvm_page_track_init(kvm); + if (ret) + goto out; + + ret = kvm_mmu_init_vm(kvm); + if (ret) + goto out_cleanup_page_track; + + ret = kvm_x86_call(vm_init)(kvm); + if (ret) + goto out_uninit_mmu; + + atomic_set(&kvm->arch.noncoherent_dma_count, 0); raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); - spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - + seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); + + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); pvclock_update_vm_gtod_copy(kvm); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz; + kvm->arch.apic_bus_cycle_ns = APIC_BUS_CYCLE_NS_DEFAULT; kvm->arch.guest_can_read_msr_platform_info = true; + kvm->arch.enable_pmu = enable_pmu; #if IS_ENABLED(CONFIG_HYPERV) spin_lock_init(&kvm->arch.hv_root_tdp_lock); kvm->arch.hv_root_tdp = INVALID_PAGE; #endif - INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); - INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); - kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); - kvm_page_track_init(kvm); - kvm_mmu_init_vm(kvm); + kvm_xen_init_vm(kvm); - return static_call(kvm_x86_vm_init)(kvm); -} - -int kvm_arch_post_init_vm(struct kvm *kvm) -{ - return kvm_mmu_post_init_vm(kvm); -} - -static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) -{ - vcpu_load(vcpu); - kvm_mmu_unload(vcpu); - vcpu_put(vcpu); -} - -static void kvm_free_vcpus(struct kvm *kvm) -{ - unsigned int i; - struct kvm_vcpu *vcpu; - - /* - * Unpin any mmu pages first. - */ - kvm_for_each_vcpu(i, vcpu, kvm) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_unload_vcpu_mmu(vcpu); + if (ignore_msrs && !report_ignored_msrs) { + pr_warn_once("Running KVM with ignore_msrs=1 and report_ignored_msrs=0 is not a\n" + "a supported configuration. Lying to the guest about the existence of MSRs\n" + "may cause the guest operating system to hang or produce errors. If a guest\n" + "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n"); } - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vcpu_destroy(vcpu); - mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - - atomic_set(&kvm->online_vcpus, 0); - mutex_unlock(&kvm->lock); -} + once_init(&kvm->arch.nx_once); + return 0; -void kvm_arch_sync_events(struct kvm *kvm) -{ - cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); - cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_pit(kvm); +out_uninit_mmu: + kvm_mmu_uninit_vm(kvm); +out_cleanup_page_track: + kvm_page_track_cleanup(kvm); +out: + return ret; } -#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) - /** * __x86_set_memory_region: Setup KVM internal memory slot * @@ -11193,7 +13241,8 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot; - /* Called with kvm->slots_lock held. */ + lockdep_assert_held(&kvm->slots_lock); + if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) return ERR_PTR_USR(-EINVAL); @@ -11208,7 +13257,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, */ hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0); - if (IS_ERR((void *)hva)) + if (IS_ERR_VALUE(hva)) return (void __user *)hva; } else { if (!slot || !slot->npages) @@ -11218,15 +13267,15 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, hva = slot->userspace_addr; } - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_userspace_memory_region m; + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { + struct kvm_userspace_memory_region2 m; m.slot = id | (i << 16); m.flags = 0; m.guest_phys_addr = gpa; m.userspace_addr = hva; m.memory_size = size; - r = __kvm_set_memory_region(kvm, &m); + r = kvm_set_internal_memslot(kvm, &m); if (r < 0) return ERR_PTR_USR(r); } @@ -11236,11 +13285,22 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, return (void __user *)hva; } -EXPORT_SYMBOL_GPL(__x86_set_memory_region); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__x86_set_memory_region); void kvm_arch_pre_destroy_vm(struct kvm *kvm) { + /* + * Stop all background workers and kthreads before destroying vCPUs, as + * iterating over vCPUs in a different task while vCPUs are being freed + * is unsafe, i.e. will lead to use-after-free. The PIT also needs to + * be stopped before IRQ routing is freed. + */ +#ifdef CONFIG_KVM_IOAPIC + kvm_free_pit(kvm); +#endif + kvm_mmu_pre_destroy_vm(kvm); + static_call_cond(kvm_x86_vm_pre_destroy)(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) @@ -11248,7 +13308,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, - * unless the the memory map has changed due to process exit + * unless the memory map has changed due to process exit * or fd copying. */ mutex_lock(&kvm->slots_lock); @@ -11259,17 +13319,19 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } - static_call_cond(kvm_x86_vm_destroy)(kvm); + kvm_destroy_vcpus(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); +#ifdef CONFIG_KVM_IOAPIC kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); - kvm_free_vcpus(kvm); +#endif kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); kvm_xen_destroy_vm(kvm); kvm_hv_destroy_vm(kvm); + kvm_x86_call(vm_destroy)(kvm); } static void memslot_rmap_free(struct kvm_memory_slot *slot) @@ -11277,7 +13339,7 @@ static void memslot_rmap_free(struct kvm_memory_slot *slot) int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.rmap[i]); + vfree(slot->arch.rmap[i]); slot->arch.rmap[i] = NULL; } } @@ -11289,27 +13351,26 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) memslot_rmap_free(slot); for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.lpage_info[i - 1]); + vfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } kvm_page_track_free_memslot(slot); } -static int memslot_rmap_alloc(struct kvm_memory_slot *slot, - unsigned long npages) +int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages) { const int sz = sizeof(*slot->arch.rmap[0]); int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { int level = i + 1; - int lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + int lpages = __kvm_mmu_slot_lpages(slot, npages, level); - WARN_ON(slot->arch.rmap[i]); + if (slot->arch.rmap[i]) + continue; - slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); + slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); if (!slot->arch.rmap[i]) { memslot_rmap_free(slot); return -ENOMEM; @@ -11319,59 +13380,15 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, return 0; } -int alloc_all_memslots_rmaps(struct kvm *kvm) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - int r, i; - - /* - * Check if memslots alreday have rmaps early before acquiring - * the slots_arch_lock below. - */ - if (kvm_memslots_have_rmaps(kvm)) - return 0; - - mutex_lock(&kvm->slots_arch_lock); - - /* - * Read memslots_have_rmaps again, under the slots arch lock, - * before allocating the rmaps - */ - if (kvm_memslots_have_rmaps(kvm)) { - mutex_unlock(&kvm->slots_arch_lock); - return 0; - } - - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(slot, slots) { - r = memslot_rmap_alloc(slot, slot->npages); - if (r) { - mutex_unlock(&kvm->slots_arch_lock); - return r; - } - } - } - - /* - * Ensure that memslots_have_rmaps becomes true strictly after - * all the rmap pointers are set. - */ - smp_store_release(&kvm->arch.memslots_have_rmaps, true); - mutex_unlock(&kvm->slots_arch_lock); - return 0; -} - static int kvm_alloc_memslot_metadata(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages) + struct kvm_memory_slot *slot) { + unsigned long npages = slot->npages; int i, r; /* * Clear out the previous array pointers for the KVM_MR_MOVE case. The - * old arrays will be freed by __kvm_set_memory_region() if installing + * old arrays will be freed by kvm_set_memory_region() if installing * the new memslot is successful. */ memset(&slot->arch, 0, sizeof(slot->arch)); @@ -11388,10 +13405,9 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, int lpages; int level = i + 1; - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + lpages = __kvm_mmu_slot_lpages(slot, npages, level); - linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); + linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) goto out_free; @@ -11414,7 +13430,11 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, } } - if (kvm_page_track_create_memslot(slot, npages)) +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + kvm_mmu_init_memslot_memory_attributes(kvm, slot); +#endif + + if (kvm_page_track_create_memslot(kvm, slot, npages)) goto out_free; return 0; @@ -11423,7 +13443,7 @@ out_free: memslot_rmap_free(slot); for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.lpage_info[i - 1]); + vfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } return -ENOMEM; @@ -11432,7 +13452,7 @@ out_free: void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; /* * memslots->generation has been incremented. @@ -11446,43 +13466,62 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) } int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem, - enum kvm_mr_change change) + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + enum kvm_mr_change change) { - if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) - return kvm_alloc_memslot_metadata(kvm, memslot, - mem->memory_size >> PAGE_SHIFT); + /* + * KVM doesn't support moving memslots when there are external page + * trackers attached to the VM, i.e. if KVMGT is in use. + */ + if (change == KVM_MR_MOVE && kvm_page_track_has_external_user(kvm)) + return -EINVAL; + + if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) { + if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn()) + return -EINVAL; + + if (kvm_is_gfn_alias(kvm, new->base_gfn + new->npages - 1)) + return -EINVAL; + + return kvm_alloc_memslot_metadata(kvm, new); + } + + if (change == KVM_MR_FLAGS_ONLY) + memcpy(&new->arch, &old->arch, sizeof(old->arch)); + else if (WARN_ON_ONCE(change != KVM_MR_DELETE)) + return -EIO; + return 0; } static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) { - struct kvm_arch *ka = &kvm->arch; + int nr_slots; - if (!kvm_x86_ops.cpu_dirty_log_size) + if (!kvm->arch.cpu_dirty_log_size) return; - if ((enable && ++ka->cpu_dirty_logging_count == 1) || - (!enable && --ka->cpu_dirty_logging_count == 0)) + nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging); + if ((enable && nr_slots == 1) || !nr_slots) kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING); - - WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0); } static void kvm_mmu_slot_apply_flags(struct kvm *kvm, struct kvm_memory_slot *old, - struct kvm_memory_slot *new, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { - bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; + u32 old_flags = old ? old->flags : 0; + u32 new_flags = new ? new->flags : 0; + bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES; /* * Update CPU dirty logging if dirty logging is being toggled. This * applies to all operations. */ - if ((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES) + if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages); /* @@ -11500,7 +13539,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * MOVE/DELETE: The old mappings will already have been cleaned up by * kvm_arch_flush_shadow_memslot(). */ - if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) + if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY)) return; /* @@ -11508,24 +13547,20 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty * logging isn't being toggled on or off. */ - if (WARN_ON_ONCE(!((old->flags ^ new->flags) & KVM_MEM_LOG_DIRTY_PAGES))) + if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES))) return; if (!log_dirty_pages) { /* - * Dirty logging tracks sptes in 4k granularity, meaning that - * large sptes have to be split. If live migration succeeds, - * the guest in the source machine will be destroyed and large - * sptes will be created in the destination. However, if the - * guest continues to run in the source machine (for example if - * live migration fails), small sptes will remain around and - * cause bad performance. + * Recover huge page mappings in the slot now that dirty logging + * is disabled, i.e. now that KVM does not have to track guest + * writes at 4KiB granularity. * - * Scan sptes if dirty logging has been stopped, dropping those - * which can be collapsed into a single large-page spte. Later - * page faults will create the large-page sptes. + * Dirty logging might be disabled by userspace if an ongoing VM + * live migration is cancelled and the VM must continue running + * on the source. */ - kvm_mmu_zap_collapsible_sptes(kvm, new); + kvm_mmu_recover_huge_pages(kvm, new); } else { /* * Initially-all-set does not require write protecting any page, @@ -11534,125 +13569,104 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, if (kvm_dirty_log_manual_protect_and_init_set(kvm)) return; - if (kvm_x86_ops.cpu_dirty_log_size) { + if (READ_ONCE(eager_page_split)) + kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K); + + if (kvm->arch.cpu_dirty_log_size) { kvm_mmu_slot_leaf_clear_dirty(kvm, new); kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); } else { kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K); } + + /* + * Unconditionally flush the TLBs after enabling dirty logging. + * A flush is almost always going to be necessary (see below), + * and unconditionally flushing allows the helpers to omit + * the subtly complex checks when removing write access. + * + * Do the flush outside of mmu_lock to reduce the amount of + * time mmu_lock is held. Flushing after dropping mmu_lock is + * safe as KVM only needs to guarantee the slot is fully + * write-protected before returning to userspace, i.e. before + * userspace can consume the dirty status. + * + * Flushing outside of mmu_lock requires KVM to be careful when + * making decisions based on writable status of an SPTE, e.g. a + * !writable SPTE doesn't guarantee a CPU can't perform writes. + * + * Specifically, KVM also write-protects guest page tables to + * monitor changes when using shadow paging, and must guarantee + * no CPUs can write to those page before mmu_lock is dropped. + * Because CPUs may have stale TLB entries at this point, a + * !writable SPTE doesn't guarantee CPUs can't perform writes. + * + * KVM also allows making SPTES writable outside of mmu_lock, + * e.g. to allow dirty logging without taking mmu_lock. + * + * To handle these scenarios, KVM uses a separate software-only + * bit (MMU-writable) to track if a SPTE is !writable due to + * a guest page table being write-protected (KVM clears the + * MMU-writable flag when write-protecting for shadow paging). + * + * The use of MMU-writable is also the primary motivation for + * the unconditional flush. Because KVM must guarantee that a + * CPU doesn't contain stale, writable TLB entries for a + * !MMU-writable SPTE, KVM must flush if it encounters any + * MMU-writable SPTE regardless of whether the actual hardware + * writable bit was set. I.e. KVM is almost guaranteed to need + * to flush, while unconditionally flushing allows the "remove + * write access" helpers to ignore MMU-writable entirely. + * + * See is_writable_pte() for more details (the case involving + * access-tracked SPTEs is particularly relevant). + */ + kvm_flush_remote_tlbs_memslot(kvm, new); } } void kvm_arch_commit_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { - if (!kvm->arch.n_requested_mmu_pages) - kvm_mmu_change_mmu_pages(kvm, - kvm_mmu_calculate_default_mmu_pages(kvm)); + if (change == KVM_MR_DELETE) + kvm_page_track_delete_slot(kvm, old); - /* - * FIXME: const-ify all uses of struct kvm_memory_slot. - */ - kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change); + if (!kvm->arch.n_requested_mmu_pages && + (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) { + unsigned long nr_mmu_pages; + + nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO; + nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES); + kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + } + + kvm_mmu_slot_apply_flags(kvm, old, new, change); /* Free the arrays associated with the old memslot. */ if (change == KVM_MR_MOVE) kvm_arch_free_memslot(kvm, old); } -void kvm_arch_flush_shadow_all(struct kvm *kvm) -{ - kvm_mmu_zap_all(kvm); -} - -void kvm_arch_flush_shadow_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - kvm_page_track_flush_slot(kvm, slot); -} - -static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) -{ - return (is_guest_mode(vcpu) && - kvm_x86_ops.guest_apic_has_interrupt && - static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); -} - -static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) -{ - if (!list_empty_careful(&vcpu->async_pf.done)) - return true; - - if (kvm_apic_has_events(vcpu)) - return true; - - if (vcpu->arch.pv.pv_unhalted) - return true; - - if (vcpu->arch.exception.pending) - return true; - - if (kvm_test_request(KVM_REQ_NMI, vcpu) || - (vcpu->arch.nmi_pending && - static_call(kvm_x86_nmi_allowed)(vcpu, false))) - return true; - - if (kvm_test_request(KVM_REQ_SMI, vcpu) || - (vcpu->arch.smi_pending && - static_call(kvm_x86_smi_allowed)(vcpu, false))) - return true; - - if (kvm_arch_interrupt_allowed(vcpu) && - (kvm_cpu_has_interrupt(vcpu) || - kvm_guest_apic_has_interrupt(vcpu))) - return true; - - if (kvm_hv_has_stimer_pending(vcpu)) - return true; - - if (is_guest_mode(vcpu) && - kvm_x86_ops.nested_ops->hv_timer_pending && - kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) - return true; - - return false; -} - -int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) +bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); -} + WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)); -bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.apicv_active && static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) + if (vcpu->arch.guest_state_protected) return true; - return false; + return kvm_x86_call(get_cpl)(vcpu) == 0; } -bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) { - if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) - return true; - - if (kvm_test_request(KVM_REQ_NMI, vcpu) || - kvm_test_request(KVM_REQ_SMI, vcpu) || - kvm_test_request(KVM_REQ_EVENT, vcpu)) - return true; + WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)); - return kvm_arch_dy_has_pending_interrupt(vcpu); -} - -bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) -{ if (vcpu->arch.guest_state_protected) - return true; + return 0; - return vcpu->arch.preempted_in_kernel; + return kvm_rip_read(vcpu); } int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) @@ -11662,7 +13676,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { - return static_call(kvm_x86_interrupt_allowed)(vcpu, false); + return kvm_x86_call(interrupt_allowed)(vcpu, false); } unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) @@ -11676,31 +13690,31 @@ unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) + kvm_rip_read(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_get_linear_rip); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_linear_rip); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) { return kvm_get_linear_rip(vcpu) == linear_rip; } -EXPORT_SYMBOL_GPL(kvm_is_linear_rip); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_linear_rip); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags; - rflags = static_call(kvm_x86_get_rflags)(vcpu); + rflags = kvm_x86_call(get_rflags)(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) rflags &= ~X86_EFLAGS_TF; return rflags; } -EXPORT_SYMBOL_GPL(kvm_get_rflags); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_rflags); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; - static_call(kvm_x86_set_rflags)(vcpu, rflags); + kvm_x86_call(set_rflags)(vcpu, rflags); } void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) @@ -11708,26 +13722,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) __kvm_set_rflags(vcpu, rflags); kvm_make_request(KVM_REQ_EVENT, vcpu); } -EXPORT_SYMBOL_GPL(kvm_set_rflags); - -void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) -{ - int r; - - if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) || - work->wakeup_all) - return; - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - return; - - if (!vcpu->arch.mmu->direct_map && - work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu)) - return; - - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); -} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_rflags); static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) { @@ -11826,21 +13821,35 @@ static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu) static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) + + if (!kvm_pv_async_pf_enabled(vcpu)) return false; - if (!kvm_pv_async_pf_enabled(vcpu) || - (vcpu->arch.apf.send_user_only && static_call(kvm_x86_get_cpl)(vcpu) == 0)) + if (!vcpu->arch.apf.send_always && + (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu))) return false; - return true; + if (is_guest_mode(vcpu)) { + /* + * L1 needs to opt into the special #PF vmexits that are + * used to deliver async page faults. + */ + return vcpu->arch.apf.delivery_as_pf_vmexit; + } else { + /* + * Play it safe in case the guest temporarily disables paging. + * The real mode IDT in particular is unlikely to have a #PF + * exception setup. + */ + return is_paging(vcpu); + } } bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) { if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu) || - vcpu->arch.exception.pending)) + kvm_is_exception_pending(vcpu))) return false; if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu)) @@ -11902,18 +13911,22 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, if ((work->wakeup_all || work->notpresent_injected) && kvm_pv_async_pf_enabled(vcpu) && !apf_put_user_ready(vcpu, work->arch.token)) { - vcpu->arch.apf.pageready_pending = true; + WRITE_ONCE(vcpu->arch.apf.pageready_pending, true); kvm_apic_set_irq(vcpu, &irq, NULL); } vcpu->arch.apf.halted = false; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); } void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) { kvm_make_request(KVM_REQ_APF_READY, vcpu); - if (!vcpu->arch.apf.pageready_pending) + + /* Pairs with smp_store_mb() in kvm_set_msr_common(). */ + smp_mb__after_atomic(); + + if (!READ_ONCE(vcpu->arch.apf.pageready_pending)) kvm_vcpu_kick(vcpu); } @@ -11925,107 +13938,69 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu); } -void kvm_arch_start_assignment(struct kvm *kvm) -{ - if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) - static_call_cond(kvm_x86_start_assignment)(kvm); -} -EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); - -void kvm_arch_end_assignment(struct kvm *kvm) -{ - atomic_dec(&kvm->arch.assigned_device_count); -} -EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); - -bool kvm_arch_has_assigned_device(struct kvm *kvm) +static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm) { - return atomic_read(&kvm->arch.assigned_device_count); + /* + * Non-coherent DMA assignment and de-assignment may affect whether or + * not KVM honors guest PAT, and thus may cause changes in EPT SPTEs + * due to toggling the "ignore PAT" bit. Zap all SPTEs when the first + * (or last) non-coherent device is (un)registered to so that new SPTEs + * with the correct "ignore guest PAT" setting are created. + * + * If KVM always honors guest PAT, however, there is nothing to do. + */ + if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT)) + kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL)); } -EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); void kvm_arch_register_noncoherent_dma(struct kvm *kvm) { - atomic_inc(&kvm->arch.noncoherent_dma_count); + if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1) + kvm_noncoherent_dma_assignment_start_or_stop(kvm); } -EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) { - atomic_dec(&kvm->arch.noncoherent_dma_count); + if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count)) + kvm_noncoherent_dma_assignment_start_or_stop(kvm); } -EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) { return atomic_read(&kvm->arch.noncoherent_dma_count); } -EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); - -bool kvm_arch_has_irq_bypass(void) -{ - return true; -} - -int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) -{ - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - int ret; - - irqfd->producer = prod; - kvm_arch_start_assignment(irqfd->kvm); - ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, - prod->irq, irqfd->gsi, 1); - - if (ret) - kvm_arch_end_assignment(irqfd->kvm); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_arch_has_noncoherent_dma); - return ret; -} - -void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, - struct irq_bypass_producer *prod) +bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) { - int ret; - struct kvm_kernel_irqfd *irqfd = - container_of(cons, struct kvm_kernel_irqfd, consumer); - - WARN_ON(irqfd->producer != prod); - irqfd->producer = NULL; - - /* - * When producer of consumer is unregistered, we change back to - * remapped mode, so we can re-use the current implementation - * when the irq is masked/disabled or the consumer side (KVM - * int this case doesn't want to receive the interrupts. - */ - ret = static_call(kvm_x86_update_pi_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); - if (ret) - printk(KERN_INFO "irq bypass consumer (token %p) unregistration" - " fails: %d\n", irqfd->consumer.token, ret); - - kvm_arch_end_assignment(irqfd->kvm); + return (vcpu->arch.msr_kvm_poll_control & 1) == 0; } -int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, - uint32_t guest_irq, bool set) +#ifdef CONFIG_KVM_GUEST_MEMFD +/* + * KVM doesn't yet support initializing guest_memfd memory as shared for VMs + * with private memory (the private vs. shared tracking needs to be moved into + * guest_memfd). + */ +bool kvm_arch_supports_gmem_init_shared(struct kvm *kvm) { - return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set); + return !kvm_arch_has_private_mem(kvm); } -bool kvm_vector_hashing_enabled(void) +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE +int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) { - return vector_hashing; + return kvm_x86_call(gmem_prepare)(kvm, pfn, gfn, max_order); } +#endif -bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE +void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) { - return (vcpu->arch.msr_kvm_poll_control & 1) == 0; + kvm_x86_call(gmem_invalidate)(start, end); } -EXPORT_SYMBOL_GPL(kvm_arch_no_poll); - +#endif +#endif int kvm_spec_ctrl_test_value(u64 value) { @@ -12040,27 +14015,28 @@ int kvm_spec_ctrl_test_value(u64 value) local_irq_save(flags); - if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value)) + if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value)) ret = 1; - else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value)) + else if (wrmsrq_safe(MSR_IA32_SPEC_CTRL, value)) ret = 1; else - wrmsrl(MSR_IA32_SPEC_CTRL, saved_value); + wrmsrq(MSR_IA32_SPEC_CTRL, saved_value); local_irq_restore(flags); return ret; } -EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spec_ctrl_test_value); void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; struct x86_exception fault; - u32 access = error_code & + u64 access = error_code & (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); if (!(error_code & PFERR_PRESENT_MASK) || - vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) { + mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) { /* * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page * tables probably do not match the TLB. Just proceed @@ -12071,10 +14047,11 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c fault.error_code = error_code; fault.nested_page_fault = false; fault.address = gva; + fault.async_page_fault = false; } vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault); } -EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fixup_and_inject_pf_error); /* * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns @@ -12085,6 +14062,9 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e) { if (r == X86EMUL_PROPAGATE_FAULT) { + if (KVM_BUG_ON(!e, vcpu->kvm)) + return -EIO; + kvm_inject_emulated_page_fault(vcpu, e); return 1; } @@ -12096,13 +14076,11 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, * doesn't seem to be a real use-case behind such requests, just return * KVM_EXIT_INTERNAL_ERROR for now. */ - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + kvm_prepare_emulation_failure_exit(vcpu); return 0; } -EXPORT_SYMBOL_GPL(kvm_handle_memory_failure); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_memory_failure); int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) { @@ -12123,12 +14101,16 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return 1; } - pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + pcid_enabled = kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE); switch (type) { case INVPCID_TYPE_INDIV_ADDR: + /* + * LAM doesn't apply to addresses that are inputs to TLB + * invalidation. + */ if ((!pcid_enabled && (operand.pcid != 0)) || - is_noncanonical_address(operand.gla, vcpu)) { + is_noncanonical_invlpg_address(operand.gla, vcpu)) { kvm_inject_gp(vcpu, 0); return 1; } @@ -12158,10 +14140,11 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return kvm_skip_emulated_instruction(vcpu); default: - BUG(); /* We have already checked above that type <= 3 */ + kvm_inject_gp(vcpu, 0); + return 1; } } -EXPORT_SYMBOL_GPL(kvm_handle_invpcid); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_invpcid); static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu) { @@ -12246,7 +14229,7 @@ int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, return 0; } -EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_mmio_write); int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, void *data) @@ -12284,46 +14267,82 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, return 0; } -EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_mmio_read); -static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size) { - memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data, - vcpu->arch.pio.count * vcpu->arch.pio.size); - vcpu->arch.pio.count = 0; + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * size; +} + +static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port); + +static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu) +{ + int size = vcpu->arch.pio.size; + int port = vcpu->arch.pio.port; + vcpu->arch.pio.count = 0; + if (vcpu->arch.sev_pio_count) + return kvm_sev_es_outs(vcpu, size, port); return 1; } static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, - unsigned int port, void *data, unsigned int count) + unsigned int port) { - int ret; + for (;;) { + unsigned int count = + min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); + int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count); - ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port, - data, count); - if (ret) - return ret; + /* memcpy done already by emulator_pio_out. */ + advance_sev_es_emulated_pio(vcpu, count, size); + if (!ret) + break; - vcpu->arch.pio.count = 0; + /* Emulation done by the kernel. */ + if (!vcpu->arch.sev_pio_count) + return 1; + } + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs; return 0; } static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, - unsigned int port, void *data, unsigned int count) + unsigned int port); + +static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) { - int ret; + unsigned count = vcpu->arch.pio.count; + int size = vcpu->arch.pio.size; + int port = vcpu->arch.pio.port; - ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port, - data, count); - if (ret) { - vcpu->arch.pio.count = 0; - } else { - vcpu->arch.guest_ins_data = data; - vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; + complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); + advance_sev_es_emulated_pio(vcpu, count, size); + if (vcpu->arch.sev_pio_count) + return kvm_sev_es_ins(vcpu, size, port); + return 1; +} + +static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port) +{ + for (;;) { + unsigned int count = + min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); + if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count)) + break; + + /* Emulation done by the kernel. */ + advance_sev_es_emulated_pio(vcpu, count, size); + if (!vcpu->arch.sev_pio_count) + return 1; } + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; return 0; } @@ -12331,19 +14350,22 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, unsigned int port, void *data, unsigned int count, int in) { - return in ? kvm_sev_es_ins(vcpu, size, port, data, count) - : kvm_sev_es_outs(vcpu, size, port, data, count); + vcpu->arch.sev_pio_data = data; + vcpu->arch.sev_pio_count = count; + return in ? kvm_sev_es_ins(vcpu, size, port) + : kvm_sev_es_outs(vcpu, size, port); } -EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_string_io); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); @@ -12354,12 +14376,30 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault); + +static int __init kvm_x86_init(void) +{ + kvm_init_xstate_sizes(); + + kvm_mmu_x86_module_init(); + mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible(); + return 0; +} +module_init(kvm_x86_init); + +static void __exit kvm_x86_exit(void) +{ + WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu)); +} +module_exit(kvm_x86_exit); |
