Diffstat (limited to 'arch/x86')
38 files changed, 1253 insertions, 656 deletions
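The recurring mechanical change in the core-kernel hunks below is the removal of the HAVE_KVM Kconfig symbol: guards that read "#ifdef CONFIG_HAVE_KVM" become "#if IS_ENABLED(CONFIG_KVM)", which evaluates to 1 for both CONFIG_KVM=y and CONFIG_KVM=m, so the posted-interrupt handlers and statistics stay available when KVM is built as a module. A minimal sketch of the pattern (the declaration is lifted from the irq.h hunk; the stub branch is illustrative and not part of the patch):

/* True when KVM is built-in (=y) or modular (=m). */
#if IS_ENABLED(CONFIG_KVM)
extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
#else
/* Hypothetical fallback, for illustration only. */
static inline void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)) { }
#endif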
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dd79e15a3c93..7aed87cbf386 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -245,7 +245,6 @@ config X86 select HAVE_FUNCTION_ERROR_INJECTION select HAVE_KRETPROBES select HAVE_RETHOOK - select HAVE_KVM select HAVE_LIVEPATCH if X86_64 select HAVE_MIXED_BREAKPOINTS_REGS select HAVE_MOD_ARCH_SPECIFIC diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 66837b8c67f1..fbc7722b87d1 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -15,7 +15,7 @@ typedef struct { unsigned int irq_spurious_count; unsigned int icr_read_retry_count; #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; unsigned int kvm_posted_intr_nested_ipis; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 47d4c04d103d..749c7411d2f1 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -741,7 +741,7 @@ DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); # endif #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi); diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 836c170d3087..194dfff84cb1 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -29,7 +29,7 @@ struct irq_desc; extern void fixup_irqs(void); -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void)); #endif diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 3a19904c2db6..d18bfb238f66 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -84,11 +84,9 @@ #define HYPERVISOR_CALLBACK_VECTOR 0xf3 /* Vector for KVM to deliver posted interrupt IPI */ -#ifdef CONFIG_HAVE_KVM #define POSTED_INTR_VECTOR 0xf2 #define POSTED_INTR_WAKEUP_VECTOR 0xf1 #define POSTED_INTR_NESTED_VECTOR 0xf0 -#endif #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index ab24ce207988..110d7f29ca9a 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -103,7 +103,6 @@ KVM_X86_OP(write_tsc_multiplier) KVM_X86_OP(get_exit_info) KVM_X86_OP(check_intercept) KVM_X86_OP(handle_exit_irqoff) -KVM_X86_OP(request_immediate_exit) KVM_X86_OP(sched_in) KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging) KVM_X86_OP_OPTIONAL(vcpu_blocking) diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index 058bc636356a..f852b13aeefe 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -12,11 +12,9 @@ BUILD_BUG_ON(1) * a NULL definition, for example if "static_call_cond()" will be used * at the call sites. 
*/ -KVM_X86_PMU_OP(hw_event_available) -KVM_X86_PMU_OP(pmc_idx_to_pmc) KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) KVM_X86_PMU_OP(msr_idx_to_pmc) -KVM_X86_PMU_OP(is_valid_rdpmc_ecx) +KVM_X86_PMU_OP_OPTIONAL(check_rdpmc_early) KVM_X86_PMU_OP(is_valid_msr) KVM_X86_PMU_OP(get_msr) KVM_X86_PMU_OP(set_msr) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 18cbde14cf81..16e07a2eee19 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -536,6 +536,7 @@ struct kvm_pmc { #define KVM_PMC_MAX_FIXED 3 #define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1) #define KVM_AMD_PMC_MAX_GENERIC 6 + struct kvm_pmu { u8 version; unsigned nr_arch_gp_counters; @@ -1468,6 +1469,15 @@ struct kvm_arch { */ bool shadow_root_allocated; +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING + /* + * If set, the VM has (or had) an external write tracking user, and + * thus all write tracking metadata has been allocated, even if KVM + * itself isn't using write tracking. + */ + bool external_write_tracking_enabled; +#endif + #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; @@ -1665,7 +1675,8 @@ struct kvm_x86_ops { void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); - enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, + bool force_immediate_exit); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@ -1733,8 +1744,6 @@ struct kvm_x86_ops { struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); - void (*request_immediate_exit)(struct kvm_vcpu *vcpu); - void (*sched_in)(struct kvm_vcpu *vcpu, int cpu); /* @@ -1882,8 +1891,16 @@ static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, } #endif /* CONFIG_HYPERV */ +enum kvm_intr_type { + /* Values are arbitrary, but must be non-zero. */ + KVM_HANDLING_IRQ = 1, + KVM_HANDLING_NMI, +}; + +/* Enable perf NMI and timer modes to work, and minimise false positives. 
*/ #define kvm_arch_pmi_in_guest(vcpu) \ - ((vcpu) && (vcpu)->arch.handling_intr_from_guest) + ((vcpu) && (vcpu)->arch.handling_intr_from_guest && \ + (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI))) void __init kvm_mmu_x86_module_init(void); int kvm_mmu_vendor_module_init(void); @@ -2048,7 +2065,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); -void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); +unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); @@ -2241,7 +2258,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); -void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 87a7b917d30e..728c98175b9c 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -358,10 +358,10 @@ struct sev_es_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u64 vmpl0_ssp; - u64 vmpl1_ssp; - u64 vmpl2_ssp; - u64 vmpl3_ssp; + u64 pl0_ssp; + u64 pl1_ssp; + u64 pl2_ssp; + u64 pl3_ssp; u64 u_cet; u8 reserved_0xc8[2]; u8 vmpl; diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h index c6a7eed03914..266daf5b5b84 100644 --- a/arch/x86/include/asm/vmxfeatures.h +++ b/arch/x86/include/asm/vmxfeatures.h @@ -25,6 +25,7 @@ #define VMX_FEATURE_EPT_EXECUTE_ONLY ( 0*32+ 17) /* "ept_x_only" EPT entries can be execute only */ #define VMX_FEATURE_EPT_AD ( 0*32+ 18) /* EPT Accessed/Dirty bits */ #define VMX_FEATURE_EPT_1GB ( 0*32+ 19) /* 1GB EPT pages */ +#define VMX_FEATURE_EPT_5LEVEL ( 0*32+ 20) /* 5-level EPT paging */ /* Aggregated APIC features 24-27 */ #define VMX_FEATURE_FLEXPRIORITY ( 0*32+ 24) /* TPR shadow + virt APIC */ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index a448d0964fc0..ad29984d5e39 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -7,6 +7,8 @@ * */ +#include <linux/const.h> +#include <linux/bits.h> #include <linux/types.h> #include <linux/ioctl.h> #include <linux/stddef.h> @@ -40,7 +42,6 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI -#define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX #define __KVM_HAVE_MCE #define __KVM_HAVE_PIT_STATE2 @@ -49,7 +50,6 @@ #define __KVM_HAVE_DEBUGREGS #define __KVM_HAVE_XSAVE #define __KVM_HAVE_XCRS -#define __KVM_HAVE_READONLY_MEM /* Architectural interrupt line count. 
*/ #define KVM_NR_INTERRUPTS 256 @@ -526,9 +526,278 @@ struct kvm_pmu_event_filter { #define KVM_PMU_EVENT_ALLOW 0 #define KVM_PMU_EVENT_DENY 1 -#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0) +#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS _BITUL(0) #define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS) +/* for KVM_CAP_MCE */ +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; + +/* for KVM_CAP_XEN_HVM */ +#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) +#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) +#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) +#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) +#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) +#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) +#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) +#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) + +struct kvm_xen_hvm_config { + __u32 flags; + __u32 msr; + __u64 blob_addr_32; + __u64 blob_addr_64; + __u8 blob_size_32; + __u8 blob_size_64; + __u8 pad2[30]; +}; + +struct kvm_xen_hvm_attr { + __u16 type; + __u16 pad[3]; + union { + __u8 long_mode; + __u8 vector; + __u8 runstate_update_flag; + union { + __u64 gfn; +#define KVM_XEN_INVALID_GFN ((__u64)-1) + __u64 hva; + } shared_info; + struct { + __u32 send_port; + __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */ + __u32 flags; +#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0) +#define KVM_XEN_EVTCHN_UPDATE (1 << 1) +#define KVM_XEN_EVTCHN_RESET (1 << 2) + /* + * Events sent by the guest are either looped back to + * the guest itself (potentially on a different port#) + * or signalled via an eventfd. + */ + union { + struct { + __u32 port; + __u32 vcpu; + __u32 priority; + } port; + struct { + __u32 port; /* Zero for eventfd */ + __s32 fd; + } eventfd; + __u32 padding[4]; + } deliver; + } evtchn; + __u32 xen_version; + __u64 pad[8]; + } u; +}; + + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0 +#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1 +#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3 +#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */ +#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6 + +struct kvm_xen_vcpu_attr { + __u16 type; + __u16 pad[3]; + union { + __u64 gpa; +#define KVM_XEN_INVALID_GPA ((__u64)-1) + __u64 hva; + __u64 pad[8]; + struct { + __u64 state; + __u64 state_entry_time; + __u64 time_running; + __u64 time_runnable; + __u64 time_blocked; + __u64 time_offline; + } runstate; + __u32 vcpu_id; + struct { + __u32 port; + __u32 priority; + __u64 expires_ns; + } timer; + __u8 vector; + } u; +}; + +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6 +#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 
0x7 +#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8 +/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */ +#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9 + +/* Secure Encrypted Virtualization command */ +enum sev_cmd_id { + /* Guest initialization commands */ + KVM_SEV_INIT = 0, + KVM_SEV_ES_INIT, + /* Guest launch commands */ + KVM_SEV_LAUNCH_START, + KVM_SEV_LAUNCH_UPDATE_DATA, + KVM_SEV_LAUNCH_UPDATE_VMSA, + KVM_SEV_LAUNCH_SECRET, + KVM_SEV_LAUNCH_MEASURE, + KVM_SEV_LAUNCH_FINISH, + /* Guest migration commands (outgoing) */ + KVM_SEV_SEND_START, + KVM_SEV_SEND_UPDATE_DATA, + KVM_SEV_SEND_UPDATE_VMSA, + KVM_SEV_SEND_FINISH, + /* Guest migration commands (incoming) */ + KVM_SEV_RECEIVE_START, + KVM_SEV_RECEIVE_UPDATE_DATA, + KVM_SEV_RECEIVE_UPDATE_VMSA, + KVM_SEV_RECEIVE_FINISH, + /* Guest status and debug commands */ + KVM_SEV_GUEST_STATUS, + KVM_SEV_DBG_DECRYPT, + KVM_SEV_DBG_ENCRYPT, + /* Guest certificates commands */ + KVM_SEV_CERT_EXPORT, + /* Attestation report */ + KVM_SEV_GET_ATTESTATION_REPORT, + /* Guest Migration Extension */ + KVM_SEV_SEND_CANCEL, + + KVM_SEV_NR_MAX, +}; + +struct kvm_sev_cmd { + __u32 id; + __u64 data; + __u32 error; + __u32 sev_fd; +}; + +struct kvm_sev_launch_start { + __u32 handle; + __u32 policy; + __u64 dh_uaddr; + __u32 dh_len; + __u64 session_uaddr; + __u32 session_len; +}; + +struct kvm_sev_launch_update_data { + __u64 uaddr; + __u32 len; +}; + + +struct kvm_sev_launch_secret { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_uaddr; + __u32 guest_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_sev_launch_measure { + __u64 uaddr; + __u32 len; +}; + +struct kvm_sev_guest_status { + __u32 handle; + __u32 policy; + __u32 state; +}; + +struct kvm_sev_dbg { + __u64 src_uaddr; + __u64 dst_uaddr; + __u32 len; +}; + +struct kvm_sev_attestation_report { + __u8 mnonce[16]; + __u64 uaddr; + __u32 len; +}; + +struct kvm_sev_send_start { + __u32 policy; + __u64 pdh_cert_uaddr; + __u32 pdh_cert_len; + __u64 plat_certs_uaddr; + __u32 plat_certs_len; + __u64 amd_certs_uaddr; + __u32 amd_certs_len; + __u64 session_uaddr; + __u32 session_len; +}; + +struct kvm_sev_send_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_uaddr; + __u32 guest_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_sev_receive_start { + __u32 handle; + __u32 policy; + __u64 pdh_uaddr; + __u32 pdh_len; + __u64 session_uaddr; + __u32 session_len; +}; + +struct kvm_sev_receive_update_data { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_uaddr; + __u32 guest_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) +#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) + +struct kvm_hyperv_eventfd { + __u32 conn_id; + __s32 fd; + __u32 flags; + __u32 padding[3]; +}; + +#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff +#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0) + /* * Masked event layout. 
* Bits Description @@ -549,10 +818,10 @@ struct kvm_pmu_event_filter { ((__u64)(!!(exclude)) << 55)) #define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \ - (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32)) -#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56)) -#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8)) -#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55)) + (__GENMASK_ULL(7, 0) | __GENMASK_ULL(35, 32)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (__GENMASK_ULL(63, 56)) +#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (__GENMASK_ULL(15, 8)) +#define KVM_PMU_MASKED_ENTRY_EXCLUDE (_BITULL(55)) #define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56) /* for KVM_{GET,SET,HAS}_DEVICE_ATTR */ @@ -560,7 +829,7 @@ struct kvm_pmu_event_filter { #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ /* x86-specific KVM_EXIT_HYPERCALL flags. */ -#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) +#define KVM_EXIT_HYPERCALL_LONG_MODE _BITULL(0) #define KVM_X86_DEFAULT_VM 0 #define KVM_X86_SW_PROTECTED_VM 1 diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 6e64b27b2c1e..6bc3456a8ebf 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -92,7 +92,7 @@ struct kvm_clock_pairing { #define KVM_ASYNC_PF_DELIVERY_AS_INT (1 << 3) /* MSR_KVM_ASYNC_PF_INT */ -#define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0) +#define KVM_ASYNC_PF_VEC_MASK __GENMASK(7, 0) /* MSR_KVM_MIGRATION_CONTROL */ #define KVM_MIGRATION_READY (1 << 0) diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 03851240c3e3..1640ae76548f 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -72,6 +72,8 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c) c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD); if (ept & VMX_EPT_1GB_PAGE_BIT) c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB); + if (ept & VMX_EPT_PAGE_WALK_5_BIT) + c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_5LEVEL); /* Synthetic APIC features that are aggregates of multiple features. 
*/ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) && diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 0cd53fa8c65d..fc37c8d83daf 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -153,7 +153,7 @@ static const __initconst struct idt_data apic_idts[] = { #ifdef CONFIG_X86_LOCAL_APIC INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt), INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi), -# ifdef CONFIG_HAVE_KVM +# if IS_ENABLED(CONFIG_KVM) INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi), INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 11761c124545..35fde0107901 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -164,7 +164,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) seq_printf(p, "%*s: ", prec, "PIN"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis); @@ -290,7 +290,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) } #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) static void dummy_handler(void) {} static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 65ed14b6540b..8c3032a96caf 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -7,7 +7,6 @@ source "virt/kvm/Kconfig" menuconfig VIRTUALIZATION bool "Virtualization" - depends on HAVE_KVM || X86 default y help Say Y here to get to see options for using your Linux host to run other @@ -20,7 +19,6 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" - depends on HAVE_KVM depends on HIGH_RES_TIMERS depends on X86_LOCAL_APIC select KVM_COMMON @@ -29,9 +27,9 @@ config KVM select HAVE_KVM_PFNCACHE select HAVE_KVM_DIRTY_RING_TSO select HAVE_KVM_DIRTY_RING_ACQ_REL - select IRQ_BYPASS_MANAGER select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_READONLY_MEM select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 95ea1a1f7403..999227fc7c66 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -189,9 +189,8 @@ static const struct file_operations mmu_rmaps_stat_fops = { .release = kvm_mmu_rmaps_stat_release, }; -int kvm_arch_create_vm_debugfs(struct kvm *kvm) +void kvm_arch_create_vm_debugfs(struct kvm *kvm) { debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm, &mmu_rmaps_stat_fops); - return 0; } diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e223043ef5b2..5d4c86133453 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1820,22 +1820,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) return X86EMUL_CONTINUE; } -static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) +static int emulate_push(struct x86_emulate_ctxt *ctxt, const void *data, int len) { struct segmented_address addr; - rsp_increment(ctxt, -bytes); + rsp_increment(ctxt, -len); addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; - return segmented_write(ctxt, addr, data, bytes); + return segmented_write(ctxt, addr, data, len); } static int em_push(struct x86_emulate_ctxt *ctxt) { 
/* Disable writeback. */ ctxt->dst.type = OP_NONE; - return push(ctxt, &ctxt->src.val, ctxt->op_bytes); + return emulate_push(ctxt, &ctxt->src.val, ctxt->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, @@ -1863,7 +1863,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, void *dest, int len) { int rc; - unsigned long val, change_mask; + unsigned long val = 0; + unsigned long change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT; int cpl = ctxt->ops->cpl(ctxt); @@ -1920,7 +1921,7 @@ static int em_enter(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; rbp = reg_read(ctxt, VCPU_REGS_RBP); - rc = push(ctxt, &rbp, stack_size(ctxt)); + rc = emulate_push(ctxt, &rbp, stack_size(ctxt)); if (rc != X86EMUL_CONTINUE) return rc; assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP), @@ -1954,7 +1955,7 @@ static int em_push_sreg(struct x86_emulate_ctxt *ctxt) static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; - unsigned long selector; + unsigned long selector = 0; int rc; rc = emulate_pop(ctxt, &selector, 2); @@ -2000,7 +2001,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) { int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RDI; - u32 val; + u32 val = 0; while (reg >= VCPU_REGS_RAX) { if (reg == VCPU_REGS_RSP) { @@ -2229,7 +2230,7 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) static int em_ret(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long eip; + unsigned long eip = 0; rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2241,7 +2242,8 @@ static int em_ret(struct x86_emulate_ctxt *ctxt) static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long eip, cs; + unsigned long eip = 0; + unsigned long cs = 0; int cpl = ctxt->ops->cpl(ctxt); struct desc_struct new_desc; @@ -3011,7 +3013,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ret = em_push(ctxt); } - ops->get_dr(ctxt, 7, &dr7); + dr7 = ops->get_dr(ctxt, 7); ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN)); return ret; @@ -3184,7 +3186,7 @@ fail: static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long eip; + unsigned long eip = 0; rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -3866,15 +3868,6 @@ static int check_cr_access(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) -{ - unsigned long dr7; - - ctxt->ops->get_dr(ctxt, 7, &dr7); - - return dr7 & DR7_GD; -} - static int check_dr_read(struct x86_emulate_ctxt *ctxt) { int dr = ctxt->modrm_reg; @@ -3887,10 +3880,10 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt) if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) return emulate_ud(ctxt); - if (check_dr7_gd(ctxt)) { + if (ctxt->ops->get_dr(ctxt, 7) & DR7_GD) { ulong dr6; - ctxt->ops->get_dr(ctxt, 6, &dr6); + dr6 = ctxt->ops->get_dr(ctxt, 6); dr6 &= ~DR_TRAP_BITS; dr6 |= DR6_BD | DR6_ACTIVE_LOW; ctxt->ops->set_dr(ctxt, 6, dr6); @@ -3962,7 +3955,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) * protected mode. 
*/ if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || - ctxt->ops->check_pmc(ctxt, rcx)) + ctxt->ops->check_rdpmc_early(ctxt, rcx)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; @@ -4505,11 +4498,11 @@ static const struct instr_dual instr_dual_0f_38_f1 = { }; static const struct gprefix three_byte_0f_38_f0 = { - ID(0, &instr_dual_0f_38_f0), N, N, N + ID(0, &instr_dual_0f_38_f0), ID(0, &instr_dual_0f_38_f0), N, N }; static const struct gprefix three_byte_0f_38_f1 = { - ID(0, &instr_dual_0f_38_f1), N, N, N + ID(0, &instr_dual_0f_38_f1), ID(0, &instr_dual_0f_38_f1), N, N }; /* @@ -5449,7 +5442,7 @@ twobyte_insn: ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg); break; case 0x21: /* mov from dr to reg */ - ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); + ctxt->dst.val = ops->get_dr(ctxt, ctxt->modrm_reg); break; case 0x40 ... 0x4f: /* cmov */ if (test_cc(ctxt->b, ctxt->eflags)) diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index e6d149825169..5382646162a3 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -203,12 +203,12 @@ struct x86_emulate_ops { ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); - void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); + ulong (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); - int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); + int (*check_rdpmc_early)(struct x86_emulate_ctxt *ctxt, u32 pmc); int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata); void (*halt)(struct x86_emulate_ctxt *ctxt); void (*wbinvd)(struct x86_emulate_ctxt *ctxt); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1edf93ee3395..cf37586f0466 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -41,6 +41,7 @@ #include "ioapic.h" #include "trace.h" #include "x86.h" +#include "xen.h" #include "cpuid.h" #include "hyperv.h" #include "smm.h" @@ -124,6 +125,9 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap) return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); +EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); + __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ); __read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ); @@ -499,8 +503,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) } /* Check if there are APF page ready requests pending */ - if (enabled) + if (enabled) { kvm_make_request(KVM_REQ_APF_READY, apic->vcpu); + kvm_xen_sw_enable_lapic(apic->vcpu); + } } static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) @@ -2466,8 +2472,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!vcpu->arch.apic) + if (!vcpu->arch.apic) { + static_branch_dec(&kvm_has_noapic_vcpu); return; + } hrtimer_cancel(&apic->lapic_timer.timer); @@ -2809,6 +2817,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) ASSERT(vcpu != NULL); + if (!irqchip_in_kernel(vcpu->kvm)) { + static_branch_inc(&kvm_has_noapic_vcpu); + return 0; + } + apic = 
kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT); if (!apic) goto nomem; @@ -2847,6 +2860,21 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); + /* + * Defer evaluating inhibits until the vCPU is first run, as this vCPU + * will not get notified of any changes until this vCPU is visible to + * other vCPUs (marked online and added to the set of vCPUs). + * + * Opportunistically mark APICv active as VMX in particularly is highly + * unlikely to have inhibits. Ignore the current per-VM APICv state so + * that vCPU creation is guaranteed to run with a deterministic value, + * the request will ensure the vCPU gets the correct state before VM-Entry. + */ + if (enable_apicv) { + apic->apicv_active = true; + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + } + return 0; nomem_free_apic: kfree(apic); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7c9fce512625..992e651540e8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3575,10 +3575,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (WARN_ON_ONCE(!sp)) return; - if (is_tdp_mmu_page(sp)) + if (is_tdp_mmu_page(sp)) { + lockdep_assert_held_read(&kvm->mmu_lock); kvm_tdp_mmu_put_root(kvm, sp); - else if (!--sp->root_count && sp->role.invalid) - kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + } else { + lockdep_assert_held_write(&kvm->mmu_lock); + if (!--sp->root_count && sp->role.invalid) + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + } *root_hpa = INVALID_PAGE; } @@ -3587,6 +3591,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free) { + bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct; int i; LIST_HEAD(invalid_list); bool free_active_root; @@ -3609,7 +3614,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, return; } - write_lock(&kvm->mmu_lock); + if (is_tdp_mmu) + read_lock(&kvm->mmu_lock); + else + write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) @@ -3635,8 +3643,13 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, mmu->root.pgd = 0; } - kvm_mmu_commit_zap_page(kvm, &invalid_list); - write_unlock(&kvm->mmu_lock); + if (is_tdp_mmu) { + read_unlock(&kvm->mmu_lock); + WARN_ON_ONCE(!list_empty(&invalid_list)); + } else { + kvm_mmu_commit_zap_page(kvm, &invalid_list); + write_unlock(&kvm->mmu_lock); + } } EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); @@ -3693,15 +3706,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) unsigned i; int r; + if (tdp_mmu_enabled) + return kvm_tdp_mmu_alloc_root(vcpu); + write_lock(&vcpu->kvm->mmu_lock); r = make_mmu_pages_available(vcpu); if (r < 0) goto out_unlock; - if (tdp_mmu_enabled) { - root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); - mmu->root.hpa = root; - } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { + if (shadow_root_level >= PT64_ROOT_4LEVEL) { root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level); mmu->root.hpa = root; } else if (shadow_root_level == PT32E_ROOT_LEVEL) { @@ -7039,9 +7052,7 @@ int kvm_mmu_vendor_module_init(void) kvm_mmu_reset_all_pte_masks(); - pte_list_desc_cache = kmem_cache_create("pte_list_desc", - sizeof(struct pte_list_desc), - 0, SLAB_ACCOUNT, NULL); + pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT); if (!pte_list_desc_cache) goto out; diff --git 
a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index c87da11f3a04..f6448284c18e 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -20,10 +20,23 @@ #include "mmu_internal.h" #include "page_track.h" +static bool kvm_external_write_tracking_enabled(struct kvm *kvm) +{ +#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING + /* + * Read external_write_tracking_enabled before related pointers. Pairs + * with the smp_store_release in kvm_page_track_write_tracking_enable(). + */ + return smp_load_acquire(&kvm->arch.external_write_tracking_enabled); +#else + return false; +#endif +} + bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) { - return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) || - !tdp_enabled || kvm_shadow_root_allocated(kvm); + return kvm_external_write_tracking_enabled(kvm) || + kvm_shadow_root_allocated(kvm) || !tdp_enabled; } void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) @@ -153,6 +166,50 @@ int kvm_page_track_init(struct kvm *kvm) return init_srcu_struct(&head->track_srcu); } +static int kvm_enable_external_write_tracking(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; + int r = 0, i, bkt; + + mutex_lock(&kvm->slots_arch_lock); + + /* + * Check for *any* write tracking user (not just external users) under + * lock. This avoids unnecessary work, e.g. if KVM itself is using + * write tracking, or if two external users raced when registering. + */ + if (kvm_page_track_write_tracking_enabled(kvm)) + goto out_success; + + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(slot, bkt, slots) { + /* + * Intentionally do NOT free allocations on failure to + * avoid having to track which allocations were made + * now versus when the memslot was created. The + * metadata is guaranteed to be freed when the slot is + * freed, and will be kept/used if userspace retries + * the failed ioctl() instead of killing the VM. + */ + r = kvm_page_track_write_tracking_alloc(slot); + if (r) + goto out_unlock; + } + } + +out_success: + /* + * Ensure that external_write_tracking_enabled becomes true strictly + * after all the related pointers are set. + */ + smp_store_release(&kvm->arch.external_write_tracking_enabled, true); +out_unlock: + mutex_unlock(&kvm->slots_arch_lock); + return r; +} + /* * register the notifier so that event interception for the tracked guest * pages can be received. @@ -161,10 +218,17 @@ int kvm_page_track_register_notifier(struct kvm *kvm, struct kvm_page_track_notifier_node *n) { struct kvm_page_track_notifier_head *head; + int r; if (!kvm || kvm->mm != current->mm) return -ESRCH; + if (!kvm_external_write_tracking_enabled(kvm)) { + r = kvm_enable_external_write_tracking(kvm); + if (r) + return r; + } + kvm_get_kvm(kvm); head = &kvm->arch.track_notifier_head; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 6ae19b4ee5b1..d078157e62aa 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -149,11 +149,11 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * If shared is set, this function is operating under the MMU lock in read * mode. 
*/ -#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\ - for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \ - ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ - _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \ - if (kvm_mmu_page_as_id(_root) != _as_id) { \ +#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \ + for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \ + ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ + _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \ + if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \ } else #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ @@ -171,12 +171,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, * Holding mmu_lock for write obviates the need for RCU protection as the list * is guaranteed to be stable. */ -#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ - list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ - if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ - kvm_mmu_page_as_id(_root) != _as_id) { \ +#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) \ + list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ + if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ + ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ + ((_only_valid) && (_root)->role.invalid))) { \ } else +#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ + __for_each_tdp_mmu_root(_kvm, _root, _as_id, false) + +#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \ + __for_each_tdp_mmu_root(_kvm, _root, _as_id, true) + static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; @@ -216,22 +223,41 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); } -hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) +int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu) { - union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; + struct kvm_mmu *mmu = vcpu->arch.mmu; + union kvm_mmu_page_role role = mmu->root_role; + int as_id = kvm_mmu_role_as_id(role); struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page *root; - lockdep_assert_held_write(&kvm->mmu_lock); + /* + * Check for an existing root before acquiring the pages lock to avoid + * unnecessary serialization if multiple vCPUs are loading a new root. + * E.g. when bringing up secondary vCPUs, KVM will already have created + * a valid root on behalf of the primary vCPU. + */ + read_lock(&kvm->mmu_lock); + + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) { + if (root->role.word == role.word) + goto out_read_unlock; + } + + spin_lock(&kvm->arch.tdp_mmu_pages_lock); /* - * Check for an existing root before allocating a new one. Note, the - * role check prevents consuming an invalid root. + * Recheck for an existing root after acquiring the pages lock, another + * vCPU may have raced ahead and created a new usable root. Manually + * walk the list of roots as the standard macros assume that the pages + * lock is *not* held. WARN if grabbing a reference to a usable root + * fails, as the last reference to a root can only be put *after* the + * root has been invalidated, which requires holding mmu_lock for write. 
*/ - for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { + list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { if (root->role.word == role.word && - kvm_tdp_mmu_get_root(root)) - goto out; + !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) + goto out_spin_unlock; } root = tdp_mmu_alloc_sp(vcpu); @@ -245,13 +271,20 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots(). */ refcount_set(&root->tdp_mmu_root_count, 2); - - spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); -out: - return __pa(root->spt); +out_spin_unlock: + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); +out_read_unlock: + read_unlock(&kvm->mmu_lock); + /* + * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest + * and actually consuming the root if it's invalidated after dropping + * mmu_lock, and the root can't be freed as this vCPU holds a reference. + */ + mmu->root.hpa = __pa(root->spt); + mmu->root.pgd = 0; + return 0; } static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, @@ -734,15 +767,26 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); /* - * To avoid RCU stalls due to recursively removing huge swaths of SPs, - * split the zap into two passes. On the first pass, zap at the 1gb - * level, and then zap top-level SPs on the second pass. "1gb" is not - * arbitrary, as KVM must be able to zap a 1gb shadow page without - * inducing a stall to allow in-place replacement with a 1gb hugepage. + * Zap roots in multiple passes of decreasing granularity, i.e. zap at + * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all + * preempt models) or mmu_lock contention (full or real-time models). + * Zapping at finer granularity marginally increases the total time of + * the zap, but in most cases the zap itself isn't latency sensitive. * - * Because zapping a SP recurses on its children, stepping down to - * PG_LEVEL_4K in the iterator itself is unnecessary. + * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps + * in order to mimic the page fault path, which can replace a 1GiB page + * table with an equivalent 1GiB hugepage, i.e. can get saddled with + * zapping a 1GiB region that's fully populated with 4KiB SPTEs. This + * allows verifying that KVM can safely zap 1GiB regions, e.g. without + * inducing RCU stalls, without relying on a relatively rare event + * (zapping roots is orders of magnitude more common). Note, because + * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K + * in the iterator itself is unnecessary. */ + if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) { + __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K); + __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M); + } __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G); __tdp_mmu_zap_root(kvm, root, shared, root->role.level); @@ -800,7 +844,13 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, continue; tdp_mmu_iter_set_spte(kvm, &iter, 0); - flush = true; + + /* + * Zappings SPTEs in invalid roots doesn't require a TLB flush, + * see kvm_tdp_mmu_zap_invalidated_roots() for details. + */ + if (!root->role.invalid) + flush = true; } rcu_read_unlock(); @@ -813,16 +863,16 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, } /* - * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. 
Returns - * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or - * more SPTEs were zapped since the MMU lock was last acquired. + * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID** roots. + * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if + * one or more SPTEs were zapped since the MMU lock was last acquired. */ bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush) { struct kvm_mmu_page *root; lockdep_assert_held_write(&kvm->mmu_lock); - for_each_tdp_mmu_root_yield_safe(kvm, root) + for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1) flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush); return flush; @@ -896,7 +946,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) * the VM is being destroyed). * * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference. - * See kvm_tdp_mmu_get_vcpu_root_hpa(). + * See kvm_tdp_mmu_alloc_root(). */ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) { @@ -1622,7 +1672,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, { struct kvm_mmu_page *root; - for_each_tdp_mmu_root(kvm, root, slot->as_id) + for_each_valid_tdp_mmu_root(kvm, root, slot->as_id) clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); } @@ -1740,7 +1790,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, bool spte_set = false; lockdep_assert_held_write(&kvm->mmu_lock); - for_each_tdp_mmu_root(kvm, root, slot->as_id) + for_each_valid_tdp_mmu_root(kvm, root, slot->as_id) spte_set |= write_protect_gfn(kvm, root, gfn, min_level); return spte_set; diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 20d97aa46c49..6e1ea04ca885 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -10,7 +10,7 @@ void kvm_mmu_init_tdp_mmu(struct kvm *kvm); void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm); -hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); +int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu); __must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root) { diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 87cc6c8809ad..c397b28e3d1b 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -29,6 +29,9 @@ struct x86_pmu_capability __read_mostly kvm_pmu_cap; EXPORT_SYMBOL_GPL(kvm_pmu_cap); +struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel; +EXPORT_SYMBOL_GPL(kvm_pmu_eventsel); + /* Precise Distribution of Instructions Retired (PDIR) */ static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL), @@ -67,7 +70,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = { * all perf counters (both gp and fixed). The mapping relationship * between pmc and perf counters is as the following: * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters - * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed + * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H * and later, [0 .. 
AMD64_NUM_COUNTERS_CORE-1] <=> gp counters */ @@ -411,7 +414,7 @@ static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f, static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter, int idx) { - int fixed_idx = idx - INTEL_PMC_IDX_FIXED; + int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX; if (filter->action == KVM_PMU_EVENT_DENY && test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap)) @@ -441,11 +444,10 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc) static bool pmc_event_is_allowed(struct kvm_pmc *pmc) { return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) && - static_call(kvm_x86_pmu_hw_event_available)(pmc) && check_pmu_event_filter(pmc); } -static void reprogram_counter(struct kvm_pmc *pmc) +static int reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); u64 eventsel = pmc->eventsel; @@ -456,7 +458,7 @@ static void reprogram_counter(struct kvm_pmc *pmc) emulate_overflow = pmc_pause_counter(pmc); if (!pmc_event_is_allowed(pmc)) - goto reprogram_complete; + return 0; if (emulate_overflow) __kvm_perf_overflow(pmc, false); @@ -466,7 +468,7 @@ static void reprogram_counter(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) { fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED); + pmc->idx - KVM_FIXED_PMC_BASE_IDX); if (fixed_ctr_ctrl & 0x1) eventsel |= ARCH_PERFMON_EVENTSEL_OS; if (fixed_ctr_ctrl & 0x2) @@ -477,43 +479,45 @@ static void reprogram_counter(struct kvm_pmc *pmc) } if (pmc->current_config == new_config && pmc_resume_counter(pmc)) - goto reprogram_complete; + return 0; pmc_release_perf_event(pmc); pmc->current_config = new_config; - /* - * If reprogramming fails, e.g. due to contention, leave the counter's - * regprogram bit set, i.e. opportunistically try again on the next PMU - * refresh. Don't make a new request as doing so can stall the guest - * if reprogramming repeatedly fails. - */ - if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW, - (eventsel & pmu->raw_event_mask), - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT)) - return; - -reprogram_complete: - clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); + return pmc_reprogram_counter(pmc, PERF_TYPE_RAW, + (eventsel & pmu->raw_event_mask), + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT); } void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { + DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; int bit; - for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit); + bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX); - if (unlikely(!pmc)) { - clear_bit(bit, pmu->reprogram_pmi); - continue; - } + /* + * The reprogramming bitmap can be written asynchronously by something + * other than the task that holds vcpu->mutex, take care to clear only + * the bits that will actually processed. + */ + BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t)); + atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi); - reprogram_counter(pmc); + kvm_for_each_pmc(pmu, pmc, bit, bitmap) { + /* + * If reprogramming fails, e.g. due to contention, re-set the + * regprogram bit set, i.e. opportunistically try again on the + * next PMU refresh. Don't make a new request as doing so can + * stall the guest if reprogramming repeatedly fails. 
+ */ + if (reprogram_counter(pmc)) + set_bit(pmc->idx, pmu->reprogram_pmi); } /* @@ -525,10 +529,20 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) kvm_pmu_cleanup(vcpu); } -/* check if idx is a valid index to access PMU */ -bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) { - return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx); + /* + * On Intel, VMX interception has priority over RDPMC exceptions that + * aren't already handled by the emulator, i.e. there are no additional + * check needed for Intel PMUs. + * + * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts, + * i.e. an invalid PMC results in a #GP, not #VMEXIT. + */ + if (!kvm_pmu_ops.check_rdpmc_early) + return 0; + + return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -567,10 +581,9 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) { - bool fast_mode = idx & (1u << 31); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; - u64 mask = fast_mode ? ~0u : ~0ull; + u64 mask = ~0ull; if (!pmu->version) return 1; @@ -716,11 +729,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu) bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX); - for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { - pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); - if (!pmc) - continue; - + kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) { pmc_stop_counter(pmc); pmc->counter = 0; pmc->emulated_counter = 0; @@ -741,6 +750,8 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm)) return; @@ -750,8 +761,22 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) */ kvm_pmu_reset(vcpu); - bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX); - static_call(kvm_x86_pmu_refresh)(vcpu); + pmu->version = 0; + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->counter_bitmask[KVM_PMC_GP] = 0; + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->reserved_bits = 0xffffffff00200000ull; + pmu->raw_event_mask = X86_RAW_EVENT_MASK; + pmu->global_ctrl_mask = ~0ull; + pmu->global_status_mask = ~0ull; + pmu->fixed_ctr_ctrl_mask = ~0ull; + pmu->pebs_enable_mask = ~0ull; + pmu->pebs_data_cfg_mask = ~0ull; + bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); + + if (vcpu->kvm->arch.enable_pmu) + static_call(kvm_x86_pmu_refresh)(vcpu); } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -776,10 +801,8 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) bitmap_andnot(bitmask, pmu->all_valid_pmc_idx, pmu->pmc_in_use, X86_PMC_IDX_MAX); - for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) { - pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); - - if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc)) + kvm_for_each_pmc(pmu, pmc, i, bitmask) { + if (pmc->perf_event && !pmc_speculative_in_use(pmc)) pmc_stop_counter(pmc); } @@ -799,13 +822,6 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) kvm_pmu_request_counter_reprogram(pmc); } -static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc, - unsigned int perf_hw_id) -{ - return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) & - AMD64_RAW_EVENT_MASK_NB); -} - static inline bool cpl_is_matched(struct kvm_pmc *pmc) { bool select_os, select_user; @@ -817,29 
+833,56 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc) select_user = config & ARCH_PERFMON_EVENTSEL_USR; } else { config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED); + pmc->idx - KVM_FIXED_PMC_BASE_IDX); select_os = config & 0x1; select_user = config & 0x2; } + /* + * Skip the CPL lookup, which isn't free on Intel, if the result will + * be the same regardless of the CPL. + */ + if (select_os == select_user) + return select_os; + return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user; } -void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id) +void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel) { + DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX); struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; int i; - for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) { - pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i); + BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX); - if (!pmc || !pmc_event_is_allowed(pmc)) + if (!kvm_pmu_has_perf_global_ctrl(pmu)) + bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX); + else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx, + (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX)) + return; + + kvm_for_each_pmc(pmu, pmc, i, bitmap) { + /* + * Ignore checks for edge detect (all events currently emulated + * but KVM are always rising edges), pin control (unsupported + * by modern CPUs), and counter mask and its invert flag (KVM + * doesn't emulate multiple events in a single clock cycle). + * + * Note, the uppermost nibble of AMD's mask overlaps Intel's + * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved + * bits (bits 35:34). Checking the "in HLE/RTM transaction" + * flags is correct as the vCPU can't be in a transaction if + * KVM is emulating an instruction. Checking the reserved bits + * might be wrong if they are defined in the future, but so + * could ignoring them, so do the simple thing for now. 
+ */ + if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) || + !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc)) continue; - /* Ignore checks for edge detect, pin control, invert and CMASK bits */ - if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc)) - kvm_pmu_incr_counter(pmc); + kvm_pmu_incr_counter(pmc); } } EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 7caeb3d8d4fd..4d52b0b539ba 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -4,6 +4,8 @@ #include <linux/nospec.h> +#include <asm/kvm_host.h> + #define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) #define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) #define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) @@ -18,13 +20,18 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 +#define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED + +struct kvm_pmu_emulated_event_selectors { + u64 INSTRUCTIONS_RETIRED; + u64 BRANCH_INSTRUCTIONS_RETIRED; +}; + struct kvm_pmu_ops { - bool (*hw_event_available)(struct kvm_pmc *pmc); - struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); - bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); + int (*check_rdpmc_early)(struct kvm_vcpu *vcpu, unsigned int idx); bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); @@ -55,6 +62,38 @@ static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) return pmu->version > 1; } +/* + * KVM tracks all counters in 64-bit bitmaps, with general purpose counters + * mapped to bits 31:0 and fixed counters mapped to 63:32, e.g. fixed counter 0 + * is tracked internally via index 32. On Intel, (AMD doesn't support fixed + * counters), this mirrors how fixed counters are mapped to PERF_GLOBAL_CTRL + * and similar MSRs, i.e. tracking fixed counters at base index 32 reduces the + * amounter of boilerplate needed to iterate over PMCs *and* simplifies common + * enabling/disable/reset operations. + * + * WARNING! This helper is only for lookups that are initiated by KVM, it is + * NOT safe for guest lookups, e.g. will do the wrong thing if passed a raw + * ECX value from RDPMC (fixed counters are accessed by setting bit 30 in ECX + * for RDPMC, not by adding 32 to the fixed counter index). 
+ */ +static inline struct kvm_pmc *kvm_pmc_idx_to_pmc(struct kvm_pmu *pmu, int idx) +{ + if (idx < pmu->nr_arch_gp_counters) + return &pmu->gp_counters[idx]; + + idx -= KVM_FIXED_PMC_BASE_IDX; + if (idx >= 0 && idx < pmu->nr_arch_fixed_counters) + return &pmu->fixed_counters[idx]; + + return NULL; +} + +#define kvm_for_each_pmc(pmu, pmc, i, bitmap) \ + for_each_set_bit(i, bitmap, X86_PMC_IDX_MAX) \ + if (!(pmc = kvm_pmc_idx_to_pmc(pmu, i))) \ + continue; \ + else \ + static inline u64 pmc_bitmask(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -131,12 +170,13 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) if (pmc_is_fixed(pmc)) return fixed_ctrl_field(pmu->fixed_ctr_ctrl, - pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3; return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; } extern struct x86_pmu_capability kvm_pmu_cap; +extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel; static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) { @@ -178,6 +218,11 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) pmu_ops->MAX_NR_GP_COUNTERS); kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed, KVM_PMC_MAX_FIXED); + + kvm_pmu_eventsel.INSTRUCTIONS_RETIRED = + perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); + kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = + perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) @@ -216,7 +261,7 @@ static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); -bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); +int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx); bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); @@ -225,7 +270,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu); void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); -void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id); +void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel); bool is_vmware_backdoor_pmc(u32 pmc_idx); diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index dc3d95fdca7d..d06d43d8d2aa 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -184,7 +184,6 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, struct kvm_smram_state_32 *smram) { struct desc_ptr dt; - unsigned long val; int i; smram->cr0 = kvm_read_cr0(vcpu); @@ -195,10 +194,8 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, for (i = 0; i < 8; i++) smram->gprs[i] = kvm_register_read_raw(vcpu, i); - kvm_get_dr(vcpu, 6, &val); - smram->dr6 = (u32)val; - kvm_get_dr(vcpu, 7, &val); - smram->dr7 = (u32)val; + smram->dr6 = (u32)vcpu->arch.dr6; + smram->dr7 = (u32)vcpu->arch.dr7; enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR); enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR); @@ -231,7 +228,6 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, struct kvm_smram_state_64 *smram) { struct desc_ptr dt; - unsigned long val; int i; for (i = 0; i 
< 16; i++) @@ -240,11 +236,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, smram->rip = kvm_rip_read(vcpu); smram->rflags = kvm_get_rflags(vcpu); - - kvm_get_dr(vcpu, 6, &val); - smram->dr6 = val; - kvm_get_dr(vcpu, 7, &val); - smram->dr7 = val; + smram->dr6 = vcpu->arch.dr6; + smram->dr7 = vcpu->arch.dr7; smram->cr0 = kvm_read_cr0(vcpu); smram->cr3 = kvm_read_cr3(vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index b6a7ad4d6914..dfcc38bd97d3 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -25,7 +25,7 @@ enum pmu_type { PMU_TYPE_EVNTSEL, }; -static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +static struct kvm_pmc *amd_pmu_get_pmc(struct kvm_pmu *pmu, int pmc_idx) { unsigned int num_counters = pmu->nr_arch_gp_counters; @@ -70,28 +70,24 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, return NULL; } - return amd_pmc_idx_to_pmc(pmu, idx); + return amd_pmu_get_pmc(pmu, idx); } -static bool amd_hw_event_available(struct kvm_pmc *pmc) -{ - return true; -} - -static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +static int amd_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - idx &= ~(3u << 30); + if (idx >= pmu->nr_arch_gp_counters) + return -EINVAL; - return idx < pmu->nr_arch_gp_counters; + return 0; } /* idx is the ECX register of RDPMC instruction */ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask) { - return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30)); + return amd_pmu_get_pmc(vcpu_to_pmu(vcpu), idx); } static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) @@ -233,11 +229,9 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) } struct kvm_pmu_ops amd_pmu_ops __initdata = { - .hw_event_available = amd_hw_event_available, - .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, - .is_valid_rdpmc_ecx = amd_is_valid_rdpmc_ecx, + .check_rdpmc_early = amd_check_rdpmc_early, .is_valid_msr = amd_is_valid_msr, .get_msr = amd_pmu_get_msr, .set_msr = amd_pmu_set_msr, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 272d5ed37ce7..d1a9f9951635 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2735,7 +2735,6 @@ static int dr_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); int reg, dr; - unsigned long val; int err = 0; /* @@ -2763,11 +2762,9 @@ static int dr_interception(struct kvm_vcpu *vcpu) dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; if (dr >= 16) { /* mov to DRn */ dr -= 16; - val = kvm_register_read(vcpu, reg); - err = kvm_set_dr(vcpu, dr, val); + err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); } else { - kvm_get_dr(vcpu, dr, &val); - kvm_register_write(vcpu, reg, val); + kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); } return kvm_complete_insn_gp(vcpu, err); @@ -4092,6 +4089,9 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && to_svm(vcpu)->vmcb->control.exit_info_1) return handle_fastpath_set_msr_irqoff(vcpu); @@ -4115,12 +4115,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in guest_state_exit_irqoff(); } -static __no_kcsan fastpath_t svm_vcpu_run(struct 
kvm_vcpu *vcpu) +static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, + bool force_immediate_exit) { struct vcpu_svm *svm = to_svm(vcpu); bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); - trace_kvm_entry(vcpu); + trace_kvm_entry(vcpu, force_immediate_exit); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; @@ -4139,9 +4140,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * is enough to force an immediate vmexit. */ disable_nmi_singlestep(svm); - smp_send_reschedule(vcpu->cpu); + force_immediate_exit = true; } + if (force_immediate_exit) + smp_send_reschedule(vcpu->cpu); + pre_svm_run(vcpu); sync_lapic_to_cr8(vcpu); @@ -4237,9 +4241,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_complete_interrupts(vcpu); - if (is_guest_mode(vcpu)) - return EXIT_FASTPATH_NONE; - return svm_exit_handlers_fastpath(vcpu); } @@ -5007,8 +5008,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .check_intercept = svm_check_intercept, .handle_exit_irqoff = svm_handle_exit_irqoff, - .request_immediate_exit = __kvm_request_immediate_exit, - .sched_in = svm_sched_in, .nested_ops = &svm_nested_ops, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 83843379813e..88659de4d2a7 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -15,20 +15,23 @@ * Tracepoint for guest mode entry. */ TRACE_EVENT(kvm_entry, - TP_PROTO(struct kvm_vcpu *vcpu), - TP_ARGS(vcpu), + TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit), + TP_ARGS(vcpu, force_immediate_exit), TP_STRUCT__entry( __field( unsigned int, vcpu_id ) __field( unsigned long, rip ) + __field( bool, immediate_exit ) ), TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; __entry->rip = kvm_rip_read(vcpu); + __entry->immediate_exit = force_immediate_exit; ), - TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip) + TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip, + __entry->immediate_exit ? "[immediate exit]" : "") ); /* diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 6329a306856b..d05ddf751491 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3606,7 +3606,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } - kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) return nested_vmx_failInvalid(vcpu); @@ -4433,7 +4433,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + vmcs12->guest_dr7 = vcpu->arch.dr7; if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 315c7c2ba89b..12ade343a17e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -20,53 +20,19 @@ #include "nested.h" #include "pmu.h" -#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) - -enum intel_pmu_architectural_events { - /* - * The order of the architectural events matters as support for each - * event is enumerated via CPUID using the index of the event. 
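For reference, the decode that replaces this table-driven scheme works as in the following sketch (the masks are the INTEL_RDPMC_* defines introduced below; the ECX value is an arbitrary example, not taken from the patch):

	u32 ecx = 0x40000001;                      /* RDPMC of fixed counter 1 */
	u32 type = ecx & INTEL_RDPMC_TYPE_MASK;    /* 0x40000000 == INTEL_RDPMC_FIXED */
	u32 index = ecx & INTEL_RDPMC_INDEX_MASK;  /* 1 */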
- */ - INTEL_ARCH_CPU_CYCLES, - INTEL_ARCH_INSTRUCTIONS_RETIRED, - INTEL_ARCH_REFERENCE_CYCLES, - INTEL_ARCH_LLC_REFERENCES, - INTEL_ARCH_LLC_MISSES, - INTEL_ARCH_BRANCHES_RETIRED, - INTEL_ARCH_BRANCHES_MISPREDICTED, - - NR_REAL_INTEL_ARCH_EVENTS, - - /* - * Pseudo-architectural event used to implement IA32_FIXED_CTR2, a.k.a. - * TSC reference cycles. The architectural reference cycles event may - * or may not actually use the TSC as the reference, e.g. might use the - * core crystal clock or the bus clock (yeah, "architectural"). - */ - PSEUDO_ARCH_REFERENCE_CYCLES = NR_REAL_INTEL_ARCH_EVENTS, - NR_INTEL_ARCH_EVENTS, -}; +/* + * Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX + * to encode the "type" of counter to read, i.e. this is not a "base". And to + * further confuse things, non-architectural PMUs use bit 31 as a flag for + * "fast" reads, whereas the "type" is an explicit value. + */ +#define INTEL_RDPMC_GP 0 +#define INTEL_RDPMC_FIXED INTEL_PMC_FIXED_RDPMC_BASE -static struct { - u8 eventsel; - u8 unit_mask; -} const intel_arch_events[] = { - [INTEL_ARCH_CPU_CYCLES] = { 0x3c, 0x00 }, - [INTEL_ARCH_INSTRUCTIONS_RETIRED] = { 0xc0, 0x00 }, - [INTEL_ARCH_REFERENCE_CYCLES] = { 0x3c, 0x01 }, - [INTEL_ARCH_LLC_REFERENCES] = { 0x2e, 0x4f }, - [INTEL_ARCH_LLC_MISSES] = { 0x2e, 0x41 }, - [INTEL_ARCH_BRANCHES_RETIRED] = { 0xc4, 0x00 }, - [INTEL_ARCH_BRANCHES_MISPREDICTED] = { 0xc5, 0x00 }, - [PSEUDO_ARCH_REFERENCE_CYCLES] = { 0x00, 0x03 }, -}; +#define INTEL_RDPMC_TYPE_MASK GENMASK(31, 16) +#define INTEL_RDPMC_INDEX_MASK GENMASK(15, 0) -/* mapping between fixed pmc index and intel_arch_events array */ -static int fixed_pmc_events[] = { - [0] = INTEL_ARCH_INSTRUCTIONS_RETIRED, - [1] = INTEL_ARCH_CPU_CYCLES, - [2] = PSEUDO_ARCH_REFERENCE_CYCLES, -}; +#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) { @@ -84,77 +50,61 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); - __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); + __set_bit(KVM_FIXED_PMC_BASE_IDX + i, pmu->pmc_in_use); kvm_pmu_request_counter_reprogram(pmc); } } -static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) -{ - if (pmc_idx < INTEL_PMC_IDX_FIXED) { - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, - MSR_P6_EVNTSEL0); - } else { - u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; - - return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); - } -} - -static bool intel_hw_event_available(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT; - u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - int i; - - BUILD_BUG_ON(ARRAY_SIZE(intel_arch_events) != NR_INTEL_ARCH_EVENTS); - - /* - * Disallow events reported as unavailable in guest CPUID. Note, this - * doesn't apply to pseudo-architectural events. - */ - for (i = 0; i < NR_REAL_INTEL_ARCH_EVENTS; i++) { - if (intel_arch_events[i].eventsel != event_select || - intel_arch_events[i].unit_mask != unit_mask) - continue; - - return pmu->available_event_types & BIT(i); - } - - return true; -} - -static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) -{ - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fixed = idx & (1u << 30); - - idx &= ~(3u << 30); - - return fixed ? 
idx < pmu->nr_arch_fixed_counters - : idx < pmu->nr_arch_gp_counters; -} - static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask) { + unsigned int type = idx & INTEL_RDPMC_TYPE_MASK; struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - bool fixed = idx & (1u << 30); struct kvm_pmc *counters; unsigned int num_counters; + u64 bitmask; - idx &= ~(3u << 30); - if (fixed) { + /* + * The encoding of ECX for RDPMC is different for architectural versus + * non-architectural PMUs (PMUs with version '0'). For architectural + * PMUs, bits 31:16 specify the PMC type and bits 15:0 specify the PMC + * index. For non-architectural PMUs, bit 31 is a "fast" flag, and + * bits 30:0 specify the PMC index. + * + * Yell and reject attempts to read PMCs for a non-architectural PMU, + * as KVM doesn't support such PMUs. + */ + if (WARN_ON_ONCE(!pmu->version)) + return NULL; + + /* + * General Purpose (GP) PMCs are supported on all PMUs, and fixed PMCs + * are supported on all architectural PMUs, i.e. on all virtual PMUs + * supported by KVM. Note, KVM only emulates fixed PMCs for PMU v2+, + * but the type itself is still valid, i.e. let RDPMC fail due to + * accessing a non-existent counter. Reject attempts to read all other + * types, which are unknown/unsupported. + */ + switch (type) { + case INTEL_RDPMC_FIXED: counters = pmu->fixed_counters; num_counters = pmu->nr_arch_fixed_counters; + bitmask = pmu->counter_bitmask[KVM_PMC_FIXED]; + break; + case INTEL_RDPMC_GP: counters = pmu->gp_counters; num_counters = pmu->nr_arch_gp_counters; + bitmask = pmu->counter_bitmask[KVM_PMC_GP]; + break; + default: + return NULL; } + + idx &= INTEL_RDPMC_INDEX_MASK; if (idx >= num_counters) return NULL; - *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP]; + + *mask &= bitmask; return &counters[array_index_nospec(idx, num_counters)]; } @@ -464,20 +414,38 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } -static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu) +/* + * Map fixed counter events to architectural general purpose event encodings. + * Perf doesn't provide APIs to allow KVM to directly program a fixed counter, + * and so KVM instead programs the architectural event to effectively request + * the fixed counter. Perf isn't guaranteed to use a fixed counter and may + * instead program the encoding into a general purpose counter, e.g. if a + * different perf_event is already utilizing the requested counter, but the end + * result is the same (ignoring the fact that using a general purpose counter + * will likely exacerbate counter contention). + * + * Forcibly inlined to allow asserting on @index at build time, and there should + * never be more than one user.
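The lookup relies on perf_get_hw_event_config(), the same helper kvm_init_pmu_capability() now uses to fill kvm_pmu_eventsel. A minimal sketch of what it returns (the Intel values shown match the removed intel_arch_events[] entry; exact encodings are CPU-vendor specific):

	u64 config = perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS);
	u8 event = config & ARCH_PERFMON_EVENTSEL_EVENT;          /* 0xc0 on Intel */
	u8 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;   /* 0x00 */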
+ */ +static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index) { - int i; - - BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_events) != KVM_PMC_MAX_FIXED); + const enum perf_hw_id fixed_pmc_perf_ids[] = { + [0] = PERF_COUNT_HW_INSTRUCTIONS, + [1] = PERF_COUNT_HW_CPU_CYCLES, + [2] = PERF_COUNT_HW_REF_CPU_CYCLES, + }; + u64 eventsel; - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - int index = array_index_nospec(i, KVM_PMC_MAX_FIXED); - struct kvm_pmc *pmc = &pmu->fixed_counters[index]; - u32 event = fixed_pmc_events[index]; + BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_PMC_MAX_FIXED); + BUILD_BUG_ON(index >= KVM_PMC_MAX_FIXED); - pmc->eventsel = (intel_arch_events[event].unit_mask << 8) | - intel_arch_events[event].eventsel; - } + /* + * Yell if perf reports support for a fixed counter but perf doesn't + * have a known encoding for the associated general purpose event. + */ + eventsel = perf_get_hw_event_config(fixed_pmc_perf_ids[index]); + WARN_ON_ONCE(!eventsel && index < kvm_pmu_cap.num_counters_fixed); + return eventsel; } static void intel_pmu_refresh(struct kvm_vcpu *vcpu) @@ -491,19 +459,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) u64 counter_mask; int i; - pmu->nr_arch_gp_counters = 0; - pmu->nr_arch_fixed_counters = 0; - pmu->counter_bitmask[KVM_PMC_GP] = 0; - pmu->counter_bitmask[KVM_PMC_FIXED] = 0; - pmu->version = 0; - pmu->reserved_bits = 0xffffffff00200000ull; - pmu->raw_event_mask = X86_RAW_EVENT_MASK; - pmu->global_ctrl_mask = ~0ull; - pmu->global_status_mask = ~0ull; - pmu->fixed_ctr_ctrl_mask = ~0ull; - pmu->pebs_enable_mask = ~0ull; - pmu->pebs_data_cfg_mask = ~0ull; - memset(&lbr_desc->records, 0, sizeof(lbr_desc->records)); /* @@ -515,8 +470,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) return; entry = kvm_find_cpuid_entry(vcpu, 0xa); - if (!entry || !vcpu->kvm->arch.enable_pmu) + if (!entry) return; + eax.full = entry->eax; edx.full = entry->edx; @@ -543,13 +499,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) kvm_pmu_cap.bit_width_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; - setup_fixed_pmc_eventsel(pmu); } for (i = 0; i < pmu->nr_arch_fixed_counters; i++) pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); + (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX)); pmu->global_ctrl_mask = counter_mask; /* @@ -593,7 +548,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { pmu->fixed_ctr_ctrl_mask &= - ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4)); + ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4)); } pmu->pebs_data_cfg_mask = ~0xff00000full; } else { @@ -619,8 +574,9 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) for (i = 0; i < KVM_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX; pmu->fixed_counters[i].current_config = 0; + pmu->fixed_counters[i].eventsel = intel_get_fixed_pmc_eventsel(i); } lbr_desc->records.nr = 0; @@ -748,11 +704,8 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) struct kvm_pmc *pmc = NULL; int bit, hw_idx; - for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl, - X86_PMC_IDX_MAX) { - pmc = intel_pmc_idx_to_pmc(pmu, bit); - - 
if (!pmc || !pmc_speculative_in_use(pmc) || + kvm_for_each_pmc(pmu, pmc, bit, (unsigned long *)&pmu->global_ctrl) { + if (!pmc_speculative_in_use(pmc) || !pmc_is_globally_enabled(pmc) || !pmc->perf_event) continue; @@ -767,11 +720,8 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) } struct kvm_pmu_ops intel_pmu_ops __initdata = { - .hw_event_available = intel_hw_event_available, - .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, - .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx, .is_valid_msr = intel_is_valid_msr, .get_msr = intel_pmu_get_msr, .set_msr = intel_pmu_set_msr, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 305237dcba88..c37a89eda90f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -50,6 +50,8 @@ #include <asm/spec-ctrl.h> #include <asm/vmx.h> +#include <trace/events/ipi.h> + #include "capabilities.h" #include "cpuid.h" #include "hyperv.h" @@ -160,7 +162,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); /* * List of MSRs that can be directly passed to the guest. - * In addition to these x2apic and PT MSRs are handled specially. + * In addition to these x2apic, PT and LBR MSRs are handled specially. */ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { MSR_IA32_SPEC_CTRL, @@ -668,25 +670,14 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) return flexpriority_enabled && lapic_in_kernel(vcpu); } -static int possible_passthrough_msr_slot(u32 msr) -{ - u32 i; - - for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) - if (vmx_possible_passthrough_msrs[i] == msr) - return i; - - return -ENOENT; -} - -static bool is_valid_passthrough_msr(u32 msr) +static int vmx_get_passthrough_msr_slot(u32 msr) { - bool r; + int i; switch (msr) { case 0x800 ... 0x8ff: /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ - return true; + return -ENOENT; case MSR_IA32_RTIT_STATUS: case MSR_IA32_RTIT_OUTPUT_BASE: case MSR_IA32_RTIT_OUTPUT_MASK: @@ -701,14 +692,16 @@ static bool is_valid_passthrough_msr(u32 msr) case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: /* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */ - return true; + return -ENOENT; } - r = possible_passthrough_msr_slot(msr) != -ENOENT; - - WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); + for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { + if (vmx_possible_passthrough_msrs[i] == msr) + return i; + } - return r; + WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); + return -ENOENT; } struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) @@ -1291,8 +1284,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) u16 fs_sel, gs_sel; int i; - vmx->req_immediate_exit = false; - /* * Note that guest MSRs to be saved/restored can also be changed * when guest state is loaded. This happens when guest transitions @@ -3964,6 +3955,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + int idx; if (!cpu_has_vmx_msr_bitmap()) return; @@ -3973,16 +3965,13 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) /* * Mark the desired intercept state in shadow bitmap, this is needed * for resync when the MSR filters change. 
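The consolidated helper also changes the return convention relative to the old is_valid_passthrough_msr()/possible_passthrough_msr_slot() pair, roughly as sketched here (hypothetical caller; MSR_IA32_SPEC_CTRL is one of the listed passthrough MSRs):

	/*
	 * -ENOENT now covers both the special-cased ranges (x2APIC, PT, LBR,
	 * which the old code reported as "valid" and then skipped) and truly
	 * unknown MSRs (which still WARN); >= 0 is the shadow-bitmap slot.
	 */
	int slot = vmx_get_passthrough_msr_slot(MSR_IA32_SPEC_CTRL);

	if (slot >= 0)
		set_bit(slot, vmx->shadow_msr_intercept.write);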
- */ - if (is_valid_passthrough_msr(msr)) { - int idx = possible_passthrough_msr_slot(msr); - - if (idx != -ENOENT) { - if (type & MSR_TYPE_R) - clear_bit(idx, vmx->shadow_msr_intercept.read); - if (type & MSR_TYPE_W) - clear_bit(idx, vmx->shadow_msr_intercept.write); - } + */ + idx = vmx_get_passthrough_msr_slot(msr); + if (idx >= 0) { + if (type & MSR_TYPE_R) + clear_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + clear_bit(idx, vmx->shadow_msr_intercept.write); } if ((type & MSR_TYPE_R) && @@ -4008,6 +3997,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + int idx; if (!cpu_has_vmx_msr_bitmap()) return; @@ -4017,16 +4007,13 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) /* * Mark the desired intercept state in shadow bitmap, this is needed * for resync when the MSR filter changes. - */ - if (is_valid_passthrough_msr(msr)) { - int idx = possible_passthrough_msr_slot(msr); - - if (idx != -ENOENT) { - if (type & MSR_TYPE_R) - set_bit(idx, vmx->shadow_msr_intercept.read); - if (type & MSR_TYPE_W) - set_bit(idx, vmx->shadow_msr_intercept.write); - } + */ + idx = vmx_get_passthrough_msr_slot(msr); + if (idx >= 0) { + if (type & MSR_TYPE_R) + set_bit(idx, vmx->shadow_msr_intercept.read); + if (type & MSR_TYPE_W) + set_bit(idx, vmx->shadow_msr_intercept.write); } if (type & MSR_TYPE_R) @@ -4137,6 +4124,9 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 i; + if (!cpu_has_vmx_msr_bitmap()) + return; + /* * Redo intercept permissions for MSRs that KVM is passing through to * the guest. Disabling interception will check the new MSR filter and @@ -5576,10 +5566,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { - unsigned long val; - - kvm_get_dr(vcpu, dr, &val); - kvm_register_write(vcpu, reg, val); + kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); err = 0; } else { err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); @@ -6001,22 +5988,46 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } -static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) +static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu, + bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!vmx->req_immediate_exit && - !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { - kvm_lapic_expired_hv_timer(vcpu); + /* + * In the *extremely* unlikely scenario that this is a spurious VM-Exit + * due to the timer expiring while it was "soft" disabled, just eat the + * exit and re-enter the guest. + */ + if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) return EXIT_FASTPATH_REENTER_GUEST; - } - return EXIT_FASTPATH_NONE; + /* + * If the timer expired because KVM used it to force an immediate exit, + * then mission accomplished. + */ + if (force_immediate_exit) + return EXIT_FASTPATH_EXIT_HANDLED; + + /* + * If L2 is active, go down the slow path as emulating the guest timer + * expiration likely requires synthesizing a nested VM-Exit. 
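Taken together, the rewritten handler resolves preemption-timer exits in this order (a summary sketch, not code from the patch):

	/*
	 *   1. timer "soft" disabled    -> spurious exit, re-enter guest
	 *   2. force_immediate_exit     -> EXIT_FASTPATH_EXIT_HANDLED
	 *   3. L2 active                -> EXIT_FASTPATH_NONE, slow path
	 *   4. otherwise                -> expire the timer, re-enter guest
	 */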
+ */ + if (is_guest_mode(vcpu)) + return EXIT_FASTPATH_NONE; + + kvm_lapic_expired_hv_timer(vcpu); + return EXIT_FASTPATH_REENTER_GUEST; } static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - handle_fastpath_preemption_timer(vcpu); + /* + * This non-fastpath handler is reached if and only if the preemption + * timer was being used to emulate a guest timer while L2 is active. + * All other scenarios are supposed to be handled in the fastpath. + */ + WARN_ON_ONCE(!is_guest_mode(vcpu)); + kvm_lapic_expired_hv_timer(vcpu); return 1; } @@ -6519,7 +6530,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason.full; - vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; + vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu); if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) { vcpu->run->internal.data[ndata++] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); @@ -7158,13 +7169,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } -static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; u32 delta_tsc; - if (vmx->req_immediate_exit) { + if (force_immediate_exit) { vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); vmx->loaded_vmcs->hv_timer_soft_disabled = false; } else if (vmx->hv_deadline_tsc != -1) { @@ -7217,13 +7228,22 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, barrier_nospec(); } -static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu, + bool force_immediate_exit) { + /* + * If L2 is active, only VMX preemption timer exits can be handled in + * the fastpath; all other exits must use the slow path.
+ */ + if (is_guest_mode(vcpu) && + to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER) + return EXIT_FASTPATH_NONE; + switch (to_vmx(vcpu)->exit_reason.basic) { case EXIT_REASON_MSR_WRITE: return handle_fastpath_set_msr_irqoff(vcpu); case EXIT_REASON_PREEMPTION_TIMER: - return handle_fastpath_preemption_timer(vcpu); + return handle_fastpath_preemption_timer(vcpu, force_immediate_exit); default: return EXIT_FASTPATH_NONE; } @@ -7286,7 +7306,7 @@ out: guest_state_exit_irqoff(); } -static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) +static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; @@ -7313,7 +7333,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } - trace_kvm_entry(vcpu); + trace_kvm_entry(vcpu, force_immediate_exit); if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; @@ -7372,7 +7392,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_passthrough_lbr_msrs(vcpu); if (enable_preemption_timer) - vmx_update_hv_timer(vcpu); + vmx_update_hv_timer(vcpu, force_immediate_exit); + else if (force_immediate_exit) + smp_send_reschedule(vcpu->cpu); kvm_wait_lapic_expire(vcpu); @@ -7436,10 +7458,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); - if (is_guest_mode(vcpu)) - return EXIT_FASTPATH_NONE; - - return vmx_exit_handlers_fastpath(vcpu); + return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit); } static void vmx_vcpu_free(struct kvm_vcpu *vcpu) @@ -7919,11 +7938,6 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); } -static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) -{ - to_vmx(vcpu)->req_immediate_exit = true; -} - static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, struct x86_instruction_info *info) { @@ -8376,8 +8390,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .check_intercept = vmx_check_intercept, .handle_exit_irqoff = vmx_handle_exit_irqoff, - .request_immediate_exit = vmx_request_immediate_exit, - .sched_in = vmx_sched_in, .cpu_dirty_log_size = PML_ENTITY_NUM, @@ -8637,7 +8649,6 @@ static __init int hardware_setup(void) if (!enable_preemption_timer) { vmx_x86_ops.set_hv_timer = NULL; vmx_x86_ops.cancel_hv_timer = NULL; - vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } kvm_caps.supported_mce_cap |= MCG_LMCE_P; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index e3b0985bb74a..65786dbe7d60 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -332,8 +332,6 @@ struct vcpu_vmx { unsigned int ple_window; bool ple_window_dirty; - bool req_immediate_exit; - /* Support for PML */ #define PML_ENTITY_NUM 512 struct page *pml_pg; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ffe580169c93..47d9f03b7778 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1399,22 +1399,19 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) } EXPORT_SYMBOL_GPL(kvm_set_dr); -void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr) { size_t size = ARRAY_SIZE(vcpu->arch.db); switch (dr) { case 0 ... 
3: - *val = vcpu->arch.db[array_index_nospec(dr, size)]; - break; + return vcpu->arch.db[array_index_nospec(dr, size)]; case 4: case 6: - *val = vcpu->arch.dr6; - break; + return vcpu->arch.dr6; case 5: default: /* 7 */ - *val = vcpu->arch.dr7; - break; + return vcpu->arch.dr7; } } EXPORT_SYMBOL_GPL(kvm_get_dr); @@ -2860,7 +2857,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, return v * clock->mult; } -static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) +/* + * As with get_kvmclock_base_ns(), this counts from boot time, at the + * frequency of CLOCK_MONOTONIC_RAW (hence adding gtod->offs_boot). + */ +static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -2879,6 +2880,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) return mode; } +/* + * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with + * no boot time offset. + */ +static int do_monotonic(s64 *t, u64 *tsc_timestamp) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(&gtod->seq); + ns = gtod->clock.base_cycles; + ns += vgettsc(&gtod->clock, tsc_timestamp, &mode); + ns >>= gtod->clock.shift; + ns += ktime_to_ns(gtod->clock.offset); + } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); + *t = ns; + + return mode; +} + static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; @@ -2900,18 +2924,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) return mode; } -/* returns true if host is using TSC based clocksource */ +/* + * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and + * reports the TSC value from which it did so. Returns true if host is + * using TSC based clocksource. + */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) { /* checked again under seqlock below */ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns, - tsc_timestamp)); + return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns, + tsc_timestamp)); } -/* returns true if host is using TSC based clocksource */ +/* + * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did + * so. Returns true if host is using TSC based clocksource. + */ +bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) +{ + /* checked again under seqlock below */ + if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) + return false; + + return gtod_is_based_on_tsc(do_monotonic(kernel_ns, + tsc_timestamp)); +} + +/* + * Calculates CLOCK_REALTIME and reports the TSC value from which it did + * so. Returns true if host is using TSC based clocksource. + * + * DO NOT USE this for anything related to migration. You want CLOCK_TAI + * for that.
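A minimal usage sketch of the newly exported helper (variable names are illustrative):

	s64 kernel_now;
	u64 host_tsc;

	if (kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
		/*
		 * kernel_now is CLOCK_MONOTONIC at the instant host_tsc was
		 * read, so a deadline computed against the guest's kvmclock
		 * view of host_tsc can be converted into an absolute
		 * CLOCK_MONOTONIC expiry; the Xen timer code below does this.
		 */
	}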
+ */ static bool kvm_get_walltime_and_clockread(struct timespec64 *ts, u64 *tsc_timestamp) { @@ -3158,7 +3206,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, guest_hv_clock->version = ++vcpu->hv_clock.version; - mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); + kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); @@ -4680,7 +4728,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) KVM_XEN_HVM_CONFIG_SHARED_INFO | KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | KVM_XEN_HVM_CONFIG_EVTCHN_SEND | - KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; + KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE | + KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA; if (sched_info_on()) r |= KVM_XEN_HVM_CONFIG_RUNSTATE | KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG; @@ -5064,8 +5113,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) int idx; if (vcpu->preempted) { - if (!vcpu->arch.guest_state_protected) - vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); + vcpu->arch.preempted_in_kernel = kvm_arch_vcpu_in_kernel(vcpu); /* * Take the srcu lock as memslots will be accessed to check the gfn @@ -5512,18 +5560,23 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { - unsigned long val; + unsigned int i; memset(dbgregs, 0, sizeof(*dbgregs)); - memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); - kvm_get_dr(vcpu, 6, &val); - dbgregs->dr6 = val; + + BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db)); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++) + dbgregs->db[i] = vcpu->arch.db[i]; + + dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { + unsigned int i; + if (dbgregs->flags) return -EINVAL; @@ -5532,7 +5585,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, if (!kvm_dr7_valid(dbgregs->dr7)) return -EINVAL; - memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++) + vcpu->arch.db[i] = dbgregs->db[i]; + kvm_update_dr0123(vcpu); vcpu->arch.dr6 = dbgregs->dr6; vcpu->arch.dr7 = dbgregs->dr7; @@ -8180,10 +8235,9 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt)); } -static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest) +static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr) { - kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); + return kvm_get_dr(emul_to_vcpu(ctxt), dr); } static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, @@ -8405,12 +8459,9 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); } -static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, - u32 pmc) +static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc) { - if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc)) - return 0; - return -EINVAL; + return kvm_pmu_check_rdpmc_early(emul_to_vcpu(ctxt), pmc); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -8542,7 +8593,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_msr_with_filter = emulator_set_msr_with_filter, .get_msr_with_filter = emulator_get_msr_with_filter, .get_msr = emulator_get_msr, - .check_pmc = emulator_check_pmc, + 
.check_rdpmc_early = emulator_check_rdpmc_early, .read_pmc = emulator_read_pmc, .halt = emulator_halt, .wbinvd = emulator_wbinvd, @@ -8803,31 +8854,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_release_pfn_clean(pfn); - /* The instructions are well-emulated on direct mmu. */ - if (vcpu->arch.mmu->root_role.direct) { - unsigned int indirect_shadow_pages; - - write_lock(&vcpu->kvm->mmu_lock); - indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; - write_unlock(&vcpu->kvm->mmu_lock); - - if (indirect_shadow_pages) - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); - - return true; - } - /* - * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-enter the - * guest to let CPU execute the instruction. + * If emulation may have been triggered by a write to a shadowed page + * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the + * guest to let the CPU re-execute the instruction in the hope that the + * CPU can cleanly execute the instruction that KVM failed to emulate. */ - kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + if (vcpu->kvm->arch.indirect_shadow_pages) + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); /* - * If the access faults on its page table, it can not - * be fixed by unprotecting shadow page and it should - * be reported to userspace. + * If the failed instruction faulted on an access to page tables that + * are used to translate any part of the instruction, KVM can't resolve + * the issue by unprotecting the gfn, as zapping the shadow page will + * result in the instruction taking a !PRESENT page fault and thus put + * the vCPU into an infinite loop of page faults. E.g. KVM will create + * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and + * then zap the SPTE to unprotect the gfn, and then do it all over + * again. Report the error to userspace. */ return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); } @@ -8922,7 +8966,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) if (unlikely(!r)) return 0; - kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); + kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); /* * rflags is the old, "raw" value of the flags. 
The new value has @@ -9235,9 +9279,9 @@ writeback: */ if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { - kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); + kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED); if (ctxt->is_branch) - kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); kvm_rip_write(vcpu, ctxt->eip); if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); @@ -9648,11 +9692,13 @@ static void kvm_x86_check_cpu_compat(void *ret) *(int *)ret = kvm_x86_check_processor_compatibility(); } -static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) +int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) { u64 host_pat; int r, cpu; + guard(mutex)(&vendor_module_lock); + if (kvm_x86_ops.hardware_enable) { pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name); return -EEXIST; @@ -9782,17 +9828,6 @@ out_free_x86_emulator_cache: kmem_cache_destroy(x86_emulator_cache); return r; } - -int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) -{ - int r; - - mutex_lock(&vendor_module_lock); - r = __kvm_x86_vendor_init(ops); - mutex_unlock(&vendor_module_lock); - - return r; -} EXPORT_SYMBOL_GPL(kvm_x86_vendor_init); void kvm_x86_vendor_exit(void) @@ -10689,12 +10724,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); } -void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) -{ - smp_send_reschedule(vcpu->cpu); -} -EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); - /* * Called within kvm->srcu read side. * Returns 1 to let vcpu_run() continue the guest execution loop without @@ -10944,10 +10973,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto cancel_injection; } - if (req_immediate_exit) { + if (req_immediate_exit) kvm_make_request(KVM_REQ_EVENT, vcpu); - static_call(kvm_x86_request_immediate_exit)(vcpu); - } fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) @@ -10978,7 +11005,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); - exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); + exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit); if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; @@ -12065,7 +12092,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; - kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN); + kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm); if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -12076,27 +12103,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) if (r < 0) return r; - if (irqchip_in_kernel(vcpu->kvm)) { - r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); - if (r < 0) - goto fail_mmu_destroy; - - /* - * Defer evaluating inhibits until the vCPU is first run, as - * this vCPU will not get notified of any changes until this - * vCPU is visible to other vCPUs (marked online and added to - * the set of vCPUs). Opportunistically mark APICv active as - * VMX in particularly is highly unlikely to have inhibits. 
- * Ignore the current per-VM APICv state so that vCPU creation - * is guaranteed to run with a deterministic value, the request - * will ensure the vCPU gets the correct state before VM-Entry. - */ - if (enable_apicv) { - vcpu->arch.apic->apicv_active = true; - kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); - } - } else - static_branch_inc(&kvm_has_noapic_vcpu); + r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); + if (r < 0) + goto fail_mmu_destroy; r = -ENOMEM; @@ -12217,8 +12226,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); kvfree(vcpu->arch.cpuid_entries); - if (!lapic_in_kernel(vcpu)) - static_branch_dec(&kvm_has_noapic_vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -12495,9 +12502,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); -EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); - void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -13100,11 +13104,13 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) { - if (kvm_vcpu_apicv_active(vcpu) && - static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) - return true; + return kvm_vcpu_apicv_active(vcpu) && + static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu); +} - return false; +bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.preempted_in_kernel; } bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) @@ -13127,9 +13133,6 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return true; - if (vcpu != kvm_get_running_vcpu()) - return vcpu->arch.preempted_in_kernel; - return static_call(kvm_x86_get_cpl)(vcpu) == 0; } @@ -13924,9 +13927,6 @@ module_init(kvm_x86_init); static void __exit kvm_x86_exit(void) { - /* - * If module_init() is implemented, module_exit() must also be - * implemented to allow module unload. - */ + WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu)); } module_exit(kvm_x86_exit); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2f7e19166658..a8b71803777b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -294,6 +294,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); u64 get_kvmclock_ns(struct kvm *kvm); uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm); +bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, @@ -431,12 +432,6 @@ static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm) return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED; } -enum kvm_intr_type { - /* Values are arbitrary, but must be non-zero. 
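The non-zero requirement exists because a zero in vcpu->arch.handling_intr_from_guest means "not currently handling an interrupt", so the enum values double as a boolean; x86.h keeps a helper along these lines (reproduced as a sketch from the kernel tree, not part of this diff):

	static inline bool kvm_handling_nmi_from_guest(struct kvm_vcpu *vcpu)
	{
		return vcpu->arch.handling_intr_from_guest == KVM_HANDLING_NMI;
	}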
*/ - KVM_HANDLING_IRQ = 1, - KVM_HANDLING_NMI, -}; - static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu, enum kvm_intr_type intr) { diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 4b4e738c6f1b..f65b35a05d91 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -10,7 +10,7 @@ #include "x86.h" #include "xen.h" #include "hyperv.h" -#include "lapic.h" +#include "irq.h" #include <linux/eventfd.h> #include <linux/kvm_host.h> @@ -24,6 +24,7 @@ #include <xen/interface/sched.h> #include <asm/xen/cpuid.h> +#include <asm/pvclock.h> #include "cpuid.h" #include "trace.h" @@ -34,41 +35,32 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r); DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ); -static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) +static int kvm_xen_shared_info_init(struct kvm *kvm) { struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; struct pvclock_wall_clock *wc; - gpa_t gpa = gfn_to_gpa(gfn); u32 *wc_sec_hi; u32 wc_version; u64 wall_nsec; int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - if (gfn == KVM_XEN_INVALID_GFN) { - kvm_gpc_deactivate(gpc); - goto out; - } + read_lock_irq(&gpc->lock); + while (!kvm_gpc_check(gpc, PAGE_SIZE)) { + read_unlock_irq(&gpc->lock); - do { - ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE); + ret = kvm_gpc_refresh(gpc, PAGE_SIZE); if (ret) goto out; - /* - * This code mirrors kvm_write_wall_clock() except that it writes - * directly through the pfn cache and doesn't mark the page dirty. - */ - wall_nsec = kvm_get_wall_clock_epoch(kvm); - - /* It could be invalid again already, so we need to check */ read_lock_irq(&gpc->lock); + } - if (gpc->valid) - break; - - read_unlock_irq(&gpc->lock); - } while (1); + /* + * This code mirrors kvm_write_wall_clock() except that it writes + * directly through the pfn cache and doesn't mark the page dirty. + */ + wall_nsec = kvm_get_wall_clock_epoch(kvm); /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); @@ -158,8 +150,93 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns) +static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, + bool linux_wa) { + int64_t kernel_now, delta; + uint64_t guest_now; + + /* + * The guest provides the requested timeout in absolute nanoseconds + * of the KVM clock — as *it* sees it, based on the scaled TSC and + * the pvclock information provided by KVM. + * + * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW + * so use CLOCK_MONOTONIC. In the timescales covered by timers, the + * difference won't matter much as there is no cumulative effect. + * + * Calculate the time for some arbitrary point in time around "now" + * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the + * delta between the kvmclock "now" value and the guest's requested + * timeout, apply the "Linux workaround" described below, and add + * the resulting delta to the CLOCK_MONOTONIC "now" value, to get + * the absolute CLOCK_MONOTONIC time at which the timer should + * fire. 
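Put concretely (a worked sketch with made-up numbers): if the guest asks for guest_abs = guest_now + 5 ms, then delta is 5000000 ns and the timer is armed at kernel_now + 5 ms; any constant offset between the kvmclock and CLOCK_MONOTONIC epochs cancels out of the subtraction:

	delta = guest_abs - guest_now;           /* 5000000 ns in this example */
	hrtimer_start(&vcpu->arch.xen.timer,
		      ktime_add_ns(kernel_now, delta),
		      HRTIMER_MODE_ABS_HARD);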
+ */ + if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock && + static_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + uint64_t host_tsc, guest_tsc; + + if (!IS_ENABLED(CONFIG_64BIT) || + !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) { + /* + * Don't fall back to get_kvmclock_ns() because it's + * broken; it has a systemic error in its results + * because it scales directly from host TSC to + * nanoseconds, and doesn't scale first to guest TSC + * and *then* to nanoseconds as the guest does. + * + * There is a small error introduced here because time + * continues to elapse between the ktime_get() and the + * subsequent rdtsc(). But not the systemic drift due + * to get_kvmclock_ns(). + */ + kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */ + host_tsc = rdtsc(); + } + + /* Calculate the guest kvmclock as the guest would do it. */ + guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc); + guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock, + guest_tsc); + } else { + /* + * Without CONSTANT_TSC, get_kvmclock_ns() is the only option. + * + * Also if the guest PV clock hasn't been set up yet, as is + * likely to be the case during migration when the vCPU has + * not been run yet. It would be possible to calculate the + * scaling factors properly in that case but there's not much + * point in doing so. The get_kvmclock_ns() drift accumulates + * over time, so it's OK to use it at startup. Besides, on + * migration there's going to be a little bit of skew in the + * precise moment at which timers fire anyway. Often they'll + * be in the "past" by the time the VM is running again after + * migration. + */ + guest_now = get_kvmclock_ns(vcpu->kvm); + kernel_now = ktime_get(); + } + + delta = guest_abs - guest_now; + + /* + * Xen has a 'Linux workaround' in do_set_timer_op() which checks for + * negative absolute timeout values (caused by integer overflow), and + * for values about 13 days in the future (2^50ns) which would be + * caused by jiffies overflow. For those cases, Xen sets the timeout + * 100ms in the future (not *too* soon, since if a guest really did + * set a long timeout on purpose we don't want to keep churning CPU + * time by waking it up). Emulate Xen's workaround when starting the + * timer in response to __HYPERVISOR_set_timer_op. + */ + if (linux_wa && + unlikely((int64_t)guest_abs < 0 || + (delta > 0 && (uint32_t) (delta >> 50) != 0))) { + delta = 100 * NSEC_PER_MSEC; + guest_abs = guest_now + delta; + } + /* * Avoid races with the old timer firing. 
Checking timer_expires * to avoid calling hrtimer_cancel() will only have false positives @@ -171,14 +248,12 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ atomic_set(&vcpu->arch.xen.timer_pending, 0); vcpu->arch.xen.timer_expires = guest_abs; - if (delta_ns <= 0) { + if (delta <= 0) xen_timer_callback(&vcpu->arch.xen.timer); - } else { - ktime_t ktime_now = ktime_get(); + else hrtimer_start(&vcpu->arch.xen.timer, - ktime_add_ns(ktime_now, delta_ns), + ktime_add_ns(kernel_now, delta), HRTIMER_MODE_ABS_HARD); - } } static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu) @@ -452,14 +527,13 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) smp_wmb(); } - if (user_len2) + if (user_len2) { + kvm_gpc_mark_dirty_in_slot(gpc2); read_unlock(&gpc2->lock); + } + kvm_gpc_mark_dirty_in_slot(gpc1); read_unlock_irqrestore(&gpc1->lock, flags); - - mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT); - if (user_len2) - mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT); } void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) @@ -493,10 +567,9 @@ void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable); } -static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) +void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) { struct kvm_lapic_irq irq = { }; - int r; irq.dest_id = v->vcpu_id; irq.vector = v->arch.xen.upcall_vector; @@ -505,8 +578,7 @@ static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v) irq.delivery_mode = APIC_DM_FIXED; irq.level = 1; - /* The fast version will always work for physical unicast */ - WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL)); + kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL); } /* @@ -565,13 +637,13 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v) : "0" (evtchn_pending_sel32)); WRITE_ONCE(vi->evtchn_upcall_pending, 1); } + + kvm_gpc_mark_dirty_in_slot(gpc); read_unlock_irqrestore(&gpc->lock, flags); /* For the per-vCPU lapic vector, deliver it as MSI. */ if (v->arch.xen.upcall_vector) kvm_xen_inject_vcpu_vector(v); - - mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); } int __kvm_xen_has_interrupt(struct kvm_vcpu *v) @@ -635,17 +707,59 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) } else { mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.long_mode = !!data->u.long_mode; + + /* + * Re-initialize shared_info to put the wallclock in the + * correct place. Whilst it's not necessary to do this + * unless the mode is actually changed, it does no harm + * to make the call anyway. + */ + r = kvm->arch.xen.shinfo_cache.active ? 
+ kvm_xen_shared_info_init(kvm) : 0; mutex_unlock(&kvm->arch.xen.xen_lock); - r = 0; } break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: { + int idx; + mutex_lock(&kvm->arch.xen.xen_lock); - r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); + + idx = srcu_read_lock(&kvm->srcu); + + if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) { + gfn_t gfn = data->u.shared_info.gfn; + + if (gfn == KVM_XEN_INVALID_GFN) { + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); + r = 0; + } else { + r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache, + gfn_to_gpa(gfn), PAGE_SIZE); + } + } else { + void __user * hva = u64_to_user_ptr(data->u.shared_info.hva); + + if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) { + r = -EINVAL; + } else if (!hva) { + kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache); + r = 0; + } else { + r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache, + (unsigned long)hva, PAGE_SIZE); + } + } + + srcu_read_unlock(&kvm->srcu, idx); + + if (!r && kvm->arch.xen.shinfo_cache.active) + r = kvm_xen_shared_info_init(kvm); + mutex_unlock(&kvm->arch.xen.xen_lock); break; - + } case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; @@ -699,13 +813,21 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (kvm->arch.xen.shinfo_cache.active) + if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache)) data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); else data->u.shared_info.gfn = KVM_XEN_INVALID_GFN; r = 0; break; + case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: + if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache)) + data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva; + else + data->u.shared_info.hva = 0; + r = 0; + break; + case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: data->u.vector = kvm->arch.xen.upcall_vector; r = 0; @@ -742,20 +864,33 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA: /* No compat necessary here. */ BUILD_BUG_ON(sizeof(struct vcpu_info) != sizeof(struct compat_vcpu_info)); BUILD_BUG_ON(offsetof(struct vcpu_info, time) != offsetof(struct compat_vcpu_info, time)); - if (data->u.gpa == KVM_XEN_INVALID_GPA) { - kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); - r = 0; - break; + if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) { + if (data->u.gpa == KVM_XEN_INVALID_GPA) { + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); + r = 0; + break; + } + + r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache, + data->u.gpa, sizeof(struct vcpu_info)); + } else { + if (data->u.hva == 0) { + kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); + r = 0; + break; + } + + r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache, + data->u.hva, sizeof(struct vcpu_info)); } - r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache, - data->u.gpa, sizeof(struct vcpu_info)); if (!r) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -944,9 +1079,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) /* Start the timer if the new value has a valid vector+expiry. 
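From userspace, the new HVA variant is set like any other Xen HVM attribute, as in this hypothetical VMM snippet (shinfo must be a page-aligned mapping that remains valid for the life of the VM):

	struct kvm_xen_hvm_attr attr = {
		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA,
		.u.shared_info.hva = (__u64)(uintptr_t)shinfo,
	};

	if (ioctl(vm_fd, KVM_XEN_HVM_SET_ATTR, &attr))
		err(1, "KVM_XEN_HVM_SET_ATTR");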
*/ if (data->u.timer.port && data->u.timer.expires_ns) - kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, - data->u.timer.expires_ns - - get_kvmclock_ns(vcpu->kvm)); + kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false); r = 0; break; @@ -977,13 +1110,21 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: - if (vcpu->arch.xen.vcpu_info_cache.active) + if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache)) data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa; else data->u.gpa = KVM_XEN_INVALID_GPA; r = 0; break; + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA: + if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache)) + data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva; + else + data->u.hva = 0; + r = 0; + break; + case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: if (vcpu->arch.xen.vcpu_time_info_cache.active) data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa; @@ -1093,9 +1234,24 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) u32 page_num = data & ~PAGE_MASK; u64 page_addr = data & PAGE_MASK; bool lm = is_long_mode(vcpu); + int r = 0; + + mutex_lock(&kvm->arch.xen.xen_lock); + if (kvm->arch.xen.long_mode != lm) { + kvm->arch.xen.long_mode = lm; + + /* + * Re-initialize shared_info to put the wallclock in the + * correct place. + */ + if (kvm->arch.xen.shinfo_cache.active && + kvm_xen_shared_info_init(kvm)) + r = 1; + } + mutex_unlock(&kvm->arch.xen.xen_lock); - /* Latch long_mode for shared_info pages etc. */ - vcpu->kvm->arch.xen.long_mode = lm; + if (r) + return r; /* * If Xen hypercall intercept is enabled, fill the hypercall @@ -1396,7 +1552,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, { struct vcpu_set_singleshot_timer oneshot; struct x86_exception e; - s64 delta; if (!kvm_xen_timer_enabled(vcpu)) return false; @@ -1430,9 +1585,7 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, return true; } - /* A delta <= 0 results in an immediate callback, which is what we want */ - delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm); - kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta); + kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false); *r = 0; return true; @@ -1455,29 +1608,10 @@ static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout, if (!kvm_xen_timer_enabled(vcpu)) return false; - if (timeout) { - uint64_t guest_now = get_kvmclock_ns(vcpu->kvm); - int64_t delta = timeout - guest_now; - - /* Xen has a 'Linux workaround' in do_set_timer_op() which - * checks for negative absolute timeout values (caused by - * integer overflow), and for values about 13 days in the - * future (2^50ns) which would be caused by jiffies - * overflow. For those cases, it sets the timeout 100ms in - * the future (not *too* soon, since if a guest really did - * set a long timeout on purpose we don't want to keep - * churning CPU time by waking it up). 
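For scale, the magic shift is checking for timeouts more than 2^50 ns out:

	/*
	 * 2^50 ns = 1125899906842624 ns ~= 1125900 s ~= 13.03 days, so
	 * (uint32_t)(delta >> 50) != 0 flags any timeout more than ~13 days
	 * in the future, the regime Xen attributes to guest jiffies overflow.
	 */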
@@ -1455,29 +1608,10 @@ static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
 	if (!kvm_xen_timer_enabled(vcpu))
 		return false;

-	if (timeout) {
-		uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
-		int64_t delta = timeout - guest_now;
-
-		/* Xen has a 'Linux workaround' in do_set_timer_op() which
-		 * checks for negative absolute timeout values (caused by
-		 * integer overflow), and for values about 13 days in the
-		 * future (2^50ns) which would be caused by jiffies
-		 * overflow. For those cases, it sets the timeout 100ms in
-		 * the future (not *too* soon, since if a guest really did
-		 * set a long timeout on purpose we don't want to keep
-		 * churning CPU time by waking it up).
-		 */
-		if (unlikely((int64_t)timeout < 0 ||
-			     (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
-			delta = 100 * NSEC_PER_MSEC;
-			timeout = guest_now + delta;
-		}
-
-		kvm_xen_start_timer(vcpu, timeout, delta);
-	} else {
+	if (timeout)
+		kvm_xen_start_timer(vcpu, timeout, true);
+	else
 		kvm_xen_stop_timer(vcpu);
-	}

 	*r = 0;
 	return true;
@@ -1621,9 +1755,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 		WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
 	}

-	if (!vcpu->arch.xen.vcpu_info_cache.active)
-		return -EINVAL;
-
 	if (xe->port >= max_evtchn_port(kvm))
 		return -EINVAL;
@@ -1731,8 +1862,6 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 		mm_borrowed = true;
 	}

-	mutex_lock(&kvm->arch.xen.xen_lock);
-
 	/*
 	 * It is theoretically possible for the page to be unmapped
 	 * and the MMU notifier to invalidate the shared_info before
@@ -1760,8 +1889,6 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
 		srcu_read_unlock(&kvm->srcu, idx);
 	} while(!rc);

-	mutex_unlock(&kvm->arch.xen.xen_lock);
-
 	if (mm_borrowed)
 		kthread_unuse_mm(kvm->mm);
@@ -2109,14 +2236,10 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)

 	timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);

-	kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL,
-		     KVM_HOST_USES_PFN);
-	kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL,
-		     KVM_HOST_USES_PFN);
-	kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL,
-		     KVM_HOST_USES_PFN);
-	kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL,
-		     KVM_HOST_USES_PFN);
+	kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
+	kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
+	kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm);
+	kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm);
 }

 void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
@@ -2159,7 +2282,7 @@ void kvm_xen_init_vm(struct kvm *kvm)
 {
 	mutex_init(&kvm->arch.xen.xen_lock);
 	idr_init(&kvm->arch.xen.evtchn_ports);
-	kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
+	kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm);
 }

 void kvm_xen_destroy_vm(struct kvm *kvm)
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index f8f1fe22d090..f5841d9000ae 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -18,6 +18,7 @@ extern struct static_key_false_deferred kvm_xen_enabled;

 int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
 void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *vcpu);
 int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
 int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
 int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
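The gfn_to_pfn_cache changes above are part of a wider pfncache rework: the KVM_HOST_USES_PFN usage argument is gone from kvm_gpc_init(), and a cache can now be activated either by guest physical address or by a fixed host virtual address. Inferred from the call sites in this diff (the authoritative prototypes live in the common KVM headers, which are not part of this section), the API shape is roughly:

#include <stdbool.h>

struct kvm;
struct gfn_to_pfn_cache;
typedef unsigned long gpa_t;	/* placeholder; u64 in real KVM headers */

/* The NULL vcpu and KVM_HOST_USES_PFN arguments have been dropped. */
void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm);

/* Activate by guest physical address (the existing path)... */
int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa,
		     unsigned long len);

/* ...or by a fixed userspace virtual address (new in this series). */
int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long uhva,
			 unsigned long len);

/* Report which flavour, if any, is currently active. */
bool kvm_gpc_is_gpa_active(struct gfn_to_pfn_cache *gpc);
bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc);

void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc);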
@@ -36,6 +37,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
 			 const struct kvm_irq_routing_entry *ue);
 void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu);

+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * The local APIC is being enabled. If the per-vCPU upcall vector is
+	 * set and the vCPU's evtchn_upcall_pending flag is set, inject the
+	 * interrupt.
+	 */
+	if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+	    vcpu->arch.xen.vcpu_info_cache.active &&
+	    vcpu->arch.xen.upcall_vector && __kvm_xen_has_interrupt(vcpu))
+		kvm_xen_inject_vcpu_vector(vcpu);
+}
+
 static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
 {
 	return static_branch_unlikely(&kvm_xen_enabled.key) &&
@@ -101,6 +115,10 @@ static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
 {
 }

+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+}
+
 static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
 {
 	return false;
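Finally, the per-vCPU analogue of the shared_info change: with KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA the vcpu_info can likewise be registered by fixed HVA. A hypothetical usage sketch against the vCPU ioctl, again assuming UAPI headers from a kernel with this series:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vcpu_info must remain valid in the VMM's address space while active. */
static int set_vcpu_info_by_hva(int vcpu_fd, void *vcpu_info)
{
	struct kvm_xen_vcpu_attr va;

	memset(&va, 0, sizeof(va));
	va.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA;
	va.u.hva = (unsigned long)vcpu_info;	/* 0 deactivates the cache */

	return ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va);
}

Note that kvm_xen_sw_enable_lapic() above only injects the upcall vector when the vcpu_info cache is active, so a VMM using the HVA path still has to register the vcpu_info before enabling the guest's local APIC expects callbacks.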