summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/idtentry.h2
-rw-r--r--arch/x86/include/asm/irq.h2
-rw-r--r--arch/x86/include/asm/irq_vectors.h2
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm-x86-pmu-ops.h4
-rw-r--r--arch/x86/include/asm/kvm_host.h28
-rw-r--r--arch/x86/include/asm/svm.h8
-rw-r--r--arch/x86/include/asm/vmxfeatures.h1
-rw-r--r--arch/x86/include/uapi/asm/kvm.h285
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h2
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c2
-rw-r--r--arch/x86/kernel/idt.c2
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kvm/Kconfig4
-rw-r--r--arch/x86/kvm/debugfs.c3
-rw-r--r--arch/x86/kvm/emulate.c47
-rw-r--r--arch/x86/kvm/kvm_emulate.h4
-rw-r--r--arch/x86/kvm/lapic.c32
-rw-r--r--arch/x86/kvm/mmu/mmu.c37
-rw-r--r--arch/x86/kvm/mmu/page_track.c68
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c124
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h2
-rw-r--r--arch/x86/kvm/pmu.c163
-rw-r--r--arch/x86/kvm/pmu.h57
-rw-r--r--arch/x86/kvm/smm.c15
-rw-r--r--arch/x86/kvm/svm/pmu.c22
-rw-r--r--arch/x86/kvm/svm/svm.c25
-rw-r--r--arch/x86/kvm/trace.h9
-rw-r--r--arch/x86/kvm/vmx/nested.c4
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c220
-rw-r--r--arch/x86/kvm/vmx/vmx.c157
-rw-r--r--arch/x86/kvm/vmx/vmx.h2
-rw-r--r--arch/x86/kvm/x86.c228
-rw-r--r--arch/x86/kvm/x86.h7
-rw-r--r--arch/x86/kvm/xen.c315
-rw-r--r--arch/x86/kvm/xen.h18
38 files changed, 1253 insertions, 656 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dd79e15a3c93..7aed87cbf386 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -245,7 +245,6 @@ config X86
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_KRETPROBES
select HAVE_RETHOOK
- select HAVE_KVM
select HAVE_LIVEPATCH if X86_64
select HAVE_MIXED_BREAKPOINTS_REGS
select HAVE_MOD_ARCH_SPECIFIC
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 66837b8c67f1..fbc7722b87d1 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -15,7 +15,7 @@ typedef struct {
unsigned int irq_spurious_count;
unsigned int icr_read_retry_count;
#endif
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
unsigned int kvm_posted_intr_ipis;
unsigned int kvm_posted_intr_wakeup_ipis;
unsigned int kvm_posted_intr_nested_ipis;
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 47d4c04d103d..749c7411d2f1 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -741,7 +741,7 @@ DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work);
# endif
#endif
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi);
DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi);
DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi);
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 836c170d3087..194dfff84cb1 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -29,7 +29,7 @@ struct irq_desc;
extern void fixup_irqs(void);
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
#endif
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3a19904c2db6..d18bfb238f66 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -84,11 +84,9 @@
#define HYPERVISOR_CALLBACK_VECTOR 0xf3
/* Vector for KVM to deliver posted interrupt IPI */
-#ifdef CONFIG_HAVE_KVM
#define POSTED_INTR_VECTOR 0xf2
#define POSTED_INTR_WAKEUP_VECTOR 0xf1
#define POSTED_INTR_NESTED_VECTOR 0xf0
-#endif
#define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index ab24ce207988..110d7f29ca9a 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -103,7 +103,6 @@ KVM_X86_OP(write_tsc_multiplier)
KVM_X86_OP(get_exit_info)
KVM_X86_OP(check_intercept)
KVM_X86_OP(handle_exit_irqoff)
-KVM_X86_OP(request_immediate_exit)
KVM_X86_OP(sched_in)
KVM_X86_OP_OPTIONAL(update_cpu_dirty_logging)
KVM_X86_OP_OPTIONAL(vcpu_blocking)
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
index 058bc636356a..f852b13aeefe 100644
--- a/arch/x86/include/asm/kvm-x86-pmu-ops.h
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -12,11 +12,9 @@ BUILD_BUG_ON(1)
* a NULL definition, for example if "static_call_cond()" will be used
* at the call sites.
*/
-KVM_X86_PMU_OP(hw_event_available)
-KVM_X86_PMU_OP(pmc_idx_to_pmc)
KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
KVM_X86_PMU_OP(msr_idx_to_pmc)
-KVM_X86_PMU_OP(is_valid_rdpmc_ecx)
+KVM_X86_PMU_OP_OPTIONAL(check_rdpmc_early)
KVM_X86_PMU_OP(is_valid_msr)
KVM_X86_PMU_OP(get_msr)
KVM_X86_PMU_OP(set_msr)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 18cbde14cf81..16e07a2eee19 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -536,6 +536,7 @@ struct kvm_pmc {
#define KVM_PMC_MAX_FIXED 3
#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
#define KVM_AMD_PMC_MAX_GENERIC 6
+
struct kvm_pmu {
u8 version;
unsigned nr_arch_gp_counters;
@@ -1468,6 +1469,15 @@ struct kvm_arch {
*/
bool shadow_root_allocated;
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+ /*
+ * If set, the VM has (or had) an external write tracking user, and
+ * thus all write tracking metadata has been allocated, even if KVM
+ * itself isn't using write tracking.
+ */
+ bool external_write_tracking_enabled;
+#endif
+
#if IS_ENABLED(CONFIG_HYPERV)
hpa_t hv_root_tdp;
spinlock_t hv_root_tdp_lock;
@@ -1665,7 +1675,8 @@ struct kvm_x86_ops {
void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);
int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
- enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu);
+ enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
@@ -1733,8 +1744,6 @@ struct kvm_x86_ops {
struct x86_exception *exception);
void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
- void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
-
void (*sched_in)(struct kvm_vcpu *vcpu, int cpu);
/*
@@ -1882,8 +1891,16 @@ static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn,
}
#endif /* CONFIG_HYPERV */
+enum kvm_intr_type {
+ /* Values are arbitrary, but must be non-zero. */
+ KVM_HANDLING_IRQ = 1,
+ KVM_HANDLING_NMI,
+};
+
+/* Enable perf NMI and timer modes to work, and minimise false positives. */
#define kvm_arch_pmi_in_guest(vcpu) \
- ((vcpu) && (vcpu)->arch.handling_intr_from_guest)
+ ((vcpu) && (vcpu)->arch.handling_intr_from_guest && \
+ (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI)))
void __init kvm_mmu_x86_module_init(void);
int kvm_mmu_vendor_module_init(void);
@@ -2048,7 +2065,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
-void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
@@ -2241,7 +2258,6 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
u32 size);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 87a7b917d30e..728c98175b9c 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -358,10 +358,10 @@ struct sev_es_save_area {
struct vmcb_seg ldtr;
struct vmcb_seg idtr;
struct vmcb_seg tr;
- u64 vmpl0_ssp;
- u64 vmpl1_ssp;
- u64 vmpl2_ssp;
- u64 vmpl3_ssp;
+ u64 pl0_ssp;
+ u64 pl1_ssp;
+ u64 pl2_ssp;
+ u64 pl3_ssp;
u64 u_cet;
u8 reserved_0xc8[2];
u8 vmpl;
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index c6a7eed03914..266daf5b5b84 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -25,6 +25,7 @@
#define VMX_FEATURE_EPT_EXECUTE_ONLY ( 0*32+ 17) /* "ept_x_only" EPT entries can be execute only */
#define VMX_FEATURE_EPT_AD ( 0*32+ 18) /* EPT Accessed/Dirty bits */
#define VMX_FEATURE_EPT_1GB ( 0*32+ 19) /* 1GB EPT pages */
+#define VMX_FEATURE_EPT_5LEVEL ( 0*32+ 20) /* 5-level EPT paging */
/* Aggregated APIC features 24-27 */
#define VMX_FEATURE_FLEXPRIORITY ( 0*32+ 24) /* TPR shadow + virt APIC */
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a448d0964fc0..ad29984d5e39 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -7,6 +7,8 @@
*
*/
+#include <linux/const.h>
+#include <linux/bits.h>
#include <linux/types.h>
#include <linux/ioctl.h>
#include <linux/stddef.h>
@@ -40,7 +42,6 @@
#define __KVM_HAVE_IRQ_LINE
#define __KVM_HAVE_MSI
#define __KVM_HAVE_USER_NMI
-#define __KVM_HAVE_GUEST_DEBUG
#define __KVM_HAVE_MSIX
#define __KVM_HAVE_MCE
#define __KVM_HAVE_PIT_STATE2
@@ -49,7 +50,6 @@
#define __KVM_HAVE_DEBUGREGS
#define __KVM_HAVE_XSAVE
#define __KVM_HAVE_XCRS
-#define __KVM_HAVE_READONLY_MEM
/* Architectural interrupt line count. */
#define KVM_NR_INTERRUPTS 256
@@ -526,9 +526,278 @@ struct kvm_pmu_event_filter {
#define KVM_PMU_EVENT_ALLOW 0
#define KVM_PMU_EVENT_DENY 1
-#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0)
+#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS _BITUL(0)
#define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
+/* for KVM_CAP_MCE */
+struct kvm_x86_mce {
+ __u64 status;
+ __u64 addr;
+ __u64 misc;
+ __u64 mcg_status;
+ __u8 bank;
+ __u8 pad1[7];
+ __u64 pad2[3];
+};
+
+/* for KVM_CAP_XEN_HVM */
+#define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0)
+#define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1)
+#define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2)
+#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4)
+#define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5)
+#define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6)
+#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7)
+#define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8)
+
+struct kvm_xen_hvm_config {
+ __u32 flags;
+ __u32 msr;
+ __u64 blob_addr_32;
+ __u64 blob_addr_64;
+ __u8 blob_size_32;
+ __u8 blob_size_64;
+ __u8 pad2[30];
+};
+
+struct kvm_xen_hvm_attr {
+ __u16 type;
+ __u16 pad[3];
+ union {
+ __u8 long_mode;
+ __u8 vector;
+ __u8 runstate_update_flag;
+ union {
+ __u64 gfn;
+#define KVM_XEN_INVALID_GFN ((__u64)-1)
+ __u64 hva;
+ } shared_info;
+ struct {
+ __u32 send_port;
+ __u32 type; /* EVTCHNSTAT_ipi / EVTCHNSTAT_interdomain */
+ __u32 flags;
+#define KVM_XEN_EVTCHN_DEASSIGN (1 << 0)
+#define KVM_XEN_EVTCHN_UPDATE (1 << 1)
+#define KVM_XEN_EVTCHN_RESET (1 << 2)
+ /*
+ * Events sent by the guest are either looped back to
+ * the guest itself (potentially on a different port#)
+ * or signalled via an eventfd.
+ */
+ union {
+ struct {
+ __u32 port;
+ __u32 vcpu;
+ __u32 priority;
+ } port;
+ struct {
+ __u32 port; /* Zero for eventfd */
+ __s32 fd;
+ } eventfd;
+ __u32 padding[4];
+ } deliver;
+ } evtchn;
+ __u32 xen_version;
+ __u64 pad[8];
+ } u;
+};
+
+
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
+#define KVM_XEN_ATTR_TYPE_LONG_MODE 0x0
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO 0x1
+#define KVM_XEN_ATTR_TYPE_UPCALL_VECTOR 0x2
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_ATTR_TYPE_EVTCHN 0x3
+#define KVM_XEN_ATTR_TYPE_XEN_VERSION 0x4
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG */
+#define KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG 0x5
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
+#define KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA 0x6
+
+struct kvm_xen_vcpu_attr {
+ __u16 type;
+ __u16 pad[3];
+ union {
+ __u64 gpa;
+#define KVM_XEN_INVALID_GPA ((__u64)-1)
+ __u64 hva;
+ __u64 pad[8];
+ struct {
+ __u64 state;
+ __u64 state_entry_time;
+ __u64 time_running;
+ __u64 time_runnable;
+ __u64 time_blocked;
+ __u64 time_offline;
+ } runstate;
+ __u32 vcpu_id;
+ struct {
+ __u32 port;
+ __u32 priority;
+ __u64 expires_ns;
+ } timer;
+ __u8 vector;
+ } u;
+};
+
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4
+#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_EVTCHN_SEND */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID 0x6
+#define KVM_XEN_VCPU_ATTR_TYPE_TIMER 0x7
+#define KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR 0x8
+/* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA */
+#define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA 0x9
+
+/* Secure Encrypted Virtualization command */
+enum sev_cmd_id {
+ /* Guest initialization commands */
+ KVM_SEV_INIT = 0,
+ KVM_SEV_ES_INIT,
+ /* Guest launch commands */
+ KVM_SEV_LAUNCH_START,
+ KVM_SEV_LAUNCH_UPDATE_DATA,
+ KVM_SEV_LAUNCH_UPDATE_VMSA,
+ KVM_SEV_LAUNCH_SECRET,
+ KVM_SEV_LAUNCH_MEASURE,
+ KVM_SEV_LAUNCH_FINISH,
+ /* Guest migration commands (outgoing) */
+ KVM_SEV_SEND_START,
+ KVM_SEV_SEND_UPDATE_DATA,
+ KVM_SEV_SEND_UPDATE_VMSA,
+ KVM_SEV_SEND_FINISH,
+ /* Guest migration commands (incoming) */
+ KVM_SEV_RECEIVE_START,
+ KVM_SEV_RECEIVE_UPDATE_DATA,
+ KVM_SEV_RECEIVE_UPDATE_VMSA,
+ KVM_SEV_RECEIVE_FINISH,
+ /* Guest status and debug commands */
+ KVM_SEV_GUEST_STATUS,
+ KVM_SEV_DBG_DECRYPT,
+ KVM_SEV_DBG_ENCRYPT,
+ /* Guest certificates commands */
+ KVM_SEV_CERT_EXPORT,
+ /* Attestation report */
+ KVM_SEV_GET_ATTESTATION_REPORT,
+ /* Guest Migration Extension */
+ KVM_SEV_SEND_CANCEL,
+
+ KVM_SEV_NR_MAX,
+};
+
+struct kvm_sev_cmd {
+ __u32 id;
+ __u64 data;
+ __u32 error;
+ __u32 sev_fd;
+};
+
+struct kvm_sev_launch_start {
+ __u32 handle;
+ __u32 policy;
+ __u64 dh_uaddr;
+ __u32 dh_len;
+ __u64 session_uaddr;
+ __u32 session_len;
+};
+
+struct kvm_sev_launch_update_data {
+ __u64 uaddr;
+ __u32 len;
+};
+
+
+struct kvm_sev_launch_secret {
+ __u64 hdr_uaddr;
+ __u32 hdr_len;
+ __u64 guest_uaddr;
+ __u32 guest_len;
+ __u64 trans_uaddr;
+ __u32 trans_len;
+};
+
+struct kvm_sev_launch_measure {
+ __u64 uaddr;
+ __u32 len;
+};
+
+struct kvm_sev_guest_status {
+ __u32 handle;
+ __u32 policy;
+ __u32 state;
+};
+
+struct kvm_sev_dbg {
+ __u64 src_uaddr;
+ __u64 dst_uaddr;
+ __u32 len;
+};
+
+struct kvm_sev_attestation_report {
+ __u8 mnonce[16];
+ __u64 uaddr;
+ __u32 len;
+};
+
+struct kvm_sev_send_start {
+ __u32 policy;
+ __u64 pdh_cert_uaddr;
+ __u32 pdh_cert_len;
+ __u64 plat_certs_uaddr;
+ __u32 plat_certs_len;
+ __u64 amd_certs_uaddr;
+ __u32 amd_certs_len;
+ __u64 session_uaddr;
+ __u32 session_len;
+};
+
+struct kvm_sev_send_update_data {
+ __u64 hdr_uaddr;
+ __u32 hdr_len;
+ __u64 guest_uaddr;
+ __u32 guest_len;
+ __u64 trans_uaddr;
+ __u32 trans_len;
+};
+
+struct kvm_sev_receive_start {
+ __u32 handle;
+ __u32 policy;
+ __u64 pdh_uaddr;
+ __u32 pdh_len;
+ __u64 session_uaddr;
+ __u32 session_len;
+};
+
+struct kvm_sev_receive_update_data {
+ __u64 hdr_uaddr;
+ __u32 hdr_len;
+ __u64 guest_uaddr;
+ __u32 guest_len;
+ __u64 trans_uaddr;
+ __u32 trans_len;
+};
+
+#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+
+struct kvm_hyperv_eventfd {
+ __u32 conn_id;
+ __s32 fd;
+ __u32 flags;
+ __u32 padding[3];
+};
+
+#define KVM_HYPERV_CONN_ID_MASK 0x00ffffff
+#define KVM_HYPERV_EVENTFD_DEASSIGN (1 << 0)
+
/*
* Masked event layout.
* Bits Description
@@ -549,10 +818,10 @@ struct kvm_pmu_event_filter {
((__u64)(!!(exclude)) << 55))
#define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \
- (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32))
-#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56))
-#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8))
-#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55))
+ (__GENMASK_ULL(7, 0) | __GENMASK_ULL(35, 32))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (__GENMASK_ULL(63, 56))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (__GENMASK_ULL(15, 8))
+#define KVM_PMU_MASKED_ENTRY_EXCLUDE (_BITULL(55))
#define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56)
/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
@@ -560,7 +829,7 @@ struct kvm_pmu_event_filter {
#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
/* x86-specific KVM_EXIT_HYPERCALL flags. */
-#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0)
+#define KVM_EXIT_HYPERCALL_LONG_MODE _BITULL(0)
#define KVM_X86_DEFAULT_VM 0
#define KVM_X86_SW_PROTECTED_VM 1
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 6e64b27b2c1e..6bc3456a8ebf 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -92,7 +92,7 @@ struct kvm_clock_pairing {
#define KVM_ASYNC_PF_DELIVERY_AS_INT (1 << 3)
/* MSR_KVM_ASYNC_PF_INT */
-#define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0)
+#define KVM_ASYNC_PF_VEC_MASK __GENMASK(7, 0)
/* MSR_KVM_MIGRATION_CONTROL */
#define KVM_MIGRATION_READY (1 << 0)
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 03851240c3e3..1640ae76548f 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -72,6 +72,8 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD);
if (ept & VMX_EPT_1GB_PAGE_BIT)
c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB);
+ if (ept & VMX_EPT_PAGE_WALK_5_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_5LEVEL);
/* Synthetic APIC features that are aggregates of multiple features. */
if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 0cd53fa8c65d..fc37c8d83daf 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -153,7 +153,7 @@ static const __initconst struct idt_data apic_idts[] = {
#ifdef CONFIG_X86_LOCAL_APIC
INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt),
INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi),
-# ifdef CONFIG_HAVE_KVM
+# if IS_ENABLED(CONFIG_KVM)
INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi),
INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi),
INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi),
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 11761c124545..35fde0107901 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -164,7 +164,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
#endif
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
seq_printf(p, "%*s: ", prec, "PIN");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
@@ -290,7 +290,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi)
}
#endif
-#ifdef CONFIG_HAVE_KVM
+#if IS_ENABLED(CONFIG_KVM)
static void dummy_handler(void) {}
static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 65ed14b6540b..8c3032a96caf 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -7,7 +7,6 @@ source "virt/kvm/Kconfig"
menuconfig VIRTUALIZATION
bool "Virtualization"
- depends on HAVE_KVM || X86
default y
help
Say Y here to get to see options for using your Linux host to run other
@@ -20,7 +19,6 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
- depends on HAVE_KVM
depends on HIGH_RES_TIMERS
depends on X86_LOCAL_APIC
select KVM_COMMON
@@ -29,9 +27,9 @@ config KVM
select HAVE_KVM_PFNCACHE
select HAVE_KVM_DIRTY_RING_TSO
select HAVE_KVM_DIRTY_RING_ACQ_REL
- select IRQ_BYPASS_MANAGER
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
+ select HAVE_KVM_READONLY_MEM
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 95ea1a1f7403..999227fc7c66 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -189,9 +189,8 @@ static const struct file_operations mmu_rmaps_stat_fops = {
.release = kvm_mmu_rmaps_stat_release,
};
-int kvm_arch_create_vm_debugfs(struct kvm *kvm)
+void kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm,
&mmu_rmaps_stat_fops);
- return 0;
}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e223043ef5b2..5d4c86133453 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1820,22 +1820,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
return X86EMUL_CONTINUE;
}
-static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
+static int emulate_push(struct x86_emulate_ctxt *ctxt, const void *data, int len)
{
struct segmented_address addr;
- rsp_increment(ctxt, -bytes);
+ rsp_increment(ctxt, -len);
addr.ea = reg_read(ctxt, VCPU_REGS_RSP) & stack_mask(ctxt);
addr.seg = VCPU_SREG_SS;
- return segmented_write(ctxt, addr, data, bytes);
+ return segmented_write(ctxt, addr, data, len);
}
static int em_push(struct x86_emulate_ctxt *ctxt)
{
/* Disable writeback. */
ctxt->dst.type = OP_NONE;
- return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
+ return emulate_push(ctxt, &ctxt->src.val, ctxt->op_bytes);
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1863,7 +1863,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
void *dest, int len)
{
int rc;
- unsigned long val, change_mask;
+ unsigned long val = 0;
+ unsigned long change_mask;
int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
int cpl = ctxt->ops->cpl(ctxt);
@@ -1920,7 +1921,7 @@ static int em_enter(struct x86_emulate_ctxt *ctxt)
return X86EMUL_UNHANDLEABLE;
rbp = reg_read(ctxt, VCPU_REGS_RBP);
- rc = push(ctxt, &rbp, stack_size(ctxt));
+ rc = emulate_push(ctxt, &rbp, stack_size(ctxt));
if (rc != X86EMUL_CONTINUE)
return rc;
assign_masked(reg_rmw(ctxt, VCPU_REGS_RBP), reg_read(ctxt, VCPU_REGS_RSP),
@@ -1954,7 +1955,7 @@ static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
{
int seg = ctxt->src2.val;
- unsigned long selector;
+ unsigned long selector = 0;
int rc;
rc = emulate_pop(ctxt, &selector, 2);
@@ -2000,7 +2001,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
{
int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RDI;
- u32 val;
+ u32 val = 0;
while (reg >= VCPU_REGS_RAX) {
if (reg == VCPU_REGS_RSP) {
@@ -2229,7 +2230,7 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
static int em_ret(struct x86_emulate_ctxt *ctxt)
{
int rc;
- unsigned long eip;
+ unsigned long eip = 0;
rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -2241,7 +2242,8 @@ static int em_ret(struct x86_emulate_ctxt *ctxt)
static int em_ret_far(struct x86_emulate_ctxt *ctxt)
{
int rc;
- unsigned long eip, cs;
+ unsigned long eip = 0;
+ unsigned long cs = 0;
int cpl = ctxt->ops->cpl(ctxt);
struct desc_struct new_desc;
@@ -3011,7 +3013,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ret = em_push(ctxt);
}
- ops->get_dr(ctxt, 7, &dr7);
+ dr7 = ops->get_dr(ctxt, 7);
ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
return ret;
@@ -3184,7 +3186,7 @@ fail:
static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
{
int rc;
- unsigned long eip;
+ unsigned long eip = 0;
rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -3866,15 +3868,6 @@ static int check_cr_access(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
-{
- unsigned long dr7;
-
- ctxt->ops->get_dr(ctxt, 7, &dr7);
-
- return dr7 & DR7_GD;
-}
-
static int check_dr_read(struct x86_emulate_ctxt *ctxt)
{
int dr = ctxt->modrm_reg;
@@ -3887,10 +3880,10 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
return emulate_ud(ctxt);
- if (check_dr7_gd(ctxt)) {
+ if (ctxt->ops->get_dr(ctxt, 7) & DR7_GD) {
ulong dr6;
- ctxt->ops->get_dr(ctxt, 6, &dr6);
+ dr6 = ctxt->ops->get_dr(ctxt, 6);
dr6 &= ~DR_TRAP_BITS;
dr6 |= DR6_BD | DR6_ACTIVE_LOW;
ctxt->ops->set_dr(ctxt, 6, dr6);
@@ -3962,7 +3955,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
* protected mode.
*/
if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
- ctxt->ops->check_pmc(ctxt, rcx))
+ ctxt->ops->check_rdpmc_early(ctxt, rcx))
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
@@ -4505,11 +4498,11 @@ static const struct instr_dual instr_dual_0f_38_f1 = {
};
static const struct gprefix three_byte_0f_38_f0 = {
- ID(0, &instr_dual_0f_38_f0), N, N, N
+ ID(0, &instr_dual_0f_38_f0), ID(0, &instr_dual_0f_38_f0), N, N
};
static const struct gprefix three_byte_0f_38_f1 = {
- ID(0, &instr_dual_0f_38_f1), N, N, N
+ ID(0, &instr_dual_0f_38_f1), ID(0, &instr_dual_0f_38_f1), N, N
};
/*
@@ -5449,7 +5442,7 @@ twobyte_insn:
ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
break;
case 0x21: /* mov from dr to reg */
- ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
+ ctxt->dst.val = ops->get_dr(ctxt, ctxt->modrm_reg);
break;
case 0x40 ... 0x4f: /* cmov */
if (test_cc(ctxt->b, ctxt->eflags))
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index e6d149825169..5382646162a3 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -203,12 +203,12 @@ struct x86_emulate_ops {
ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
int (*cpl)(struct x86_emulate_ctxt *ctxt);
- void (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
+ ulong (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr);
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
int (*set_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr_with_filter)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
- int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
+ int (*check_rdpmc_early)(struct x86_emulate_ctxt *ctxt, u32 pmc);
int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
void (*halt)(struct x86_emulate_ctxt *ctxt);
void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1edf93ee3395..cf37586f0466 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -41,6 +41,7 @@
#include "ioapic.h"
#include "trace.h"
#include "x86.h"
+#include "xen.h"
#include "cpuid.h"
#include "hyperv.h"
#include "smm.h"
@@ -124,6 +125,9 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
+__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
+EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
+
__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_hw_disabled, HZ);
__read_mostly DEFINE_STATIC_KEY_DEFERRED_FALSE(apic_sw_disabled, HZ);
@@ -499,8 +503,10 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
}
/* Check if there are APF page ready requests pending */
- if (enabled)
+ if (enabled) {
kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
+ kvm_xen_sw_enable_lapic(apic->vcpu);
+ }
}
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
@@ -2466,8 +2472,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!vcpu->arch.apic)
+ if (!vcpu->arch.apic) {
+ static_branch_dec(&kvm_has_noapic_vcpu);
return;
+ }
hrtimer_cancel(&apic->lapic_timer.timer);
@@ -2809,6 +2817,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
ASSERT(vcpu != NULL);
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ static_branch_inc(&kvm_has_noapic_vcpu);
+ return 0;
+ }
+
apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
if (!apic)
goto nomem;
@@ -2847,6 +2860,21 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
+ /*
+ * Defer evaluating inhibits until the vCPU is first run, as this vCPU
+ * will not get notified of any changes until this vCPU is visible to
+ * other vCPUs (marked online and added to the set of vCPUs).
+ *
+ * Opportunistically mark APICv active as VMX in particularly is highly
+ * unlikely to have inhibits. Ignore the current per-VM APICv state so
+ * that vCPU creation is guaranteed to run with a deterministic value,
+ * the request will ensure the vCPU gets the correct state before VM-Entry.
+ */
+ if (enable_apicv) {
+ apic->apicv_active = true;
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+ }
+
return 0;
nomem_free_apic:
kfree(apic);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7c9fce512625..992e651540e8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3575,10 +3575,14 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
if (WARN_ON_ONCE(!sp))
return;
- if (is_tdp_mmu_page(sp))
+ if (is_tdp_mmu_page(sp)) {
+ lockdep_assert_held_read(&kvm->mmu_lock);
kvm_tdp_mmu_put_root(kvm, sp);
- else if (!--sp->root_count && sp->role.invalid)
- kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ } else {
+ lockdep_assert_held_write(&kvm->mmu_lock);
+ if (!--sp->root_count && sp->role.invalid)
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+ }
*root_hpa = INVALID_PAGE;
}
@@ -3587,6 +3591,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free)
{
+ bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
int i;
LIST_HEAD(invalid_list);
bool free_active_root;
@@ -3609,7 +3614,10 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
return;
}
- write_lock(&kvm->mmu_lock);
+ if (is_tdp_mmu)
+ read_lock(&kvm->mmu_lock);
+ else
+ write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
@@ -3635,8 +3643,13 @@ void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
mmu->root.pgd = 0;
}
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
- write_unlock(&kvm->mmu_lock);
+ if (is_tdp_mmu) {
+ read_unlock(&kvm->mmu_lock);
+ WARN_ON_ONCE(!list_empty(&invalid_list));
+ } else {
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ write_unlock(&kvm->mmu_lock);
+ }
}
EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
@@ -3693,15 +3706,15 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
unsigned i;
int r;
+ if (tdp_mmu_enabled)
+ return kvm_tdp_mmu_alloc_root(vcpu);
+
write_lock(&vcpu->kvm->mmu_lock);
r = make_mmu_pages_available(vcpu);
if (r < 0)
goto out_unlock;
- if (tdp_mmu_enabled) {
- root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
- mmu->root.hpa = root;
- } else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
+ if (shadow_root_level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level);
mmu->root.hpa = root;
} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
@@ -7039,9 +7052,7 @@ int kvm_mmu_vendor_module_init(void)
kvm_mmu_reset_all_pte_masks();
- pte_list_desc_cache = kmem_cache_create("pte_list_desc",
- sizeof(struct pte_list_desc),
- 0, SLAB_ACCOUNT, NULL);
+ pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
if (!pte_list_desc_cache)
goto out;
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index c87da11f3a04..f6448284c18e 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -20,10 +20,23 @@
#include "mmu_internal.h"
#include "page_track.h"
+static bool kvm_external_write_tracking_enabled(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
+ /*
+ * Read external_write_tracking_enabled before related pointers. Pairs
+ * with the smp_store_release in kvm_page_track_write_tracking_enable().
+ */
+ return smp_load_acquire(&kvm->arch.external_write_tracking_enabled);
+#else
+ return false;
+#endif
+}
+
bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
{
- return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
- !tdp_enabled || kvm_shadow_root_allocated(kvm);
+ return kvm_external_write_tracking_enabled(kvm) ||
+ kvm_shadow_root_allocated(kvm) || !tdp_enabled;
}
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
@@ -153,6 +166,50 @@ int kvm_page_track_init(struct kvm *kvm)
return init_srcu_struct(&head->track_srcu);
}
+static int kvm_enable_external_write_tracking(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r = 0, i, bkt;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /*
+ * Check for *any* write tracking user (not just external users) under
+ * lock. This avoids unnecessary work, e.g. if KVM itself is using
+ * write tracking, or if two external users raced when registering.
+ */
+ if (kvm_page_track_write_tracking_enabled(kvm))
+ goto out_success;
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, bkt, slots) {
+ /*
+ * Intentionally do NOT free allocations on failure to
+ * avoid having to track which allocations were made
+ * now versus when the memslot was created. The
+ * metadata is guaranteed to be freed when the slot is
+ * freed, and will be kept/used if userspace retries
+ * the failed ioctl() instead of killing the VM.
+ */
+ r = kvm_page_track_write_tracking_alloc(slot);
+ if (r)
+ goto out_unlock;
+ }
+ }
+
+out_success:
+ /*
+ * Ensure that external_write_tracking_enabled becomes true strictly
+ * after all the related pointers are set.
+ */
+ smp_store_release(&kvm->arch.external_write_tracking_enabled, true);
+out_unlock:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+}
+
/*
* register the notifier so that event interception for the tracked guest
* pages can be received.
@@ -161,10 +218,17 @@ int kvm_page_track_register_notifier(struct kvm *kvm,
struct kvm_page_track_notifier_node *n)
{
struct kvm_page_track_notifier_head *head;
+ int r;
if (!kvm || kvm->mm != current->mm)
return -ESRCH;
+ if (!kvm_external_write_tracking_enabled(kvm)) {
+ r = kvm_enable_external_write_tracking(kvm);
+ if (r)
+ return r;
+ }
+
kvm_get_kvm(kvm);
head = &kvm->arch.track_notifier_head;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 6ae19b4ee5b1..d078157e62aa 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -149,11 +149,11 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* If shared is set, this function is operating under the MMU lock in read
* mode.
*/
-#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\
- for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
- ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
- _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
- if (kvm_mmu_page_as_id(_root) != _as_id) { \
+#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) \
+ for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid); \
+ ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \
+ _root = tdp_mmu_next_root(_kvm, _root, _only_valid)) \
+ if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \
} else
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \
@@ -171,12 +171,19 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
* Holding mmu_lock for write obviates the need for RCU protection as the list
* is guaranteed to be stable.
*/
-#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
- list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
- if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
- kvm_mmu_page_as_id(_root) != _as_id) { \
+#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) \
+ list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \
+ if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \
+ ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \
+ ((_only_valid) && (_root)->role.invalid))) { \
} else
+#define for_each_tdp_mmu_root(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
+
+#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) \
+ __for_each_tdp_mmu_root(_kvm, _root, _as_id, true)
+
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
@@ -216,22 +223,41 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
+int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
{
- union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ union kvm_mmu_page_role role = mmu->root_role;
+ int as_id = kvm_mmu_role_as_id(role);
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
- lockdep_assert_held_write(&kvm->mmu_lock);
+ /*
+ * Check for an existing root before acquiring the pages lock to avoid
+ * unnecessary serialization if multiple vCPUs are loading a new root.
+ * E.g. when bringing up secondary vCPUs, KVM will already have created
+ * a valid root on behalf of the primary vCPU.
+ */
+ read_lock(&kvm->mmu_lock);
+
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
+ if (root->role.word == role.word)
+ goto out_read_unlock;
+ }
+
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
/*
- * Check for an existing root before allocating a new one. Note, the
- * role check prevents consuming an invalid root.
+ * Recheck for an existing root after acquiring the pages lock, another
+ * vCPU may have raced ahead and created a new usable root. Manually
+ * walk the list of roots as the standard macros assume that the pages
+ * lock is *not* held. WARN if grabbing a reference to a usable root
+ * fails, as the last reference to a root can only be put *after* the
+ * root has been invalidated, which requires holding mmu_lock for write.
*/
- for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
+ list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
if (root->role.word == role.word &&
- kvm_tdp_mmu_get_root(root))
- goto out;
+ !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
+ goto out_spin_unlock;
}
root = tdp_mmu_alloc_sp(vcpu);
@@ -245,13 +271,20 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
* is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
*/
refcount_set(&root->tdp_mmu_root_count, 2);
-
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
-out:
- return __pa(root->spt);
+out_spin_unlock:
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+out_read_unlock:
+ read_unlock(&kvm->mmu_lock);
+ /*
+ * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
+ * and actually consuming the root if it's invalidated after dropping
+ * mmu_lock, and the root can't be freed as this vCPU holds a reference.
+ */
+ mmu->root.hpa = __pa(root->spt);
+ mmu->root.pgd = 0;
+ return 0;
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
@@ -734,15 +767,26 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
rcu_read_lock();
/*
- * To avoid RCU stalls due to recursively removing huge swaths of SPs,
- * split the zap into two passes. On the first pass, zap at the 1gb
- * level, and then zap top-level SPs on the second pass. "1gb" is not
- * arbitrary, as KVM must be able to zap a 1gb shadow page without
- * inducing a stall to allow in-place replacement with a 1gb hugepage.
+ * Zap roots in multiple passes of decreasing granularity, i.e. zap at
+ * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
+ * preempt models) or mmu_lock contention (full or real-time models).
+ * Zapping at finer granularity marginally increases the total time of
+ * the zap, but in most cases the zap itself isn't latency sensitive.
*
- * Because zapping a SP recurses on its children, stepping down to
- * PG_LEVEL_4K in the iterator itself is unnecessary.
+ * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
+ * in order to mimic the page fault path, which can replace a 1GiB page
+ * table with an equivalent 1GiB hugepage, i.e. can get saddled with
+ * zapping a 1GiB region that's fully populated with 4KiB SPTEs. This
+ * allows verifying that KVM can safely zap 1GiB regions, e.g. without
+ * inducing RCU stalls, without relying on a relatively rare event
+ * (zapping roots is orders of magnitude more common). Note, because
+ * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
+ * in the iterator itself is unnecessary.
*/
+ if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
+ __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
+ }
__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
@@ -800,7 +844,13 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
tdp_mmu_iter_set_spte(kvm, &iter, 0);
- flush = true;
+
+ /*
+ * Zappings SPTEs in invalid roots doesn't require a TLB flush,
+ * see kvm_tdp_mmu_zap_invalidated_roots() for details.
+ */
+ if (!root->role.invalid)
+ flush = true;
}
rcu_read_unlock();
@@ -813,16 +863,16 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
}
/*
- * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
- * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
- * more SPTEs were zapped since the MMU lock was last acquired.
+ * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID** roots.
+ * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
+ * one or more SPTEs were zapped since the MMU lock was last acquired.
*/
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
struct kvm_mmu_page *root;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root_yield_safe(kvm, root)
+ for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
return flush;
@@ -896,7 +946,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
* the VM is being destroyed).
*
* Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
- * See kvm_tdp_mmu_get_vcpu_root_hpa().
+ * See kvm_tdp_mmu_alloc_root().
*/
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
@@ -1622,7 +1672,7 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root(kvm, root, slot->as_id)
+ for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}
@@ -1740,7 +1790,7 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
bool spte_set = false;
lockdep_assert_held_write(&kvm->mmu_lock);
- for_each_tdp_mmu_root(kvm, root, slot->as_id)
+ for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
return spte_set;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 20d97aa46c49..6e1ea04ca885 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -10,7 +10,7 @@
void kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
+int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu);
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
{
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 87cc6c8809ad..c397b28e3d1b 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -29,6 +29,9 @@
struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);
+struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
+EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);
+
/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
@@ -67,7 +70,7 @@ static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
* all perf counters (both gp and fixed). The mapping relationship
* between pmc and perf counters is as the following:
* * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
- * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
+ * [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
* * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
* and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
*/
@@ -411,7 +414,7 @@ static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
int idx)
{
- int fixed_idx = idx - INTEL_PMC_IDX_FIXED;
+ int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;
if (filter->action == KVM_PMU_EVENT_DENY &&
test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
@@ -441,11 +444,10 @@ static bool check_pmu_event_filter(struct kvm_pmc *pmc)
static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
{
return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
- static_call(kvm_x86_pmu_hw_event_available)(pmc) &&
check_pmu_event_filter(pmc);
}
-static void reprogram_counter(struct kvm_pmc *pmc)
+static int reprogram_counter(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
u64 eventsel = pmc->eventsel;
@@ -456,7 +458,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
emulate_overflow = pmc_pause_counter(pmc);
if (!pmc_event_is_allowed(pmc))
- goto reprogram_complete;
+ return 0;
if (emulate_overflow)
__kvm_perf_overflow(pmc, false);
@@ -466,7 +468,7 @@ static void reprogram_counter(struct kvm_pmc *pmc)
if (pmc_is_fixed(pmc)) {
fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
- pmc->idx - INTEL_PMC_IDX_FIXED);
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX);
if (fixed_ctr_ctrl & 0x1)
eventsel |= ARCH_PERFMON_EVENTSEL_OS;
if (fixed_ctr_ctrl & 0x2)
@@ -477,43 +479,45 @@ static void reprogram_counter(struct kvm_pmc *pmc)
}
if (pmc->current_config == new_config && pmc_resume_counter(pmc))
- goto reprogram_complete;
+ return 0;
pmc_release_perf_event(pmc);
pmc->current_config = new_config;
- /*
- * If reprogramming fails, e.g. due to contention, leave the counter's
- * regprogram bit set, i.e. opportunistically try again on the next PMU
- * refresh. Don't make a new request as doing so can stall the guest
- * if reprogramming repeatedly fails.
- */
- if (pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
- (eventsel & pmu->raw_event_mask),
- !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
- !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
- eventsel & ARCH_PERFMON_EVENTSEL_INT))
- return;
-
-reprogram_complete:
- clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
+ return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
+ (eventsel & pmu->raw_event_mask),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT);
}
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
+ DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
int bit;
- for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
- struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
+ bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);
- if (unlikely(!pmc)) {
- clear_bit(bit, pmu->reprogram_pmi);
- continue;
- }
+ /*
+ * The reprogramming bitmap can be written asynchronously by something
+ * other than the task that holds vcpu->mutex, take care to clear only
+ * the bits that will actually processed.
+ */
+ BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
+ atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);
- reprogram_counter(pmc);
+ kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
+ /*
+ * If reprogramming fails, e.g. due to contention, re-set the
+ * regprogram bit set, i.e. opportunistically try again on the
+ * next PMU refresh. Don't make a new request as doing so can
+ * stall the guest if reprogramming repeatedly fails.
+ */
+ if (reprogram_counter(pmc))
+ set_bit(pmc->idx, pmu->reprogram_pmi);
}
/*
@@ -525,10 +529,20 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
kvm_pmu_cleanup(vcpu);
}
-/* check if idx is a valid index to access PMU */
-bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
- return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
+ /*
+ * On Intel, VMX interception has priority over RDPMC exceptions that
+ * aren't already handled by the emulator, i.e. there are no additional
+ * check needed for Intel PMUs.
+ *
+ * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
+ * i.e. an invalid PMC results in a #GP, not #VMEXIT.
+ */
+ if (!kvm_pmu_ops.check_rdpmc_early)
+ return 0;
+
+ return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
}
bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -567,10 +581,9 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
- bool fast_mode = idx & (1u << 31);
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
- u64 mask = fast_mode ? ~0u : ~0ull;
+ u64 mask = ~0ull;
if (!pmu->version)
return 1;
@@ -716,11 +729,7 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);
- for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
- pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
- if (!pmc)
- continue;
-
+ kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
pmc_stop_counter(pmc);
pmc->counter = 0;
pmc->emulated_counter = 0;
@@ -741,6 +750,8 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
*/
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
return;
@@ -750,8 +761,22 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
*/
kvm_pmu_reset(vcpu);
- bitmap_zero(vcpu_to_pmu(vcpu)->all_valid_pmc_idx, X86_PMC_IDX_MAX);
- static_call(kvm_x86_pmu_refresh)(vcpu);
+ pmu->version = 0;
+ pmu->nr_arch_gp_counters = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->counter_bitmask[KVM_PMC_GP] = 0;
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+ pmu->global_ctrl_mask = ~0ull;
+ pmu->global_status_mask = ~0ull;
+ pmu->fixed_ctr_ctrl_mask = ~0ull;
+ pmu->pebs_enable_mask = ~0ull;
+ pmu->pebs_data_cfg_mask = ~0ull;
+ bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+
+ if (vcpu->kvm->arch.enable_pmu)
+ static_call(kvm_x86_pmu_refresh)(vcpu);
}
void kvm_pmu_init(struct kvm_vcpu *vcpu)
@@ -776,10 +801,8 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
pmu->pmc_in_use, X86_PMC_IDX_MAX);
- for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
- pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
-
- if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
+ kvm_for_each_pmc(pmu, pmc, i, bitmask) {
+ if (pmc->perf_event && !pmc_speculative_in_use(pmc))
pmc_stop_counter(pmc);
}
@@ -799,13 +822,6 @@ static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
kvm_pmu_request_counter_reprogram(pmc);
}
-static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
- unsigned int perf_hw_id)
-{
- return !((pmc->eventsel ^ perf_get_hw_event_config(perf_hw_id)) &
- AMD64_RAW_EVENT_MASK_NB);
-}
-
static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
bool select_os, select_user;
@@ -817,29 +833,56 @@ static inline bool cpl_is_matched(struct kvm_pmc *pmc)
select_user = config & ARCH_PERFMON_EVENTSEL_USR;
} else {
config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
- pmc->idx - INTEL_PMC_IDX_FIXED);
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX);
select_os = config & 0x1;
select_user = config & 0x2;
}
+ /*
+ * Skip the CPL lookup, which isn't free on Intel, if the result will
+ * be the same regardless of the CPL.
+ */
+ if (select_os == select_user)
+ return select_os;
+
return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}
-void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
{
+ DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
int i;
- for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
- pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
+ BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);
- if (!pmc || !pmc_event_is_allowed(pmc))
+ if (!kvm_pmu_has_perf_global_ctrl(pmu))
+ bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
+ else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
+ (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
+ return;
+
+ kvm_for_each_pmc(pmu, pmc, i, bitmap) {
+ /*
+ * Ignore checks for edge detect (all events currently emulated
+ * but KVM are always rising edges), pin control (unsupported
+ * by modern CPUs), and counter mask and its invert flag (KVM
+ * doesn't emulate multiple events in a single clock cycle).
+ *
+ * Note, the uppermost nibble of AMD's mask overlaps Intel's
+ * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
+ * bits (bits 35:34). Checking the "in HLE/RTM transaction"
+ * flags is correct as the vCPU can't be in a transaction if
+ * KVM is emulating an instruction. Checking the reserved bits
+ * might be wrong if they are defined in the future, but so
+ * could ignoring them, so do the simple thing for now.
+ */
+ if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
+ !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
continue;
- /* Ignore checks for edge detect, pin control, invert and CMASK bits */
- if (eventsel_match_perf_hw_id(pmc, perf_hw_id) && cpl_is_matched(pmc))
- kvm_pmu_incr_counter(pmc);
+ kvm_pmu_incr_counter(pmc);
}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 7caeb3d8d4fd..4d52b0b539ba 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -4,6 +4,8 @@
#include <linux/nospec.h>
+#include <asm/kvm_host.h>
+
#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
@@ -18,13 +20,18 @@
#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
+#define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED
+
+struct kvm_pmu_emulated_event_selectors {
+ u64 INSTRUCTIONS_RETIRED;
+ u64 BRANCH_INSTRUCTIONS_RETIRED;
+};
+
struct kvm_pmu_ops {
- bool (*hw_event_available)(struct kvm_pmc *pmc);
- struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
unsigned int idx, u64 *mask);
struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
- bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx);
+ int (*check_rdpmc_early)(struct kvm_vcpu *vcpu, unsigned int idx);
bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -55,6 +62,38 @@ static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu)
return pmu->version > 1;
}
+/*
+ * KVM tracks all counters in 64-bit bitmaps, with general purpose counters
+ * mapped to bits 31:0 and fixed counters mapped to 63:32, e.g. fixed counter 0
+ * is tracked internally via index 32. On Intel, (AMD doesn't support fixed
+ * counters), this mirrors how fixed counters are mapped to PERF_GLOBAL_CTRL
+ * and similar MSRs, i.e. tracking fixed counters at base index 32 reduces the
+ * amounter of boilerplate needed to iterate over PMCs *and* simplifies common
+ * enabling/disable/reset operations.
+ *
+ * WARNING! This helper is only for lookups that are initiated by KVM, it is
+ * NOT safe for guest lookups, e.g. will do the wrong thing if passed a raw
+ * ECX value from RDPMC (fixed counters are accessed by setting bit 30 in ECX
+ * for RDPMC, not by adding 32 to the fixed counter index).
+ */
+static inline struct kvm_pmc *kvm_pmc_idx_to_pmc(struct kvm_pmu *pmu, int idx)
+{
+ if (idx < pmu->nr_arch_gp_counters)
+ return &pmu->gp_counters[idx];
+
+ idx -= KVM_FIXED_PMC_BASE_IDX;
+ if (idx >= 0 && idx < pmu->nr_arch_fixed_counters)
+ return &pmu->fixed_counters[idx];
+
+ return NULL;
+}
+
+#define kvm_for_each_pmc(pmu, pmc, i, bitmap) \
+ for_each_set_bit(i, bitmap, X86_PMC_IDX_MAX) \
+ if (!(pmc = kvm_pmc_idx_to_pmc(pmu, i))) \
+ continue; \
+ else \
+
static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -131,12 +170,13 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
if (pmc_is_fixed(pmc))
return fixed_ctrl_field(pmu->fixed_ctr_ctrl,
- pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3;
+ pmc->idx - KVM_FIXED_PMC_BASE_IDX) & 0x3;
return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE;
}
extern struct x86_pmu_capability kvm_pmu_cap;
+extern struct kvm_pmu_emulated_event_selectors kvm_pmu_eventsel;
static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
{
@@ -178,6 +218,11 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
pmu_ops->MAX_NR_GP_COUNTERS);
kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
KVM_PMC_MAX_FIXED);
+
+ kvm_pmu_eventsel.INSTRUCTIONS_RETIRED =
+ perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS);
+ kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED =
+ perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
}
static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc)
@@ -216,7 +261,7 @@ static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
-bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx);
+int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx);
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -225,7 +270,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu);
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu);
void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp);
-void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id);
+void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel);
bool is_vmware_backdoor_pmc(u32 pmc_idx);
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index dc3d95fdca7d..d06d43d8d2aa 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -184,7 +184,6 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
struct kvm_smram_state_32 *smram)
{
struct desc_ptr dt;
- unsigned long val;
int i;
smram->cr0 = kvm_read_cr0(vcpu);
@@ -195,10 +194,8 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu,
for (i = 0; i < 8; i++)
smram->gprs[i] = kvm_register_read_raw(vcpu, i);
- kvm_get_dr(vcpu, 6, &val);
- smram->dr6 = (u32)val;
- kvm_get_dr(vcpu, 7, &val);
- smram->dr7 = (u32)val;
+ smram->dr6 = (u32)vcpu->arch.dr6;
+ smram->dr7 = (u32)vcpu->arch.dr7;
enter_smm_save_seg_32(vcpu, &smram->tr, &smram->tr_sel, VCPU_SREG_TR);
enter_smm_save_seg_32(vcpu, &smram->ldtr, &smram->ldtr_sel, VCPU_SREG_LDTR);
@@ -231,7 +228,6 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
struct kvm_smram_state_64 *smram)
{
struct desc_ptr dt;
- unsigned long val;
int i;
for (i = 0; i < 16; i++)
@@ -240,11 +236,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu,
smram->rip = kvm_rip_read(vcpu);
smram->rflags = kvm_get_rflags(vcpu);
-
- kvm_get_dr(vcpu, 6, &val);
- smram->dr6 = val;
- kvm_get_dr(vcpu, 7, &val);
- smram->dr7 = val;
+ smram->dr6 = vcpu->arch.dr6;
+ smram->dr7 = vcpu->arch.dr7;
smram->cr0 = kvm_read_cr0(vcpu);
smram->cr3 = kvm_read_cr3(vcpu);
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index b6a7ad4d6914..dfcc38bd97d3 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -25,7 +25,7 @@ enum pmu_type {
PMU_TYPE_EVNTSEL,
};
-static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+static struct kvm_pmc *amd_pmu_get_pmc(struct kvm_pmu *pmu, int pmc_idx)
{
unsigned int num_counters = pmu->nr_arch_gp_counters;
@@ -70,28 +70,24 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
return NULL;
}
- return amd_pmc_idx_to_pmc(pmu, idx);
+ return amd_pmu_get_pmc(pmu, idx);
}
-static bool amd_hw_event_available(struct kvm_pmc *pmc)
-{
- return true;
-}
-
-static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
+static int amd_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- idx &= ~(3u << 30);
+ if (idx >= pmu->nr_arch_gp_counters)
+ return -EINVAL;
- return idx < pmu->nr_arch_gp_counters;
+ return 0;
}
/* idx is the ECX register of RDPMC instruction */
static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
unsigned int idx, u64 *mask)
{
- return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30));
+ return amd_pmu_get_pmc(vcpu_to_pmu(vcpu), idx);
}
static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
@@ -233,11 +229,9 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu)
}
struct kvm_pmu_ops amd_pmu_ops __initdata = {
- .hw_event_available = amd_hw_event_available,
- .pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = amd_msr_idx_to_pmc,
- .is_valid_rdpmc_ecx = amd_is_valid_rdpmc_ecx,
+ .check_rdpmc_early = amd_check_rdpmc_early,
.is_valid_msr = amd_is_valid_msr,
.get_msr = amd_pmu_get_msr,
.set_msr = amd_pmu_set_msr,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 272d5ed37ce7..d1a9f9951635 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2735,7 +2735,6 @@ static int dr_interception(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
int reg, dr;
- unsigned long val;
int err = 0;
/*
@@ -2763,11 +2762,9 @@ static int dr_interception(struct kvm_vcpu *vcpu)
dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
if (dr >= 16) { /* mov to DRn */
dr -= 16;
- val = kvm_register_read(vcpu, reg);
- err = kvm_set_dr(vcpu, dr, val);
+ err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
} else {
- kvm_get_dr(vcpu, dr, &val);
- kvm_register_write(vcpu, reg, val);
+ kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
}
return kvm_complete_insn_gp(vcpu, err);
@@ -4092,6 +4089,9 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
to_svm(vcpu)->vmcb->control.exit_info_1)
return handle_fastpath_set_msr_irqoff(vcpu);
@@ -4115,12 +4115,13 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
guest_state_exit_irqoff();
}
-static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
+static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
struct vcpu_svm *svm = to_svm(vcpu);
bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL);
- trace_kvm_entry(vcpu);
+ trace_kvm_entry(vcpu, force_immediate_exit);
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -4139,9 +4140,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
* is enough to force an immediate vmexit.
*/
disable_nmi_singlestep(svm);
- smp_send_reschedule(vcpu->cpu);
+ force_immediate_exit = true;
}
+ if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
+
pre_svm_run(vcpu);
sync_lapic_to_cr8(vcpu);
@@ -4237,9 +4241,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
svm_complete_interrupts(vcpu);
- if (is_guest_mode(vcpu))
- return EXIT_FASTPATH_NONE;
-
return svm_exit_handlers_fastpath(vcpu);
}
@@ -5007,8 +5008,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_intercept = svm_check_intercept,
.handle_exit_irqoff = svm_handle_exit_irqoff,
- .request_immediate_exit = __kvm_request_immediate_exit,
-
.sched_in = svm_sched_in,
.nested_ops = &svm_nested_ops,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 83843379813e..88659de4d2a7 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -15,20 +15,23 @@
* Tracepoint for guest mode entry.
*/
TRACE_EVENT(kvm_entry,
- TP_PROTO(struct kvm_vcpu *vcpu),
- TP_ARGS(vcpu),
+ TP_PROTO(struct kvm_vcpu *vcpu, bool force_immediate_exit),
+ TP_ARGS(vcpu, force_immediate_exit),
TP_STRUCT__entry(
__field( unsigned int, vcpu_id )
__field( unsigned long, rip )
+ __field( bool, immediate_exit )
),
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
__entry->rip = kvm_rip_read(vcpu);
+ __entry->immediate_exit = force_immediate_exit;
),
- TP_printk("vcpu %u, rip 0x%lx", __entry->vcpu_id, __entry->rip)
+ TP_printk("vcpu %u, rip 0x%lx%s", __entry->vcpu_id, __entry->rip,
+ __entry->immediate_exit ? "[immediate exit]" : "")
);
/*
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6329a306856b..d05ddf751491 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3606,7 +3606,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return 1;
}
- kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+ kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
return nested_vmx_failInvalid(vcpu);
@@ -4433,7 +4433,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
- kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+ vmcs12->guest_dr7 = vcpu->arch.dr7;
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
vmcs12->guest_ia32_efer = vcpu->arch.efer;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 315c7c2ba89b..12ade343a17e 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -20,53 +20,19 @@
#include "nested.h"
#include "pmu.h"
-#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
-
-enum intel_pmu_architectural_events {
- /*
- * The order of the architectural events matters as support for each
- * event is enumerated via CPUID using the index of the event.
- */
- INTEL_ARCH_CPU_CYCLES,
- INTEL_ARCH_INSTRUCTIONS_RETIRED,
- INTEL_ARCH_REFERENCE_CYCLES,
- INTEL_ARCH_LLC_REFERENCES,
- INTEL_ARCH_LLC_MISSES,
- INTEL_ARCH_BRANCHES_RETIRED,
- INTEL_ARCH_BRANCHES_MISPREDICTED,
-
- NR_REAL_INTEL_ARCH_EVENTS,
-
- /*
- * Pseudo-architectural event used to implement IA32_FIXED_CTR2, a.k.a.
- * TSC reference cycles. The architectural reference cycles event may
- * or may not actually use the TSC as the reference, e.g. might use the
- * core crystal clock or the bus clock (yeah, "architectural").
- */
- PSEUDO_ARCH_REFERENCE_CYCLES = NR_REAL_INTEL_ARCH_EVENTS,
- NR_INTEL_ARCH_EVENTS,
-};
+/*
+ * Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX
+ * to encode the "type" of counter to read, i.e. this is not a "base". And to
+ * further confuse things, non-architectural PMUs use bit 31 as a flag for
+ * "fast" reads, whereas the "type" is an explicit value.
+ */
+#define INTEL_RDPMC_GP 0
+#define INTEL_RDPMC_FIXED INTEL_PMC_FIXED_RDPMC_BASE
-static struct {
- u8 eventsel;
- u8 unit_mask;
-} const intel_arch_events[] = {
- [INTEL_ARCH_CPU_CYCLES] = { 0x3c, 0x00 },
- [INTEL_ARCH_INSTRUCTIONS_RETIRED] = { 0xc0, 0x00 },
- [INTEL_ARCH_REFERENCE_CYCLES] = { 0x3c, 0x01 },
- [INTEL_ARCH_LLC_REFERENCES] = { 0x2e, 0x4f },
- [INTEL_ARCH_LLC_MISSES] = { 0x2e, 0x41 },
- [INTEL_ARCH_BRANCHES_RETIRED] = { 0xc4, 0x00 },
- [INTEL_ARCH_BRANCHES_MISPREDICTED] = { 0xc5, 0x00 },
- [PSEUDO_ARCH_REFERENCE_CYCLES] = { 0x00, 0x03 },
-};
+#define INTEL_RDPMC_TYPE_MASK GENMASK(31, 16)
+#define INTEL_RDPMC_INDEX_MASK GENMASK(15, 0)
-/* mapping between fixed pmc index and intel_arch_events array */
-static int fixed_pmc_events[] = {
- [0] = INTEL_ARCH_INSTRUCTIONS_RETIRED,
- [1] = INTEL_ARCH_CPU_CYCLES,
- [2] = PSEUDO_ARCH_REFERENCE_CYCLES,
-};
+#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
@@ -84,77 +50,61 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
- __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
+ __set_bit(KVM_FIXED_PMC_BASE_IDX + i, pmu->pmc_in_use);
kvm_pmu_request_counter_reprogram(pmc);
}
}
-static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
-{
- if (pmc_idx < INTEL_PMC_IDX_FIXED) {
- return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
- MSR_P6_EVNTSEL0);
- } else {
- u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
-
- return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
- }
-}
-
-static bool intel_hw_event_available(struct kvm_pmc *pmc)
-{
- struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
- u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
- int i;
-
- BUILD_BUG_ON(ARRAY_SIZE(intel_arch_events) != NR_INTEL_ARCH_EVENTS);
-
- /*
- * Disallow events reported as unavailable in guest CPUID. Note, this
- * doesn't apply to pseudo-architectural events.
- */
- for (i = 0; i < NR_REAL_INTEL_ARCH_EVENTS; i++) {
- if (intel_arch_events[i].eventsel != event_select ||
- intel_arch_events[i].unit_mask != unit_mask)
- continue;
-
- return pmu->available_event_types & BIT(i);
- }
-
- return true;
-}
-
-static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
-{
- struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- bool fixed = idx & (1u << 30);
-
- idx &= ~(3u << 30);
-
- return fixed ? idx < pmu->nr_arch_fixed_counters
- : idx < pmu->nr_arch_gp_counters;
-}
-
static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
unsigned int idx, u64 *mask)
{
+ unsigned int type = idx & INTEL_RDPMC_TYPE_MASK;
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- bool fixed = idx & (1u << 30);
struct kvm_pmc *counters;
unsigned int num_counters;
+ u64 bitmask;
- idx &= ~(3u << 30);
- if (fixed) {
+ /*
+ * The encoding of ECX for RDPMC is different for architectural versus
+ * non-architecturals PMUs (PMUs with version '0'). For architectural
+ * PMUs, bits 31:16 specify the PMC type and bits 15:0 specify the PMC
+ * index. For non-architectural PMUs, bit 31 is a "fast" flag, and
+ * bits 30:0 specify the PMC index.
+ *
+ * Yell and reject attempts to read PMCs for a non-architectural PMU,
+ * as KVM doesn't support such PMUs.
+ */
+ if (WARN_ON_ONCE(!pmu->version))
+ return NULL;
+
+ /*
+ * General Purpose (GP) PMCs are supported on all PMUs, and fixed PMCs
+ * are supported on all architectural PMUs, i.e. on all virtual PMUs
+ * supported by KVM. Note, KVM only emulates fixed PMCs for PMU v2+,
+ * but the type itself is still valid, i.e. let RDPMC fail due to
+ * accessing a non-existent counter. Reject attempts to read all other
+ * types, which are unknown/unsupported.
+ */
+ switch (type) {
+ case INTEL_RDPMC_FIXED:
counters = pmu->fixed_counters;
num_counters = pmu->nr_arch_fixed_counters;
- } else {
+ bitmask = pmu->counter_bitmask[KVM_PMC_FIXED];
+ break;
+ case INTEL_RDPMC_GP:
counters = pmu->gp_counters;
num_counters = pmu->nr_arch_gp_counters;
+ bitmask = pmu->counter_bitmask[KVM_PMC_GP];
+ break;
+ default:
+ return NULL;
}
+
+ idx &= INTEL_RDPMC_INDEX_MASK;
if (idx >= num_counters)
return NULL;
- *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
+
+ *mask &= bitmask;
return &counters[array_index_nospec(idx, num_counters)];
}
@@ -464,20 +414,38 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
-static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
+/*
+ * Map fixed counter events to architectural general purpose event encodings.
+ * Perf doesn't provide APIs to allow KVM to directly program a fixed counter,
+ * and so KVM instead programs the architectural event to effectively request
+ * the fixed counter. Perf isn't guaranteed to use a fixed counter and may
+ * instead program the encoding into a general purpose counter, e.g. if a
+ * different perf_event is already utilizing the requested counter, but the end
+ * result is the same (ignoring the fact that using a general purpose counter
+ * will likely exacerbate counter contention).
+ *
+ * Forcibly inlined to allow asserting on @index at build time, and there should
+ * never be more than one user.
+ */
+static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index)
{
- int i;
-
- BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_events) != KVM_PMC_MAX_FIXED);
+ const enum perf_hw_id fixed_pmc_perf_ids[] = {
+ [0] = PERF_COUNT_HW_INSTRUCTIONS,
+ [1] = PERF_COUNT_HW_CPU_CYCLES,
+ [2] = PERF_COUNT_HW_REF_CPU_CYCLES,
+ };
+ u64 eventsel;
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
- int index = array_index_nospec(i, KVM_PMC_MAX_FIXED);
- struct kvm_pmc *pmc = &pmu->fixed_counters[index];
- u32 event = fixed_pmc_events[index];
+ BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_PMC_MAX_FIXED);
+ BUILD_BUG_ON(index >= KVM_PMC_MAX_FIXED);
- pmc->eventsel = (intel_arch_events[event].unit_mask << 8) |
- intel_arch_events[event].eventsel;
- }
+ /*
+ * Yell if perf reports support for a fixed counter but perf doesn't
+ * have a known encoding for the associated general purpose event.
+ */
+ eventsel = perf_get_hw_event_config(fixed_pmc_perf_ids[index]);
+ WARN_ON_ONCE(!eventsel && index < kvm_pmu_cap.num_counters_fixed);
+ return eventsel;
}
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
@@ -491,19 +459,6 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
u64 counter_mask;
int i;
- pmu->nr_arch_gp_counters = 0;
- pmu->nr_arch_fixed_counters = 0;
- pmu->counter_bitmask[KVM_PMC_GP] = 0;
- pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
- pmu->version = 0;
- pmu->reserved_bits = 0xffffffff00200000ull;
- pmu->raw_event_mask = X86_RAW_EVENT_MASK;
- pmu->global_ctrl_mask = ~0ull;
- pmu->global_status_mask = ~0ull;
- pmu->fixed_ctr_ctrl_mask = ~0ull;
- pmu->pebs_enable_mask = ~0ull;
- pmu->pebs_data_cfg_mask = ~0ull;
-
memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
/*
@@ -515,8 +470,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
return;
entry = kvm_find_cpuid_entry(vcpu, 0xa);
- if (!entry || !vcpu->kvm->arch.enable_pmu)
+ if (!entry)
return;
+
eax.full = entry->eax;
edx.full = entry->edx;
@@ -543,13 +499,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
kvm_pmu_cap.bit_width_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
- setup_fixed_pmc_eventsel(pmu);
}
for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
- (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
+ (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX));
pmu->global_ctrl_mask = counter_mask;
/*
@@ -593,7 +548,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
pmu->fixed_ctr_ctrl_mask &=
- ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
+ ~(1ULL << (KVM_FIXED_PMC_BASE_IDX + i * 4));
}
pmu->pebs_data_cfg_mask = ~0xff00000full;
} else {
@@ -619,8 +574,9 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
- pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
+ pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX;
pmu->fixed_counters[i].current_config = 0;
+ pmu->fixed_counters[i].eventsel = intel_get_fixed_pmc_eventsel(i);
}
lbr_desc->records.nr = 0;
@@ -748,11 +704,8 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
struct kvm_pmc *pmc = NULL;
int bit, hw_idx;
- for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl,
- X86_PMC_IDX_MAX) {
- pmc = intel_pmc_idx_to_pmc(pmu, bit);
-
- if (!pmc || !pmc_speculative_in_use(pmc) ||
+ kvm_for_each_pmc(pmu, pmc, bit, (unsigned long *)&pmu->global_ctrl) {
+ if (!pmc_speculative_in_use(pmc) ||
!pmc_is_globally_enabled(pmc) || !pmc->perf_event)
continue;
@@ -767,11 +720,8 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
}
struct kvm_pmu_ops intel_pmu_ops __initdata = {
- .hw_event_available = intel_hw_event_available,
- .pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = intel_msr_idx_to_pmc,
- .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx,
.is_valid_msr = intel_is_valid_msr,
.get_msr = intel_pmu_get_msr,
.set_msr = intel_pmu_set_msr,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 305237dcba88..c37a89eda90f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -50,6 +50,8 @@
#include <asm/spec-ctrl.h>
#include <asm/vmx.h>
+#include <trace/events/ipi.h>
+
#include "capabilities.h"
#include "cpuid.h"
#include "hyperv.h"
@@ -160,7 +162,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
/*
* List of MSRs that can be directly passed to the guest.
- * In addition to these x2apic and PT MSRs are handled specially.
+ * In addition to these x2apic, PT and LBR MSRs are handled specially.
*/
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
MSR_IA32_SPEC_CTRL,
@@ -668,25 +670,14 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static int possible_passthrough_msr_slot(u32 msr)
-{
- u32 i;
-
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
- if (vmx_possible_passthrough_msrs[i] == msr)
- return i;
-
- return -ENOENT;
-}
-
-static bool is_valid_passthrough_msr(u32 msr)
+static int vmx_get_passthrough_msr_slot(u32 msr)
{
- bool r;
+ int i;
switch (msr) {
case 0x800 ... 0x8ff:
/* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
- return true;
+ return -ENOENT;
case MSR_IA32_RTIT_STATUS:
case MSR_IA32_RTIT_OUTPUT_BASE:
case MSR_IA32_RTIT_OUTPUT_MASK:
@@ -701,14 +692,16 @@ static bool is_valid_passthrough_msr(u32 msr)
case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
/* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
- return true;
+ return -ENOENT;
}
- r = possible_passthrough_msr_slot(msr) != -ENOENT;
-
- WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
+ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
+ if (vmx_possible_passthrough_msrs[i] == msr)
+ return i;
+ }
- return r;
+ WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
+ return -ENOENT;
}
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
@@ -1291,8 +1284,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
u16 fs_sel, gs_sel;
int i;
- vmx->req_immediate_exit = false;
-
/*
* Note that guest MSRs to be saved/restored can also be changed
* when guest state is loaded. This happens when guest transitions
@@ -3964,6 +3955,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ int idx;
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -3973,16 +3965,13 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
/*
* Mark the desired intercept state in shadow bitmap, this is needed
* for resync when the MSR filters change.
- */
- if (is_valid_passthrough_msr(msr)) {
- int idx = possible_passthrough_msr_slot(msr);
-
- if (idx != -ENOENT) {
- if (type & MSR_TYPE_R)
- clear_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- clear_bit(idx, vmx->shadow_msr_intercept.write);
- }
+ */
+ idx = vmx_get_passthrough_msr_slot(msr);
+ if (idx >= 0) {
+ if (type & MSR_TYPE_R)
+ clear_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ clear_bit(idx, vmx->shadow_msr_intercept.write);
}
if ((type & MSR_TYPE_R) &&
@@ -4008,6 +3997,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ int idx;
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -4017,16 +4007,13 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
/*
* Mark the desired intercept state in shadow bitmap, this is needed
* for resync when the MSR filter changes.
- */
- if (is_valid_passthrough_msr(msr)) {
- int idx = possible_passthrough_msr_slot(msr);
-
- if (idx != -ENOENT) {
- if (type & MSR_TYPE_R)
- set_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- set_bit(idx, vmx->shadow_msr_intercept.write);
- }
+ */
+ idx = vmx_get_passthrough_msr_slot(msr);
+ if (idx >= 0) {
+ if (type & MSR_TYPE_R)
+ set_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ set_bit(idx, vmx->shadow_msr_intercept.write);
}
if (type & MSR_TYPE_R)
@@ -4137,6 +4124,9 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 i;
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
/*
* Redo intercept permissions for MSRs that KVM is passing through to
* the guest. Disabling interception will check the new MSR filter and
@@ -5576,10 +5566,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
if (exit_qualification & TYPE_MOV_FROM_DR) {
- unsigned long val;
-
- kvm_get_dr(vcpu, dr, &val);
- kvm_register_write(vcpu, reg, val);
+ kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
err = 0;
} else {
err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
@@ -6001,22 +5988,46 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
-static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!vmx->req_immediate_exit &&
- !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
- kvm_lapic_expired_hv_timer(vcpu);
+ /*
+ * In the *extremely* unlikely scenario that this is a spurious VM-Exit
+ * due to the timer expiring while it was "soft" disabled, just eat the
+ * exit and re-enter the guest.
+ */
+ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
return EXIT_FASTPATH_REENTER_GUEST;
- }
- return EXIT_FASTPATH_NONE;
+ /*
+ * If the timer expired because KVM used it to force an immediate exit,
+ * then mission accomplished.
+ */
+ if (force_immediate_exit)
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ /*
+ * If L2 is active, go down the slow path as emulating the guest timer
+ * expiration likely requires synthesizing a nested VM-Exit.
+ */
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
}
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
- handle_fastpath_preemption_timer(vcpu);
+ /*
+ * This non-fastpath handler is reached if and only if the preemption
+ * timer was being used to emulate a guest timer while L2 is active.
+ * All other scenarios are supposed to be handled in the fastpath.
+ */
+ WARN_ON_ONCE(!is_guest_mode(vcpu));
+ kvm_lapic_expired_hv_timer(vcpu);
return 1;
}
@@ -6519,7 +6530,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
vcpu->run->internal.data[0] = vectoring_info;
vcpu->run->internal.data[1] = exit_reason.full;
- vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
+ vcpu->run->internal.data[2] = vmx_get_exit_qual(vcpu);
if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
vcpu->run->internal.data[ndata++] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
@@ -7158,13 +7169,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host, false);
}
-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl;
u32 delta_tsc;
- if (vmx->req_immediate_exit) {
+ if (force_immediate_exit) {
vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
vmx->loaded_vmcs->hv_timer_soft_disabled = false;
} else if (vmx->hv_deadline_tsc != -1) {
@@ -7217,13 +7228,22 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
barrier_nospec();
}
-static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
+ /*
+ * If L2 is active, some VMX preemption timer exits can be handled in
+ * the fastpath even, all other exits must use the slow path.
+ */
+ if (is_guest_mode(vcpu) &&
+ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+ return EXIT_FASTPATH_NONE;
+
switch (to_vmx(vcpu)->exit_reason.basic) {
case EXIT_REASON_MSR_WRITE:
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
- return handle_fastpath_preemption_timer(vcpu);
+ return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
default:
return EXIT_FASTPATH_NONE;
}
@@ -7286,7 +7306,7 @@ out:
guest_state_exit_irqoff();
}
-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7313,7 +7333,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
- trace_kvm_entry(vcpu);
+ trace_kvm_entry(vcpu, force_immediate_exit);
if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
@@ -7372,7 +7392,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_passthrough_lbr_msrs(vcpu);
if (enable_preemption_timer)
- vmx_update_hv_timer(vcpu);
+ vmx_update_hv_timer(vcpu, force_immediate_exit);
+ else if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
kvm_wait_lapic_expire(vcpu);
@@ -7436,10 +7458,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
- if (is_guest_mode(vcpu))
- return EXIT_FASTPATH_NONE;
-
- return vmx_exit_handlers_fastpath(vcpu);
+ return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
@@ -7919,11 +7938,6 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}
-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- to_vmx(vcpu)->req_immediate_exit = true;
-}
-
static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info)
{
@@ -8376,8 +8390,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.check_intercept = vmx_check_intercept,
.handle_exit_irqoff = vmx_handle_exit_irqoff,
- .request_immediate_exit = vmx_request_immediate_exit,
-
.sched_in = vmx_sched_in,
.cpu_dirty_log_size = PML_ENTITY_NUM,
@@ -8637,7 +8649,6 @@ static __init int hardware_setup(void)
if (!enable_preemption_timer) {
vmx_x86_ops.set_hv_timer = NULL;
vmx_x86_ops.cancel_hv_timer = NULL;
- vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
kvm_caps.supported_mce_cap |= MCG_LMCE_P;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index e3b0985bb74a..65786dbe7d60 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -332,8 +332,6 @@ struct vcpu_vmx {
unsigned int ple_window;
bool ple_window_dirty;
- bool req_immediate_exit;
-
/* Support for PML */
#define PML_ENTITY_NUM 512
struct page *pml_pg;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ffe580169c93..47d9f03b7778 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1399,22 +1399,19 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
}
EXPORT_SYMBOL_GPL(kvm_set_dr);
-void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
{
size_t size = ARRAY_SIZE(vcpu->arch.db);
switch (dr) {
case 0 ... 3:
- *val = vcpu->arch.db[array_index_nospec(dr, size)];
- break;
+ return vcpu->arch.db[array_index_nospec(dr, size)];
case 4:
case 6:
- *val = vcpu->arch.dr6;
- break;
+ return vcpu->arch.dr6;
case 5:
default: /* 7 */
- *val = vcpu->arch.dr7;
- break;
+ return vcpu->arch.dr7;
}
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
@@ -2860,7 +2857,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
return v * clock->mult;
}
-static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
+/*
+ * As with get_kvmclock_base_ns(), this counts from boot time, at the
+ * frequency of CLOCK_MONOTONIC_RAW (hence adding gtos->offs_boot).
+ */
+static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
@@ -2879,6 +2880,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
return mode;
}
+/*
+ * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
+ * no boot time offset.
+ */
+static int do_monotonic(s64 *t, u64 *tsc_timestamp)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ ns = gtod->clock.base_cycles;
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
+ ns >>= gtod->clock.shift;
+ ns += ktime_to_ns(gtod->clock.offset);
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ *t = ns;
+
+ return mode;
+}
+
static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
@@ -2900,18 +2924,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
return mode;
}
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
+ * reports the TSC value from which it do so. Returns true if host is
+ * using TSC based clocksource.
+ */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
{
/* checked again under seqlock below */
if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
- return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
- tsc_timestamp));
+ return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
+ tsc_timestamp));
}
-/* returns true if host is using TSC based clocksource */
+/*
+ * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ */
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
+{
+ /* checked again under seqlock below */
+ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
+ return false;
+
+ return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
+ tsc_timestamp));
+}
+
+/*
+ * Calculates CLOCK_REALTIME and reports the TSC value from which it did
+ * so. Returns true if host is using TSC based clocksource.
+ *
+ * DO NOT USE this for anything related to migration. You want CLOCK_TAI
+ * for that.
+ */
static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
u64 *tsc_timestamp)
{
@@ -3158,7 +3206,7 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
guest_hv_clock->version = ++vcpu->hv_clock.version;
- mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+ kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
@@ -4680,7 +4728,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
KVM_XEN_HVM_CONFIG_SHARED_INFO |
KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
- KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
+ KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE |
+ KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
@@ -5064,8 +5113,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
int idx;
if (vcpu->preempted) {
- if (!vcpu->arch.guest_state_protected)
- vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+ vcpu->arch.preempted_in_kernel = kvm_arch_vcpu_in_kernel(vcpu);
/*
* Take the srcu lock as memslots will be accessed to check the gfn
@@ -5512,18 +5560,23 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
- unsigned long val;
+ unsigned int i;
memset(dbgregs, 0, sizeof(*dbgregs));
- memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
- kvm_get_dr(vcpu, 6, &val);
- dbgregs->dr6 = val;
+
+ BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+ dbgregs->db[i] = vcpu->arch.db[i];
+
+ dbgregs->dr6 = vcpu->arch.dr6;
dbgregs->dr7 = vcpu->arch.dr7;
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
+ unsigned int i;
+
if (dbgregs->flags)
return -EINVAL;
@@ -5532,7 +5585,9 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (!kvm_dr7_valid(dbgregs->dr7))
return -EINVAL;
- memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++)
+ vcpu->arch.db[i] = dbgregs->db[i];
+
kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = dbgregs->dr6;
vcpu->arch.dr7 = dbgregs->dr7;
@@ -8180,10 +8235,9 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
}
-static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
- unsigned long *dest)
+static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr)
{
- kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+ return kvm_get_dr(emul_to_vcpu(ctxt), dr);
}
static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
@@ -8405,12 +8459,9 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
}
-static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
- u32 pmc)
+static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc)
{
- if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
- return 0;
- return -EINVAL;
+ return kvm_pmu_check_rdpmc_early(emul_to_vcpu(ctxt), pmc);
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -8542,7 +8593,7 @@ static const struct x86_emulate_ops emulate_ops = {
.set_msr_with_filter = emulator_set_msr_with_filter,
.get_msr_with_filter = emulator_get_msr_with_filter,
.get_msr = emulator_get_msr,
- .check_pmc = emulator_check_pmc,
+ .check_rdpmc_early = emulator_check_rdpmc_early,
.read_pmc = emulator_read_pmc,
.halt = emulator_halt,
.wbinvd = emulator_wbinvd,
@@ -8803,31 +8854,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_release_pfn_clean(pfn);
- /* The instructions are well-emulated on direct mmu. */
- if (vcpu->arch.mmu->root_role.direct) {
- unsigned int indirect_shadow_pages;
-
- write_lock(&vcpu->kvm->mmu_lock);
- indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
- write_unlock(&vcpu->kvm->mmu_lock);
-
- if (indirect_shadow_pages)
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
-
- return true;
- }
-
/*
- * if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-enter the
- * guest to let CPU execute the instruction.
+ * If emulation may have been triggered by a write to a shadowed page
+ * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
+ * guest to let the CPU re-execute the instruction in the hope that the
+ * CPU can cleanly execute the instruction that KVM failed to emulate.
*/
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+ if (vcpu->kvm->arch.indirect_shadow_pages)
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
/*
- * If the access faults on its page table, it can not
- * be fixed by unprotecting shadow page and it should
- * be reported to userspace.
+ * If the failed instruction faulted on an access to page tables that
+ * are used to translate any part of the instruction, KVM can't resolve
+ * the issue by unprotecting the gfn, as zapping the shadow page will
+ * result in the instruction taking a !PRESENT page fault and thus put
+ * the vCPU into an infinite loop of page faults. E.g. KVM will create
+ * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and
+ * then zap the SPTE to unprotect the gfn, and then do it all over
+ * again. Report the error to userspace.
*/
return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
}
@@ -8922,7 +8966,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (unlikely(!r))
return 0;
- kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+ kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
/*
* rflags is the old, "raw" value of the flags. The new value has
@@ -9235,9 +9279,9 @@ writeback:
*/
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
- kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
+ kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
if (ctxt->is_branch)
- kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+ kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
@@ -9648,11 +9692,13 @@ static void kvm_x86_check_cpu_compat(void *ret)
*(int *)ret = kvm_x86_check_processor_compatibility();
}
-static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
{
u64 host_pat;
int r, cpu;
+ guard(mutex)(&vendor_module_lock);
+
if (kvm_x86_ops.hardware_enable) {
pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
@@ -9782,17 +9828,6 @@ out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
return r;
}
-
-int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
-{
- int r;
-
- mutex_lock(&vendor_module_lock);
- r = __kvm_x86_vendor_init(ops);
- mutex_unlock(&vendor_module_lock);
-
- return r;
-}
EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
void kvm_x86_vendor_exit(void)
@@ -10689,12 +10724,6 @@ static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
}
-void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- smp_send_reschedule(vcpu->cpu);
-}
-EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
-
/*
* Called within kvm->srcu read side.
* Returns 1 to let vcpu_run() continue the guest execution loop without
@@ -10944,10 +10973,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
- if (req_immediate_exit) {
+ if (req_immediate_exit)
kvm_make_request(KVM_REQ_EVENT, vcpu);
- static_call(kvm_x86_request_immediate_exit)(vcpu);
- }
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -10978,7 +11005,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
+ exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -12065,7 +12092,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -12076,27 +12103,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (r < 0)
return r;
- if (irqchip_in_kernel(vcpu->kvm)) {
- r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
- if (r < 0)
- goto fail_mmu_destroy;
-
- /*
- * Defer evaluating inhibits until the vCPU is first run, as
- * this vCPU will not get notified of any changes until this
- * vCPU is visible to other vCPUs (marked online and added to
- * the set of vCPUs). Opportunistically mark APICv active as
- * VMX in particularly is highly unlikely to have inhibits.
- * Ignore the current per-VM APICv state so that vCPU creation
- * is guaranteed to run with a deterministic value, the request
- * will ensure the vCPU gets the correct state before VM-Entry.
- */
- if (enable_apicv) {
- vcpu->arch.apic->apicv_active = true;
- kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
- }
- } else
- static_branch_inc(&kvm_has_noapic_vcpu);
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ if (r < 0)
+ goto fail_mmu_destroy;
r = -ENOMEM;
@@ -12217,8 +12226,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
kvfree(vcpu->arch.cpuid_entries);
- if (!lapic_in_kernel(vcpu))
- static_branch_dec(&kvm_has_noapic_vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -12495,9 +12502,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
-EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
-
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -13100,11 +13104,13 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
- if (kvm_vcpu_apicv_active(vcpu) &&
- static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
- return true;
+ return kvm_vcpu_apicv_active(vcpu) &&
+ static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu);
+}
- return false;
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.preempted_in_kernel;
}
bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
@@ -13127,9 +13133,6 @@ bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return true;
- if (vcpu != kvm_get_running_vcpu())
- return vcpu->arch.preempted_in_kernel;
-
return static_call(kvm_x86_get_cpl)(vcpu) == 0;
}
@@ -13924,9 +13927,6 @@ module_init(kvm_x86_init);
static void __exit kvm_x86_exit(void)
{
- /*
- * If module_init() is implemented, module_exit() must also be
- * implemented to allow module unload.
- */
+ WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu));
}
module_exit(kvm_x86_exit);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2f7e19166658..a8b71803777b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -294,6 +294,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
+bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp);
int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
gva_t addr, void *val, unsigned int bytes,
@@ -431,12 +432,6 @@ static inline bool kvm_notify_vmexit_enabled(struct kvm *kvm)
return kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_ENABLED;
}
-enum kvm_intr_type {
- /* Values are arbitrary, but must be non-zero. */
- KVM_HANDLING_IRQ = 1,
- KVM_HANDLING_NMI,
-};
-
static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
enum kvm_intr_type intr)
{
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 4b4e738c6f1b..f65b35a05d91 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -10,7 +10,7 @@
#include "x86.h"
#include "xen.h"
#include "hyperv.h"
-#include "lapic.h"
+#include "irq.h"
#include <linux/eventfd.h>
#include <linux/kvm_host.h>
@@ -24,6 +24,7 @@
#include <xen/interface/sched.h>
#include <asm/xen/cpuid.h>
+#include <asm/pvclock.h>
#include "cpuid.h"
#include "trace.h"
@@ -34,41 +35,32 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
-static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
+static int kvm_xen_shared_info_init(struct kvm *kvm)
{
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
struct pvclock_wall_clock *wc;
- gpa_t gpa = gfn_to_gpa(gfn);
u32 *wc_sec_hi;
u32 wc_version;
u64 wall_nsec;
int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
- if (gfn == KVM_XEN_INVALID_GFN) {
- kvm_gpc_deactivate(gpc);
- goto out;
- }
+ read_lock_irq(&gpc->lock);
+ while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
+ read_unlock_irq(&gpc->lock);
- do {
- ret = kvm_gpc_activate(gpc, gpa, PAGE_SIZE);
+ ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
if (ret)
goto out;
- /*
- * This code mirrors kvm_write_wall_clock() except that it writes
- * directly through the pfn cache and doesn't mark the page dirty.
- */
- wall_nsec = kvm_get_wall_clock_epoch(kvm);
-
- /* It could be invalid again already, so we need to check */
read_lock_irq(&gpc->lock);
+ }
- if (gpc->valid)
- break;
-
- read_unlock_irq(&gpc->lock);
- } while (1);
+ /*
+ * This code mirrors kvm_write_wall_clock() except that it writes
+ * directly through the pfn cache and doesn't mark the page dirty.
+ */
+ wall_nsec = kvm_get_wall_clock_epoch(kvm);
/* Paranoia checks on the 32-bit struct layout */
BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
@@ -158,8 +150,93 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
+ bool linux_wa)
{
+ int64_t kernel_now, delta;
+ uint64_t guest_now;
+
+ /*
+ * The guest provides the requested timeout in absolute nanoseconds
+ * of the KVM clock — as *it* sees it, based on the scaled TSC and
+ * the pvclock information provided by KVM.
+ *
+ * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
+ * so use CLOCK_MONOTONIC. In the timescales covered by timers, the
+ * difference won't matter much as there is no cumulative effect.
+ *
+ * Calculate the time for some arbitrary point in time around "now"
+ * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
+ * delta between the kvmclock "now" value and the guest's requested
+ * timeout, apply the "Linux workaround" described below, and add
+ * the resulting delta to the CLOCK_MONOTONIC "now" value, to get
+ * the absolute CLOCK_MONOTONIC time at which the timer should
+ * fire.
+ */
+ if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
+ static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ uint64_t host_tsc, guest_tsc;
+
+ if (!IS_ENABLED(CONFIG_64BIT) ||
+ !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
+ /*
+ * Don't fall back to get_kvmclock_ns() because it's
+ * broken; it has a systemic error in its results
+ * because it scales directly from host TSC to
+ * nanoseconds, and doesn't scale first to guest TSC
+ * and *then* to nanoseconds as the guest does.
+ *
+ * There is a small error introduced here because time
+ * continues to elapse between the ktime_get() and the
+ * subsequent rdtsc(). But not the systemic drift due
+ * to get_kvmclock_ns().
+ */
+ kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
+ host_tsc = rdtsc();
+ }
+
+ /* Calculate the guest kvmclock as the guest would do it. */
+ guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
+ guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
+ guest_tsc);
+ } else {
+ /*
+ * Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
+ *
+ * Also if the guest PV clock hasn't been set up yet, as is
+ * likely to be the case during migration when the vCPU has
+ * not been run yet. It would be possible to calculate the
+ * scaling factors properly in that case but there's not much
+ * point in doing so. The get_kvmclock_ns() drift accumulates
+ * over time, so it's OK to use it at startup. Besides, on
+ * migration there's going to be a little bit of skew in the
+ * precise moment at which timers fire anyway. Often they'll
+ * be in the "past" by the time the VM is running again after
+ * migration.
+ */
+ guest_now = get_kvmclock_ns(vcpu->kvm);
+ kernel_now = ktime_get();
+ }
+
+ delta = guest_abs - guest_now;
+
+ /*
+ * Xen has a 'Linux workaround' in do_set_timer_op() which checks for
+ * negative absolute timeout values (caused by integer overflow), and
+ * for values about 13 days in the future (2^50ns) which would be
+ * caused by jiffies overflow. For those cases, Xen sets the timeout
+ * 100ms in the future (not *too* soon, since if a guest really did
+ * set a long timeout on purpose we don't want to keep churning CPU
+ * time by waking it up). Emulate Xen's workaround when starting the
+ * timer in response to __HYPERVISOR_set_timer_op.
+ */
+ if (linux_wa &&
+ unlikely((int64_t)guest_abs < 0 ||
+ (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
+ delta = 100 * NSEC_PER_MSEC;
+ guest_abs = guest_now + delta;
+ }
+
/*
* Avoid races with the old timer firing. Checking timer_expires
* to avoid calling hrtimer_cancel() will only have false positives
@@ -171,14 +248,12 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_
atomic_set(&vcpu->arch.xen.timer_pending, 0);
vcpu->arch.xen.timer_expires = guest_abs;
- if (delta_ns <= 0) {
+ if (delta <= 0)
xen_timer_callback(&vcpu->arch.xen.timer);
- } else {
- ktime_t ktime_now = ktime_get();
+ else
hrtimer_start(&vcpu->arch.xen.timer,
- ktime_add_ns(ktime_now, delta_ns),
+ ktime_add_ns(kernel_now, delta),
HRTIMER_MODE_ABS_HARD);
- }
}
static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
@@ -452,14 +527,13 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
smp_wmb();
}
- if (user_len2)
+ if (user_len2) {
+ kvm_gpc_mark_dirty_in_slot(gpc2);
read_unlock(&gpc2->lock);
+ }
+ kvm_gpc_mark_dirty_in_slot(gpc1);
read_unlock_irqrestore(&gpc1->lock, flags);
-
- mark_page_dirty_in_slot(v->kvm, gpc1->memslot, gpc1->gpa >> PAGE_SHIFT);
- if (user_len2)
- mark_page_dirty_in_slot(v->kvm, gpc2->memslot, gpc2->gpa >> PAGE_SHIFT);
}
void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
@@ -493,10 +567,9 @@ void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}
-static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
{
struct kvm_lapic_irq irq = { };
- int r;
irq.dest_id = v->vcpu_id;
irq.vector = v->arch.xen.upcall_vector;
@@ -505,8 +578,7 @@ static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
irq.delivery_mode = APIC_DM_FIXED;
irq.level = 1;
- /* The fast version will always work for physical unicast */
- WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL));
+ kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL);
}
/*
@@ -565,13 +637,13 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
: "0" (evtchn_pending_sel32));
WRITE_ONCE(vi->evtchn_upcall_pending, 1);
}
+
+ kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);
/* For the per-vCPU lapic vector, deliver it as MSI. */
if (v->arch.xen.upcall_vector)
kvm_xen_inject_vcpu_vector(v);
-
- mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
}
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
@@ -635,17 +707,59 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
} else {
mutex_lock(&kvm->arch.xen.xen_lock);
kvm->arch.xen.long_mode = !!data->u.long_mode;
+
+ /*
+ * Re-initialize shared_info to put the wallclock in the
+ * correct place. Whilst it's not necessary to do this
+ * unless the mode is actually changed, it does no harm
+ * to make the call anyway.
+ */
+ r = kvm->arch.xen.shinfo_cache.active ?
+ kvm_xen_shared_info_init(kvm) : 0;
mutex_unlock(&kvm->arch.xen.xen_lock);
- r = 0;
}
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: {
+ int idx;
+
mutex_lock(&kvm->arch.xen.xen_lock);
- r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) {
+ gfn_t gfn = data->u.shared_info.gfn;
+
+ if (gfn == KVM_XEN_INVALID_GFN) {
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+ r = 0;
+ } else {
+ r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache,
+ gfn_to_gpa(gfn), PAGE_SIZE);
+ }
+ } else {
+ void __user * hva = u64_to_user_ptr(data->u.shared_info.hva);
+
+ if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) {
+ r = -EINVAL;
+ } else if (!hva) {
+ kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
+ r = 0;
+ } else {
+ r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache,
+ (unsigned long)hva, PAGE_SIZE);
+ }
+ }
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (!r && kvm->arch.xen.shinfo_cache.active)
+ r = kvm_xen_shared_info_init(kvm);
+
mutex_unlock(&kvm->arch.xen.xen_lock);
break;
-
+ }
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
@@ -699,13 +813,21 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- if (kvm->arch.xen.shinfo_cache.active)
+ if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache))
data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
else
data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
r = 0;
break;
+ case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA:
+ if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache))
+ data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva;
+ else
+ data->u.shared_info.hva = 0;
+ r = 0;
+ break;
+
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
data->u.vector = kvm->arch.xen.upcall_vector;
r = 0;
@@ -742,20 +864,33 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
/* No compat necessary here. */
BUILD_BUG_ON(sizeof(struct vcpu_info) !=
sizeof(struct compat_vcpu_info));
BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
offsetof(struct compat_vcpu_info, time));
- if (data->u.gpa == KVM_XEN_INVALID_GPA) {
- kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
- r = 0;
- break;
+ if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) {
+ if (data->u.gpa == KVM_XEN_INVALID_GPA) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.gpa, sizeof(struct vcpu_info));
+ } else {
+ if (data->u.hva == 0) {
+ kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
+ r = 0;
+ break;
+ }
+
+ r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache,
+ data->u.hva, sizeof(struct vcpu_info));
}
- r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
- data->u.gpa, sizeof(struct vcpu_info));
if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -944,9 +1079,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
/* Start the timer if the new value has a valid vector+expiry. */
if (data->u.timer.port && data->u.timer.expires_ns)
- kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
- data->u.timer.expires_ns -
- get_kvmclock_ns(vcpu->kvm));
+ kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);
r = 0;
break;
@@ -977,13 +1110,21 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
- if (vcpu->arch.xen.vcpu_info_cache.active)
+ if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache))
data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
else
data->u.gpa = KVM_XEN_INVALID_GPA;
r = 0;
break;
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
+ if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache))
+ data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva;
+ else
+ data->u.hva = 0;
+ r = 0;
+ break;
+
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
if (vcpu->arch.xen.vcpu_time_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
@@ -1093,9 +1234,24 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
u32 page_num = data & ~PAGE_MASK;
u64 page_addr = data & PAGE_MASK;
bool lm = is_long_mode(vcpu);
+ int r = 0;
+
+ mutex_lock(&kvm->arch.xen.xen_lock);
+ if (kvm->arch.xen.long_mode != lm) {
+ kvm->arch.xen.long_mode = lm;
+
+ /*
+ * Re-initialize shared_info to put the wallclock in the
+ * correct place.
+ */
+ if (kvm->arch.xen.shinfo_cache.active &&
+ kvm_xen_shared_info_init(kvm))
+ r = 1;
+ }
+ mutex_unlock(&kvm->arch.xen.xen_lock);
- /* Latch long_mode for shared_info pages etc. */
- vcpu->kvm->arch.xen.long_mode = lm;
+ if (r)
+ return r;
/*
* If Xen hypercall intercept is enabled, fill the hypercall
@@ -1396,7 +1552,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
{
struct vcpu_set_singleshot_timer oneshot;
struct x86_exception e;
- s64 delta;
if (!kvm_xen_timer_enabled(vcpu))
return false;
@@ -1430,9 +1585,7 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
return true;
}
- /* A delta <= 0 results in an immediate callback, which is what we want */
- delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
- kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
+ kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false);
*r = 0;
return true;
@@ -1455,29 +1608,10 @@ static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
if (!kvm_xen_timer_enabled(vcpu))
return false;
- if (timeout) {
- uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
- int64_t delta = timeout - guest_now;
-
- /* Xen has a 'Linux workaround' in do_set_timer_op() which
- * checks for negative absolute timeout values (caused by
- * integer overflow), and for values about 13 days in the
- * future (2^50ns) which would be caused by jiffies
- * overflow. For those cases, it sets the timeout 100ms in
- * the future (not *too* soon, since if a guest really did
- * set a long timeout on purpose we don't want to keep
- * churning CPU time by waking it up).
- */
- if (unlikely((int64_t)timeout < 0 ||
- (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
- delta = 100 * NSEC_PER_MSEC;
- timeout = guest_now + delta;
- }
-
- kvm_xen_start_timer(vcpu, timeout, delta);
- } else {
+ if (timeout)
+ kvm_xen_start_timer(vcpu, timeout, true);
+ else
kvm_xen_stop_timer(vcpu);
- }
*r = 0;
return true;
@@ -1621,9 +1755,6 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
}
- if (!vcpu->arch.xen.vcpu_info_cache.active)
- return -EINVAL;
-
if (xe->port >= max_evtchn_port(kvm))
return -EINVAL;
@@ -1731,8 +1862,6 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
mm_borrowed = true;
}
- mutex_lock(&kvm->arch.xen.xen_lock);
-
/*
* It is theoretically possible for the page to be unmapped
* and the MMU notifier to invalidate the shared_info before
@@ -1760,8 +1889,6 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
srcu_read_unlock(&kvm->srcu, idx);
} while(!rc);
- mutex_unlock(&kvm->arch.xen.xen_lock);
-
if (mm_borrowed)
kthread_unuse_mm(kvm->mm);
@@ -2109,14 +2236,10 @@ void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
- kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm, NULL,
- KVM_HOST_USES_PFN);
- kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm, NULL,
- KVM_HOST_USES_PFN);
- kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm, NULL,
- KVM_HOST_USES_PFN);
- kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm, NULL,
- KVM_HOST_USES_PFN);
+ kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
+ kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm);
+ kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm);
}
void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
@@ -2159,7 +2282,7 @@ void kvm_xen_init_vm(struct kvm *kvm)
{
mutex_init(&kvm->arch.xen.xen_lock);
idr_init(&kvm->arch.xen.evtchn_ports);
- kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN);
+ kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm);
}
void kvm_xen_destroy_vm(struct kvm *kvm)
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index f8f1fe22d090..f5841d9000ae 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -18,6 +18,7 @@ extern struct static_key_false_deferred kvm_xen_enabled;
int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *vcpu);
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
@@ -36,6 +37,19 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
const struct kvm_irq_routing_entry *ue);
void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu);
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+ /*
+ * The local APIC is being enabled. If the per-vCPU upcall vector is
+ * set and the vCPU's evtchn_upcall_pending flag is set, inject the
+ * interrupt.
+ */
+ if (static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.vcpu_info_cache.active &&
+ vcpu->arch.xen.upcall_vector && __kvm_xen_has_interrupt(vcpu))
+ kvm_xen_inject_vcpu_vector(vcpu);
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return static_branch_unlikely(&kvm_xen_enabled.key) &&
@@ -101,6 +115,10 @@ static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
{
}
+static inline void kvm_xen_sw_enable_lapic(struct kvm_vcpu *vcpu)
+{
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return false;