summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r--arch/x86/kvm/vmx/capabilities.h10
-rw-r--r--arch/x86/kvm/vmx/hyperv.c1
-rw-r--r--arch/x86/kvm/vmx/hyperv.h2
-rw-r--r--arch/x86/kvm/vmx/hyperv_evmcs.h2
-rw-r--r--arch/x86/kvm/vmx/main.c170
-rw-r--r--arch/x86/kvm/vmx/nested.c446
-rw-r--r--arch/x86/kvm/vmx/nested.h8
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c272
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c4
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h101
-rw-r--r--arch/x86/kvm/vmx/sgx.c21
-rw-r--r--arch/x86/kvm/vmx/vmcs.h5
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h14
-rw-r--r--arch/x86/kvm/vmx/vmenter.S2
-rw-r--r--arch/x86/kvm/vmx/vmx.c1223
-rw-r--r--arch/x86/kvm/vmx/vmx.h37
-rw-r--r--arch/x86/kvm/vmx/vmx_onhyperv.h10
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h18
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h124
19 files changed, 1375 insertions, 1095 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 41a4533f9989..cb6588238f46 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -54,9 +54,7 @@ struct nested_vmx_msrs {
};
struct vmcs_config {
- int size;
- u32 basic_cap;
- u32 revision_id;
+ u64 basic;
u32 pin_based_exec_ctrl;
u32 cpu_based_exec_ctrl;
u32 cpu_based_2nd_exec_ctrl;
@@ -76,7 +74,7 @@ extern struct vmx_capability vmx_capability __ro_after_init;
static inline bool cpu_has_vmx_basic_inout(void)
{
- return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+ return vmcs_config.basic & VMX_BASIC_INOUT;
}
static inline bool cpu_has_virtual_nmis(void)
@@ -225,7 +223,7 @@ static inline bool cpu_has_vmx_vmfunc(void)
static inline bool cpu_has_vmx_shadow_vmcs(void)
{
/* check if the cpu supports writing r/o exit information fields */
- if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
+ if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
return false;
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -367,7 +365,7 @@ static inline bool cpu_has_vmx_invvpid_global(void)
static inline bool cpu_has_vmx_intel_pt(void)
{
- return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) &&
+ return (vmcs_config.misc & VMX_MISC_INTEL_PT) &&
(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) &&
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
}
diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
index fab6a1ad98dc..fa41d036acd4 100644
--- a/arch/x86/kvm/vmx/hyperv.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -4,6 +4,7 @@
#include <linux/errno.h>
#include <linux/smp.h>
+#include "x86.h"
#include "../cpuid.h"
#include "hyperv.h"
#include "nested.h"
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index a87407412615..11a339009781 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -42,7 +42,7 @@ static inline struct hv_enlightened_vmcs *nested_vmx_evmcs(struct vcpu_vmx *vmx)
return vmx->nested.hv_evmcs;
}
-static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu)
+static inline bool guest_cpu_cap_has_evmcs(struct kvm_vcpu *vcpu)
{
/*
* eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and
diff --git a/arch/x86/kvm/vmx/hyperv_evmcs.h b/arch/x86/kvm/vmx/hyperv_evmcs.h
index a543fccfc574..6536290f4274 100644
--- a/arch/x86/kvm/vmx/hyperv_evmcs.h
+++ b/arch/x86/kvm/vmx/hyperv_evmcs.h
@@ -6,7 +6,7 @@
#ifndef __KVM_X86_VMX_HYPERV_EVMCS_H
#define __KVM_X86_VMX_HYPERV_EVMCS_H
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include "capabilities.h"
#include "vmcs12.h"
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
new file mode 100644
index 000000000000..43ee9ed11291
--- /dev/null
+++ b/arch/x86/kvm/vmx/main.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/moduleparam.h>
+
+#include "x86_ops.h"
+#include "vmx.h"
+#include "nested.h"
+#include "pmu.h"
+#include "posted_intr.h"
+
+#define VMX_REQUIRED_APICV_INHIBITS \
+ (BIT(APICV_INHIBIT_REASON_DISABLED) | \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED))
+
+struct kvm_x86_ops vt_x86_ops __initdata = {
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = vmx_check_processor_compat,
+
+ .hardware_unsetup = vmx_hardware_unsetup,
+
+ .enable_virtualization_cpu = vmx_enable_virtualization_cpu,
+ .disable_virtualization_cpu = vmx_disable_virtualization_cpu,
+ .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,
+
+ .has_emulated_msr = vmx_has_emulated_msr,
+
+ .vm_size = sizeof(struct kvm_vmx),
+ .vm_init = vmx_vm_init,
+ .vm_destroy = vmx_vm_destroy,
+
+ .vcpu_precreate = vmx_vcpu_precreate,
+ .vcpu_create = vmx_vcpu_create,
+ .vcpu_free = vmx_vcpu_free,
+ .vcpu_reset = vmx_vcpu_reset,
+
+ .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
+ .update_exception_bitmap = vmx_update_exception_bitmap,
+ .get_feature_msr = vmx_get_feature_msr,
+ .get_msr = vmx_get_msr,
+ .set_msr = vmx_set_msr,
+ .get_segment_base = vmx_get_segment_base,
+ .get_segment = vmx_get_segment,
+ .set_segment = vmx_set_segment,
+ .get_cpl = vmx_get_cpl,
+ .get_cpl_no_cache = vmx_get_cpl_no_cache,
+ .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .is_valid_cr0 = vmx_is_valid_cr0,
+ .set_cr0 = vmx_set_cr0,
+ .is_valid_cr4 = vmx_is_valid_cr4,
+ .set_cr4 = vmx_set_cr4,
+ .set_efer = vmx_set_efer,
+ .get_idt = vmx_get_idt,
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+ .set_dr6 = vmx_set_dr6,
+ .set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+ .cache_reg = vmx_cache_reg,
+ .get_rflags = vmx_get_rflags,
+ .set_rflags = vmx_set_rflags,
+ .get_if_flag = vmx_get_if_flag,
+
+ .flush_tlb_all = vmx_flush_tlb_all,
+ .flush_tlb_current = vmx_flush_tlb_current,
+ .flush_tlb_gva = vmx_flush_tlb_gva,
+ .flush_tlb_guest = vmx_flush_tlb_guest,
+
+ .vcpu_pre_run = vmx_vcpu_pre_run,
+ .vcpu_run = vmx_vcpu_run,
+ .handle_exit = vmx_handle_exit,
+ .skip_emulated_instruction = vmx_skip_emulated_instruction,
+ .update_emulated_instruction = vmx_update_emulated_instruction,
+ .set_interrupt_shadow = vmx_set_interrupt_shadow,
+ .get_interrupt_shadow = vmx_get_interrupt_shadow,
+ .patch_hypercall = vmx_patch_hypercall,
+ .inject_irq = vmx_inject_irq,
+ .inject_nmi = vmx_inject_nmi,
+ .inject_exception = vmx_inject_exception,
+ .cancel_injection = vmx_cancel_injection,
+ .interrupt_allowed = vmx_interrupt_allowed,
+ .nmi_allowed = vmx_nmi_allowed,
+ .get_nmi_mask = vmx_get_nmi_mask,
+ .set_nmi_mask = vmx_set_nmi_mask,
+ .enable_nmi_window = vmx_enable_nmi_window,
+ .enable_irq_window = vmx_enable_irq_window,
+ .update_cr8_intercept = vmx_update_cr8_intercept,
+
+ .x2apic_icr_is_split = false,
+ .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
+ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
+ .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
+ .hwapic_isr_update = vmx_hwapic_isr_update,
+ .sync_pir_to_irr = vmx_sync_pir_to_irr,
+ .deliver_interrupt = vmx_deliver_interrupt,
+ .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
+
+ .set_tss_addr = vmx_set_tss_addr,
+ .set_identity_map_addr = vmx_set_identity_map_addr,
+ .get_mt_mask = vmx_get_mt_mask,
+
+ .get_exit_info = vmx_get_exit_info,
+ .get_entry_info = vmx_get_entry_info,
+
+ .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
+ .write_tsc_offset = vmx_write_tsc_offset,
+ .write_tsc_multiplier = vmx_write_tsc_multiplier,
+
+ .load_mmu_pgd = vmx_load_mmu_pgd,
+
+ .check_intercept = vmx_check_intercept,
+ .handle_exit_irqoff = vmx_handle_exit_irqoff,
+
+ .cpu_dirty_log_size = PML_LOG_NR_ENTRIES,
+ .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
+
+ .nested_ops = &vmx_nested_ops,
+
+ .pi_update_irte = vmx_pi_update_irte,
+ .pi_start_assignment = vmx_pi_start_assignment,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+ .setup_mce = vmx_setup_mce,
+
+#ifdef CONFIG_KVM_SMM
+ .smi_allowed = vmx_smi_allowed,
+ .enter_smm = vmx_enter_smm,
+ .leave_smm = vmx_leave_smm,
+ .enable_smi_window = vmx_enable_smi_window,
+#endif
+
+ .check_emulate_instruction = vmx_check_emulate_instruction,
+ .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+ .migrate_timers = vmx_migrate_timers,
+
+ .msr_filter_changed = vmx_msr_filter_changed,
+ .complete_emulated_msr = kvm_complete_insn_gp,
+
+ .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
+
+ .get_untagged_addr = vmx_get_untagged_addr,
+};
+
+struct kvm_x86_init_ops vt_init_ops __initdata = {
+ .hardware_setup = vmx_hardware_setup,
+ .handle_intel_pt_intr = NULL,
+
+ .runtime_ops = &vt_x86_ops,
+ .pmu_ops = &intel_pmu_ops,
+};
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6329a306856b..ed8a3cb53961 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7,15 +7,16 @@
#include <asm/debugreg.h>
#include <asm/mmu_context.h>
+#include "x86.h"
#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
+#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
-#include "x86.h"
#include "smm.h"
static bool __read_mostly enable_shadow_vmcs = 1;
@@ -230,11 +231,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (nested_vmx_is_evmptr12_valid(vmx)) {
- kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
- vmx->nested.hv_evmcs = NULL;
- }
-
+ kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
+ vmx->nested.hv_evmcs = NULL;
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
if (hv_vcpu) {
@@ -259,7 +257,7 @@ static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
* state. It is possible that the area will stay mapped as
* vmx->nested.hv_evmcs but this shouldn't be a problem.
*/
- if (!guest_cpuid_has_evmcs(vcpu) ||
+ if (!guest_cpu_cap_has_evmcs(vcpu) ||
!evmptr_is_valid(nested_get_evmptr(vcpu)))
return false;
@@ -316,6 +314,16 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
vcpu->arch.regs_dirty = 0;
}
+static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map);
+ vmx->nested.pi_desc = NULL;
+}
+
/*
* Free whatever needs to be freed from vmx->nested when L1 goes down, or
* just stops using VMX.
@@ -348,15 +356,8 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->nested.cached_vmcs12 = NULL;
kfree(vmx->nested.cached_shadow_vmcs12);
vmx->nested.cached_shadow_vmcs12 = NULL;
- /*
- * Unpin physical memory we referred to in the vmcs02. The APIC access
- * page's backing page (yeah, confusing) shouldn't actually be accessed,
- * and if it is written, the contents are irrelevant.
- */
- kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
- kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
- kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
- vmx->nested.pi_desc = NULL;
+
+ nested_put_vmcs12_pages(vcpu);
kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
@@ -409,18 +410,40 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long exit_qualification;
u32 vm_exit_reason;
- unsigned long exit_qualification = vcpu->arch.exit_qualification;
if (vmx->nested.pml_full) {
vm_exit_reason = EXIT_REASON_PML_FULL;
vmx->nested.pml_full = false;
- exit_qualification &= INTR_INFO_UNBLOCK_NMI;
+
+ /*
+ * It should be impossible to trigger a nested PML Full VM-Exit
+ * for anything other than an EPT Violation from L2. KVM *can*
+ * trigger nEPT page fault injection in response to an EPT
+ * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT
+ * tables also changed, but KVM should not treat EPT Misconfig
+ * VM-Exits as writes.
+ */
+ WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION);
+
+ /*
+ * PML Full and EPT Violation VM-Exits both use bit 12 to report
+ * "NMI unblocking due to IRET", i.e. the bit can be propagated
+ * as-is from the original EXIT_QUALIFICATION.
+ */
+ exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI;
} else {
- if (fault->error_code & PFERR_RSVD_MASK)
+ if (fault->error_code & PFERR_RSVD_MASK) {
vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
- else
+ exit_qualification = 0;
+ } else {
+ exit_qualification = fault->exit_qualification;
+ exit_qualification |= vmx_get_exit_qual(vcpu) &
+ (EPT_VIOLATION_GVA_IS_VALID |
+ EPT_VIOLATION_GVA_TRANSLATED);
vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+ }
/*
* Although the caller (kvm_inject_emulated_page_fault) would
@@ -601,7 +624,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
int msr;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
- struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
+ struct kvm_host_map map;
/* Nothing to do if the MSR bitmap is not in use. */
if (!cpu_has_vmx_msr_bitmap() ||
@@ -624,10 +647,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
return true;
}
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
+ if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map))
return false;
- msr_bitmap_l1 = (unsigned long *)map->hva;
+ msr_bitmap_l1 = (unsigned long *)map.hva;
/*
* To keep the control flow simple, pay eight 8-byte writes (sixteen
@@ -691,7 +714,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
MSR_IA32_FLUSH_CMD, MSR_TYPE_W);
- kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
+ kvm_vcpu_unmap(vcpu, &map);
vmx->nested.force_msr_bitmap_recalc = false;
@@ -958,7 +981,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
__func__, i, e.index, e.reserved);
goto fail;
}
- if (kvm_set_msr(vcpu, e.index, e.value)) {
+ if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
pr_debug_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
__func__, i, e.index, e.value);
@@ -994,7 +1017,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
}
}
- if (kvm_get_msr(vcpu, msr_index, data)) {
+ if (kvm_get_msr_with_filter(vcpu, msr_index, data)) {
pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
msr_index);
return false;
@@ -1089,9 +1112,9 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
/*
* Emulated VMEntry does not fail here. Instead a less
* accurate value will be returned by
- * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
- * instead of reading the value from the vmcs02 VMExit
- * MSR-store area.
+ * nested_vmx_get_vmexit_msr_value() by reading KVM's
+ * internal MSR state instead of reading the value from
+ * the vmcs02 VMExit MSR-store area.
*/
pr_warn_ratelimited(
"Not enough msr entries in msr_autostore. Can't add msr %x\n",
@@ -1174,11 +1197,14 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);
/*
- * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
- * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
- * full TLB flush from the guest's perspective. This is required even
- * if VPID is disabled in the host as KVM may need to synchronize the
- * MMU in response to the guest TLB flush.
+ * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
+ * same VPID as the host, and so architecturally, linear and combined
+ * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM
+ * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2,
+ * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This
+ * is required if VPID is disabled in KVM, as a TLB flush (there are no
+ * VPIDs) still occurs from L1's perspective, and KVM may need to
+ * synchronize the MMU in response to the guest TLB flush.
*
* Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
* EPT is a special snowflake, as guest-physical mappings aren't
@@ -1228,21 +1254,32 @@ static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
- const u64 feature_and_reserved =
- /* feature (except bit 48; see below) */
- BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
- /* reserved */
- BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
+ const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
+ VMX_BASIC_INOUT |
+ VMX_BASIC_TRUE_CTLS;
+
+ const u64 reserved_bits = GENMASK_ULL(63, 56) |
+ GENMASK_ULL(47, 45) |
+ BIT_ULL(31);
+
u64 vmx_basic = vmcs_config.nested.basic;
- if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
+ BUILD_BUG_ON(feature_bits & reserved_bits);
+
+ /*
+ * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
+ * inverted polarity), the incoming value must not set feature bits or
+ * reserved bits that aren't allowed/supported by KVM. Fields, i.e.
+ * multi-bit values, are explicitly checked below.
+ */
+ if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
return -EINVAL;
/*
* KVM does not emulate a version of VMX that constrains physical
* addresses of VMX structures (e.g. VMCS) to 32-bits.
*/
- if (data & BIT_ULL(48))
+ if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
return -EINVAL;
if (vmx_basic_vmcs_revision_id(vmx_basic) !=
@@ -1311,16 +1348,29 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
- const u64 feature_and_reserved_bits =
- /* feature */
- BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
- BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
- /* reserved */
- GENMASK_ULL(13, 9) | BIT_ULL(31);
+ const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
+ VMX_MISC_ACTIVITY_HLT |
+ VMX_MISC_ACTIVITY_SHUTDOWN |
+ VMX_MISC_ACTIVITY_WAIT_SIPI |
+ VMX_MISC_INTEL_PT |
+ VMX_MISC_RDMSR_IN_SMM |
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_VMXOFF_BLOCK_SMI |
+ VMX_MISC_ZERO_LEN_INS;
+
+ const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);
+
u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
vmcs_config.nested.misc_high);
- if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
+ BUILD_BUG_ON(feature_bits & reserved_bits);
+
+ /*
+ * The incoming value must not set feature bits or reserved bits that
+ * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are
+ * explicitly checked below.
+ */
+ if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
return -EINVAL;
if ((vmx->nested.msrs.pinbased_ctls_high &
@@ -2039,7 +2089,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
bool evmcs_gpa_changed = false;
u64 evmcs_gpa;
- if (likely(!guest_cpuid_has_evmcs(vcpu)))
+ if (likely(!guest_cpu_cap_has_evmcs(vcpu)))
return EVMPTRLD_DISABLED;
evmcs_gpa = nested_get_evmptr(vcpu);
@@ -2220,6 +2270,9 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
vmcs_write64(EPT_POINTER,
construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));
+
/* All VMFUNCs are currently emulated through L0 vmexits. */
if (cpu_has_vmx_vmfunc())
vmcs_write64(VM_FUNCTION_CONTROL, 0);
@@ -2265,6 +2318,17 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
+ /*
+ * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
+ * same VPID as the host. Emulate this behavior by using vpid01 for L2
+ * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter
+ * and VM-Exit are architecturally required to flush VPID=0, but *only*
+ * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the
+ * required flushes), but doing so would cause KVM to over-flush. E.g.
+ * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
+ * and then runs L2 X again, then KVM can and should retain TLB entries
+ * for VPID12=1.
+ */
if (enable_vpid) {
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
@@ -2291,10 +2355,12 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
/* Posted interrupts setting is only taken from vmcs12. */
vmx->nested.pi_pending = false;
- if (nested_cpu_has_posted_intr(vmcs12))
+ if (nested_cpu_has_posted_intr(vmcs12)) {
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- else
+ } else {
+ vmx->nested.posted_intr_nv = -1;
exec_control &= ~PIN_BASED_POSTED_INTR;
+ }
pin_controls_set(vmx, exec_control);
/*
@@ -2400,7 +2466,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
if (cpu_has_load_ia32_efer()) {
if (guest_efer & EFER_LMA)
exec_control |= VM_ENTRY_IA32E_MODE;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
exec_control |= VM_ENTRY_LOAD_IA32_EFER;
}
vm_entry_controls_set(vmx, exec_control);
@@ -2413,7 +2479,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
* bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
exec_control = __vm_exit_controls_get(vmcs01);
- if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
+ if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER;
else
exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
@@ -2444,6 +2510,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
@@ -2481,7 +2548,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
- vmx->segment_cache.bitmask = 0;
+ vmx_segment_cache_clear(vmx);
}
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -2925,7 +2992,7 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
return -EINVAL;
#ifdef CONFIG_KVM_HYPERV
- if (guest_cpuid_has_evmcs(vcpu))
+ if (guest_cpu_cap_has_evmcs(vcpu))
return nested_evmcs_check_controls(vmcs12);
#endif
@@ -2943,6 +3010,17 @@ static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
return 0;
}
+static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12)
+{
+ /*
+ * Check that the given linear address is canonical after a VM exit
+ * from L2, based on HOST_CR4.LA57 value that will be loaded for L1.
+ */
+ u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 57 : 48;
+
+ return !__is_canonical_address(la, l1_address_bits_on_exit);
+}
+
static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
@@ -2953,8 +3031,8 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3)))
return -EINVAL;
- if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
+ if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
+ CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
return -EINVAL;
if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
@@ -2988,12 +3066,12 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
CC(vmcs12->host_ss_selector == 0 && !ia32e))
return -EINVAL;
- if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
- CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
+ if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) ||
+ CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) ||
+ CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12)))
return -EINVAL;
/*
@@ -3111,7 +3189,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
}
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
- (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
+ (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
return -EINVAL;
@@ -3209,7 +3287,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
* L2 was running), map it here to make sure vmcs12 changes are
* properly reflected.
*/
- if (guest_cpuid_has_evmcs(vcpu) &&
+ if (guest_cpu_cap_has_evmcs(vcpu) &&
vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
enum nested_evmptrld_status evmptrld_status =
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
@@ -3364,7 +3442,7 @@ static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
if (!nested_cpu_has_pml(vmcs12))
return 0;
- if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
+ if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) {
vmx->nested.pml_full = true;
return 1;
}
@@ -3403,14 +3481,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
return 1;
}
-static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
-{
- u8 rvi = vmx_get_rvi();
- u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
-
- return ((rvi & 0xf0) > (vppr & 0xf0));
-}
-
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12);
@@ -3430,7 +3500,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
enum vm_entry_failure_code entry_failure_code;
- bool evaluate_pending_interrupts;
union vmx_exit_reason exit_reason = {
.basic = EXIT_REASON_INVALID_STATE,
.failed_vmentry = 1,
@@ -3449,13 +3518,6 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
kvm_service_local_tlb_flush_requests(vcpu);
- evaluate_pending_interrupts = exec_controls_get(vmx) &
- (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
- if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
- evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
- if (!evaluate_pending_interrupts)
- evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu);
-
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
@@ -3538,9 +3600,13 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
* Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
* when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
* effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
- * unconditionally.
+ * unconditionally. Take care to pull data from vmcs01 as appropriate,
+ * e.g. when checking for interrupt windows, as vmcs02 is now loaded.
*/
- if (unlikely(evaluate_pending_interrupts))
+ if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING)) ||
+ kvm_apic_has_pending_init_or_sipi(vcpu) ||
+ kvm_apic_has_interrupt(vcpu))
kvm_make_request(KVM_REQ_EVENT, vcpu);
/*
@@ -3606,7 +3672,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return 1;
}
- kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+ kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
return nested_vmx_failInvalid(vcpu);
@@ -3673,14 +3739,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (unlikely(status != NVMX_VMENTRY_SUCCESS))
goto vmentry_failed;
- /* Emulate processing of posted interrupts on VM-Enter. */
- if (nested_cpu_has_posted_intr(vmcs12) &&
- kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
- vmx->nested.pi_pending = true;
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
- }
-
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
@@ -3874,8 +3932,8 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
return 0;
- max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
- if (max_irr != 256) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0) {
vapic_page = vmx->nested.virtual_apic_map.hva;
if (!vapic_page)
goto mmio_needed;
@@ -4006,10 +4064,46 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->nested.preemption_timer_expired;
}
-static bool vmx_has_nested_events(struct kvm_vcpu *vcpu)
+static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
{
- return nested_vmx_preemption_timer_pending(vcpu) ||
- to_vmx(vcpu)->nested.mtf_pending;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ void *vapic = vmx->nested.virtual_apic_map.hva;
+ int max_irr, vppr;
+
+ if (nested_vmx_preemption_timer_pending(vcpu) ||
+ vmx->nested.mtf_pending)
+ return true;
+
+ /*
+ * Virtual Interrupt Delivery doesn't require manual injection. Either
+ * the interrupt is already in GUEST_RVI and will be recognized by CPU
+ * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move
+ * the interrupt from the PIR to RVI prior to entering the guest.
+ */
+ if (for_injection)
+ return false;
+
+ if (!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
+ __vmx_interrupt_blocked(vcpu))
+ return false;
+
+ if (!vapic)
+ return false;
+
+ vppr = *((u32 *)(vapic + APIC_PROCPRI));
+
+ max_irr = vmx_get_rvi();
+ if ((max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+
+ if (vmx->nested.pi_pending && vmx->nested.pi_desc &&
+ pi_test_on(vmx->nested.pi_desc)) {
+ max_irr = pi_find_highest_vector(vmx->nested.pi_desc);
+ if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0))
+ return true;
+ }
+
+ return false;
}
/*
@@ -4106,13 +4200,25 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
*/
bool block_nested_exceptions = vmx->nested.nested_run_pending;
/*
- * New events (not exceptions) are only recognized at instruction
+ * Events that don't require injection, i.e. that are virtualized by
+ * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need
+ * to regain control in order to deliver the event, and hardware will
+ * handle event ordering, e.g. with respect to injected exceptions.
+ *
+ * But, new events (not exceptions) are only recognized at instruction
* boundaries. If an event needs reinjection, then KVM is handling a
- * VM-Exit that occurred _during_ instruction execution; new events are
- * blocked until the instruction completes.
+ * VM-Exit that occurred _during_ instruction execution; new events,
+ * irrespective of whether or not they're injected, are blocked until
+ * the instruction completes.
+ */
+ bool block_non_injected_events = kvm_event_needs_reinjection(vcpu);
+ /*
+ * Inject events are blocked by nested VM-Enter, as KVM is responsible
+ * for managing priority between concurrent events, i.e. KVM needs to
+ * wait until after VM-Enter completes to deliver injected events.
*/
bool block_nested_events = block_nested_exceptions ||
- kvm_event_needs_reinjection(vcpu);
+ block_non_injected_events;
if (lapic_in_kernel(vcpu) &&
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
@@ -4222,11 +4328,71 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
}
if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
+ int irq;
+
+ if (!nested_exit_on_intr(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ goto no_vmexit;
+ }
+
+ if (!nested_exit_intr_ack_set(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+ return 0;
+ }
+
+ irq = kvm_cpu_get_extint(vcpu);
+ if (irq != -1) {
+ if (block_nested_events)
+ return -EBUSY;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+ return 0;
+ }
+
+ irq = kvm_apic_has_interrupt(vcpu);
+ if (WARN_ON_ONCE(irq < 0))
+ goto no_vmexit;
+
+ /*
+ * If the IRQ is L2's PI notification vector, process posted
+ * interrupts for L2 instead of injecting VM-Exit, as the
+ * detection/morphing architecturally occurs when the IRQ is
+ * delivered to the CPU. Note, only interrupts that are routed
+ * through the local APIC trigger posted interrupt processing,
+ * and enabling posted interrupts requires ACK-on-exit.
+ */
+ if (irq == vmx->nested.posted_intr_nv) {
+ /*
+ * Nested posted interrupts are delivered via RVI, i.e.
+ * aren't injected by KVM, and so can be queued even if
+ * manual event injection is disallowed.
+ */
+ if (block_non_injected_events)
+ return -EBUSY;
+
+ vmx->nested.pi_pending = true;
+ kvm_apic_clear_irr(vcpu, irq);
+ goto no_vmexit;
+ }
+
if (block_nested_events)
return -EBUSY;
- if (!nested_exit_on_intr(vcpu))
- goto no_vmexit;
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+
+ /*
+ * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
+ * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
+ * if APICv is active.
+ */
+ kvm_apic_ack_interrupt(vcpu, irq);
return 0;
}
@@ -4433,7 +4599,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
- kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+ vmcs12->guest_dr7 = vcpu->arch.dr7;
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
vmcs12->guest_ia32_efer = vcpu->arch.efer;
@@ -4640,7 +4806,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
return vmcs_read64(GUEST_IA32_EFER);
if (cpu_has_load_ia32_efer())
- return host_efer;
+ return kvm_host.efer;
for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
@@ -4651,7 +4817,7 @@ static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
if (efer_msr)
return efer_msr->data;
- return host_efer;
+ return kvm_host.efer;
}
static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
@@ -4744,7 +4910,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
goto vmabort;
}
- if (kvm_set_msr(vcpu, h.index, h.value)) {
+ if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) {
pr_debug_ratelimited(
"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
__func__, j, h.index, h.value);
@@ -4860,7 +5026,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
* doesn't isolate different VMCSs, i.e. in this case, doesn't provide
* separate modes for L2 vs L1.
*/
- if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL))
indirect_branch_prediction_barrier();
/* Update any VMCS fields that might have changed while L2 ran */
@@ -4883,11 +5049,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vmx_update_cpu_dirty_logging(vcpu);
}
- /* Unpin physical memory we referred to in vmcs02 */
- kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
- kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
- kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
- vmx->nested.pi_desc = NULL;
+ nested_put_vmcs12_pages(vcpu);
if (vmx->nested.reload_vmcs01_apic_access_page) {
vmx->nested.reload_vmcs01_apic_access_page = false;
@@ -4899,6 +5061,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
}
+ if (vmx->nested.update_vmcs01_hwapic_isr) {
+ vmx->nested.update_vmcs01_hwapic_isr = false;
+ kvm_apic_update_hwapic_isr(vcpu);
+ }
+
if ((vm_exit_reason != -1) &&
(enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)))
vmx->nested.need_vmcs12_to_shadow_sync = true;
@@ -4907,14 +5074,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (likely(!vmx->fail)) {
- if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
- nested_exit_intr_ack_set(vcpu)) {
- int irq = kvm_cpu_get_interrupt(vcpu);
- WARN_ON(irq < 0);
- vmcs12->vm_exit_intr_info = irq |
- INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
- }
-
if (vm_exit_reason != -1)
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
vmcs12->exit_qualification,
@@ -4925,6 +5084,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
load_vmcs12_host_state(vcpu, vmcs12);
+ /*
+ * Process events if an injectable IRQ or NMI is pending, even
+ * if the event is blocked (RFLAGS.IF is cleared on VM-Exit).
+ * If an event became pending while L2 was active, KVM needs to
+ * either inject the event or request an IRQ/NMI window. SMIs
+ * don't need to be processed as SMM is mutually exclusive with
+ * non-root mode. INIT/SIPI don't need to be checked as INIT
+ * is blocked post-VMXON, and SIPIs are ignored.
+ */
+ if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return;
}
@@ -5031,7 +5201,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
* non-canonical form. This is the only check on the memory
* destination for long mode!
*/
- exn = is_noncanonical_address(*ret, vcpu);
+ exn = is_noncanonical_address(*ret, vcpu, 0);
} else {
/*
* When not in long mode, the virtual/linear address is
@@ -5828,6 +5998,12 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ /*
+ * Always flush the effective vpid02, i.e. never flush the current VPID
+ * and never explicitly flush vpid01. INVVPID targets a VPID, not a
+ * VMCS, and so whether or not the current vmcs12 has VPID enabled is
+ * irrelevant (and there may not be a loaded vmcs12).
+ */
vpid02 = nested_get_vpid02(vcpu);
switch (type) {
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
@@ -5836,7 +6012,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
* invalidation.
*/
if (!operand.vpid ||
- is_noncanonical_address(operand.gla, vcpu))
+ is_noncanonical_invlpg_address(operand.gla, vcpu))
return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
vpid_sync_vcpu_addr(vpid02, operand.gla);
@@ -6130,7 +6306,7 @@ static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
{
u32 encls_leaf;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING))
return false;
@@ -6208,6 +6384,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
else if (is_alignment_check(intr_info) &&
!vmx_guest_inject_ac(vcpu))
return true;
+ else if (is_ve_fault(intr_info))
+ return true;
return false;
case EXIT_REASON_EXTERNAL_INTERRUPT:
return true;
@@ -6466,7 +6644,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
vmx = to_vmx(vcpu);
vmcs12 = get_vmcs12(vcpu);
- if (guest_can_use(vcpu, X86_FEATURE_VMX) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) &&
(vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
@@ -6607,7 +6785,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
return -EINVAL;
} else {
- if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return -EINVAL;
if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
@@ -6641,7 +6819,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
- (!guest_can_use(vcpu, X86_FEATURE_VMX) ||
+ (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) ||
!vmx->nested.enlightened_vmcs_enabled))
return -EINVAL;
@@ -6987,7 +7165,7 @@ static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
{
msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
msrs->misc_low |=
- MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
VMX_MISC_ACTIVITY_HLT |
VMX_MISC_ACTIVITY_WAIT_SIPI;
@@ -7002,12 +7180,10 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
* guest, and the VMCS structure we give it - not about the
* VMX support of the underlying hardware.
*/
- msrs->basic =
- VMCS12_REVISION |
- VMX_BASIC_TRUE_CTLS |
- ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
- (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+ msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
+ X86_MEMTYPE_WB);
+ msrs->basic |= VMX_BASIC_TRUE_CTLS;
if (cpu_has_vmx_basic_inout())
msrs->basic |= VMX_BASIC_INOUT;
}
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index cce4e2aa30fb..2c296b6abb8c 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -39,11 +39,17 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+ !refcount_read(&vcpu->kvm->users_count));
+
return to_vmx(vcpu)->nested.cached_vmcs12;
}
static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+ !refcount_read(&vcpu->kvm->users_count));
+
return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
}
@@ -109,7 +115,7 @@ static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
{
return to_vmx(vcpu)->nested.msrs.misc_low &
- MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
}
static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 315c7c2ba89b..77012b2eca0e 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -20,53 +20,19 @@
#include "nested.h"
#include "pmu.h"
-#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
-
-enum intel_pmu_architectural_events {
- /*
- * The order of the architectural events matters as support for each
- * event is enumerated via CPUID using the index of the event.
- */
- INTEL_ARCH_CPU_CYCLES,
- INTEL_ARCH_INSTRUCTIONS_RETIRED,
- INTEL_ARCH_REFERENCE_CYCLES,
- INTEL_ARCH_LLC_REFERENCES,
- INTEL_ARCH_LLC_MISSES,
- INTEL_ARCH_BRANCHES_RETIRED,
- INTEL_ARCH_BRANCHES_MISPREDICTED,
-
- NR_REAL_INTEL_ARCH_EVENTS,
-
- /*
- * Pseudo-architectural event used to implement IA32_FIXED_CTR2, a.k.a.
- * TSC reference cycles. The architectural reference cycles event may
- * or may not actually use the TSC as the reference, e.g. might use the
- * core crystal clock or the bus clock (yeah, "architectural").
- */
- PSEUDO_ARCH_REFERENCE_CYCLES = NR_REAL_INTEL_ARCH_EVENTS,
- NR_INTEL_ARCH_EVENTS,
-};
+/*
+ * Perf's "BASE" is wildly misleading, architectural PMUs use bits 31:16 of ECX
+ * to encode the "type" of counter to read, i.e. this is not a "base". And to
+ * further confuse things, non-architectural PMUs use bit 31 as a flag for
+ * "fast" reads, whereas the "type" is an explicit value.
+ */
+#define INTEL_RDPMC_GP 0
+#define INTEL_RDPMC_FIXED INTEL_PMC_FIXED_RDPMC_BASE
-static struct {
- u8 eventsel;
- u8 unit_mask;
-} const intel_arch_events[] = {
- [INTEL_ARCH_CPU_CYCLES] = { 0x3c, 0x00 },
- [INTEL_ARCH_INSTRUCTIONS_RETIRED] = { 0xc0, 0x00 },
- [INTEL_ARCH_REFERENCE_CYCLES] = { 0x3c, 0x01 },
- [INTEL_ARCH_LLC_REFERENCES] = { 0x2e, 0x4f },
- [INTEL_ARCH_LLC_MISSES] = { 0x2e, 0x41 },
- [INTEL_ARCH_BRANCHES_RETIRED] = { 0xc4, 0x00 },
- [INTEL_ARCH_BRANCHES_MISPREDICTED] = { 0xc5, 0x00 },
- [PSEUDO_ARCH_REFERENCE_CYCLES] = { 0x00, 0x03 },
-};
+#define INTEL_RDPMC_TYPE_MASK GENMASK(31, 16)
+#define INTEL_RDPMC_INDEX_MASK GENMASK(15, 0)
-/* mapping between fixed pmc index and intel_arch_events array */
-static int fixed_pmc_events[] = {
- [0] = INTEL_ARCH_INSTRUCTIONS_RETIRED,
- [1] = INTEL_ARCH_CPU_CYCLES,
- [2] = PSEUDO_ARCH_REFERENCE_CYCLES,
-};
+#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
{
@@ -84,83 +50,67 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
- __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
+ __set_bit(KVM_FIXED_PMC_BASE_IDX + i, pmu->pmc_in_use);
kvm_pmu_request_counter_reprogram(pmc);
}
}
-static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
-{
- if (pmc_idx < INTEL_PMC_IDX_FIXED) {
- return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
- MSR_P6_EVNTSEL0);
- } else {
- u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
-
- return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
- }
-}
-
-static bool intel_hw_event_available(struct kvm_pmc *pmc)
-{
- struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
- u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
- int i;
-
- BUILD_BUG_ON(ARRAY_SIZE(intel_arch_events) != NR_INTEL_ARCH_EVENTS);
-
- /*
- * Disallow events reported as unavailable in guest CPUID. Note, this
- * doesn't apply to pseudo-architectural events.
- */
- for (i = 0; i < NR_REAL_INTEL_ARCH_EVENTS; i++) {
- if (intel_arch_events[i].eventsel != event_select ||
- intel_arch_events[i].unit_mask != unit_mask)
- continue;
-
- return pmu->available_event_types & BIT(i);
- }
-
- return true;
-}
-
-static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
-{
- struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- bool fixed = idx & (1u << 30);
-
- idx &= ~(3u << 30);
-
- return fixed ? idx < pmu->nr_arch_fixed_counters
- : idx < pmu->nr_arch_gp_counters;
-}
-
static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
unsigned int idx, u64 *mask)
{
+ unsigned int type = idx & INTEL_RDPMC_TYPE_MASK;
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- bool fixed = idx & (1u << 30);
struct kvm_pmc *counters;
unsigned int num_counters;
+ u64 bitmask;
+
+ /*
+ * The encoding of ECX for RDPMC is different for architectural versus
+ * non-architecturals PMUs (PMUs with version '0'). For architectural
+ * PMUs, bits 31:16 specify the PMC type and bits 15:0 specify the PMC
+ * index. For non-architectural PMUs, bit 31 is a "fast" flag, and
+ * bits 30:0 specify the PMC index.
+ *
+ * Yell and reject attempts to read PMCs for a non-architectural PMU,
+ * as KVM doesn't support such PMUs.
+ */
+ if (WARN_ON_ONCE(!pmu->version))
+ return NULL;
- idx &= ~(3u << 30);
- if (fixed) {
+ /*
+ * General Purpose (GP) PMCs are supported on all PMUs, and fixed PMCs
+ * are supported on all architectural PMUs, i.e. on all virtual PMUs
+ * supported by KVM. Note, KVM only emulates fixed PMCs for PMU v2+,
+ * but the type itself is still valid, i.e. let RDPMC fail due to
+ * accessing a non-existent counter. Reject attempts to read all other
+ * types, which are unknown/unsupported.
+ */
+ switch (type) {
+ case INTEL_RDPMC_FIXED:
counters = pmu->fixed_counters;
num_counters = pmu->nr_arch_fixed_counters;
- } else {
+ bitmask = pmu->counter_bitmask[KVM_PMC_FIXED];
+ break;
+ case INTEL_RDPMC_GP:
counters = pmu->gp_counters;
num_counters = pmu->nr_arch_gp_counters;
+ bitmask = pmu->counter_bitmask[KVM_PMC_GP];
+ break;
+ default:
+ return NULL;
}
+
+ idx &= INTEL_RDPMC_INDEX_MASK;
if (idx >= num_counters)
return NULL;
- *mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
+
+ *mask &= bitmask;
return &counters[array_index_nospec(idx, num_counters)];
}
static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
{
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
return 0;
return vcpu->arch.perf_capabilities;
@@ -210,7 +160,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT;
break;
case MSR_IA32_DS_AREA:
- ret = guest_cpuid_has(vcpu, X86_FEATURE_DS);
+ ret = guest_cpu_cap_has(vcpu, X86_FEATURE_DS);
break;
case MSR_PEBS_DATA_CFG:
perf_capabilities = vcpu_get_perf_capabilities(vcpu);
@@ -398,14 +348,14 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
- if (data & pmu->fixed_ctr_ctrl_mask)
+ if (data & pmu->fixed_ctr_ctrl_rsvd)
return 1;
if (pmu->fixed_ctr_ctrl != data)
reprogram_fixed_counters(pmu, data);
break;
case MSR_IA32_PEBS_ENABLE:
- if (data & pmu->pebs_enable_mask)
+ if (data & pmu->pebs_enable_rsvd)
return 1;
if (pmu->pebs_enable != data) {
@@ -415,13 +365,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
break;
case MSR_IA32_DS_AREA:
- if (is_noncanonical_address(data, vcpu))
+ if (is_noncanonical_msr_address(data, vcpu))
return 1;
pmu->ds_area = data;
break;
case MSR_PEBS_DATA_CFG:
- if (data & pmu->pebs_data_cfg_mask)
+ if (data & pmu->pebs_data_cfg_rsvd)
return 1;
pmu->pebs_data_cfg = data;
@@ -464,20 +414,46 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
-static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
+/*
+ * Map fixed counter events to architectural general purpose event encodings.
+ * Perf doesn't provide APIs to allow KVM to directly program a fixed counter,
+ * and so KVM instead programs the architectural event to effectively request
+ * the fixed counter. Perf isn't guaranteed to use a fixed counter and may
+ * instead program the encoding into a general purpose counter, e.g. if a
+ * different perf_event is already utilizing the requested counter, but the end
+ * result is the same (ignoring the fact that using a general purpose counter
+ * will likely exacerbate counter contention).
+ *
+ * Forcibly inlined to allow asserting on @index at build time, and there should
+ * never be more than one user.
+ */
+static __always_inline u64 intel_get_fixed_pmc_eventsel(unsigned int index)
{
- int i;
+ const enum perf_hw_id fixed_pmc_perf_ids[] = {
+ [0] = PERF_COUNT_HW_INSTRUCTIONS,
+ [1] = PERF_COUNT_HW_CPU_CYCLES,
+ [2] = PERF_COUNT_HW_REF_CPU_CYCLES,
+ };
+ u64 eventsel;
- BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_events) != KVM_PMC_MAX_FIXED);
+ BUILD_BUG_ON(ARRAY_SIZE(fixed_pmc_perf_ids) != KVM_MAX_NR_INTEL_FIXED_COUTNERS);
+ BUILD_BUG_ON(index >= KVM_MAX_NR_INTEL_FIXED_COUTNERS);
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
- int index = array_index_nospec(i, KVM_PMC_MAX_FIXED);
- struct kvm_pmc *pmc = &pmu->fixed_counters[index];
- u32 event = fixed_pmc_events[index];
+ /*
+ * Yell if perf reports support for a fixed counter but perf doesn't
+ * have a known encoding for the associated general purpose event.
+ */
+ eventsel = perf_get_hw_event_config(fixed_pmc_perf_ids[index]);
+ WARN_ON_ONCE(!eventsel && index < kvm_pmu_cap.num_counters_fixed);
+ return eventsel;
+}
- pmc->eventsel = (intel_arch_events[event].unit_mask << 8) |
- intel_arch_events[event].eventsel;
- }
+static void intel_pmu_enable_fixed_counter_bits(struct kvm_pmu *pmu, u64 bits)
+{
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
+ pmu->fixed_ctr_ctrl_rsvd &= ~intel_fixed_bits_by_idx(i, bits);
}
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
@@ -488,21 +464,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
union cpuid10_eax eax;
union cpuid10_edx edx;
u64 perf_capabilities;
- u64 counter_mask;
- int i;
-
- pmu->nr_arch_gp_counters = 0;
- pmu->nr_arch_fixed_counters = 0;
- pmu->counter_bitmask[KVM_PMC_GP] = 0;
- pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
- pmu->version = 0;
- pmu->reserved_bits = 0xffffffff00200000ull;
- pmu->raw_event_mask = X86_RAW_EVENT_MASK;
- pmu->global_ctrl_mask = ~0ull;
- pmu->global_status_mask = ~0ull;
- pmu->fixed_ctr_ctrl_mask = ~0ull;
- pmu->pebs_enable_mask = ~0ull;
- pmu->pebs_data_cfg_mask = ~0ull;
+ u64 counter_rsvd;
memset(&lbr_desc->records, 0, sizeof(lbr_desc->records));
@@ -515,8 +477,9 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
return;
entry = kvm_find_cpuid_entry(vcpu, 0xa);
- if (!entry || !vcpu->kvm->arch.enable_pmu)
+ if (!entry)
return;
+
eax.full = entry->eax;
edx.full = entry->edx;
@@ -543,25 +506,26 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
kvm_pmu_cap.bit_width_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
- setup_fixed_pmc_eventsel(pmu);
}
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
- pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
- counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
- (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED));
- pmu->global_ctrl_mask = counter_mask;
+ intel_pmu_enable_fixed_counter_bits(pmu, INTEL_FIXED_0_KERNEL |
+ INTEL_FIXED_0_USER |
+ INTEL_FIXED_0_ENABLE_PMI);
+
+ counter_rsvd = ~(((1ull << pmu->nr_arch_gp_counters) - 1) |
+ (((1ull << pmu->nr_arch_fixed_counters) - 1) << KVM_FIXED_PMC_BASE_IDX));
+ pmu->global_ctrl_rsvd = counter_rsvd;
/*
* GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET)
* share reserved bit definitions. The kernel just happens to use
* OVF_CTRL for the names.
*/
- pmu->global_status_mask = pmu->global_ctrl_mask
+ pmu->global_status_rsvd = pmu->global_ctrl_rsvd
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
if (vmx_pt_mode_is_host_guest())
- pmu->global_status_mask &=
+ pmu->global_status_rsvd &=
~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
entry = kvm_find_cpuid_entry_index(vcpu, 7, 0);
@@ -580,7 +544,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
perf_capabilities = vcpu_get_perf_capabilities(vcpu);
if (cpuid_model_is_consistent(vcpu) &&
(perf_capabilities & PMU_CAP_LBR_FMT))
- x86_perf_get_lbr(&lbr_desc->records);
+ memcpy(&lbr_desc->records, &vmx_lbr_caps, sizeof(vmx_lbr_caps));
else
lbr_desc->records.nr = 0;
@@ -589,15 +553,12 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
if (perf_capabilities & PERF_CAP_PEBS_FORMAT) {
if (perf_capabilities & PERF_CAP_PEBS_BASELINE) {
- pmu->pebs_enable_mask = counter_mask;
+ pmu->pebs_enable_rsvd = counter_rsvd;
pmu->reserved_bits &= ~ICL_EVENTSEL_ADAPTIVE;
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
- pmu->fixed_ctr_ctrl_mask &=
- ~(1ULL << (INTEL_PMC_IDX_FIXED + i * 4));
- }
- pmu->pebs_data_cfg_mask = ~0xff00000full;
+ pmu->pebs_data_cfg_rsvd = ~0xff00000full;
+ intel_pmu_enable_fixed_counter_bits(pmu, ICL_FIXED_0_ADAPTIVE);
} else {
- pmu->pebs_enable_mask =
+ pmu->pebs_enable_rsvd =
~((1ull << pmu->nr_arch_gp_counters) - 1);
}
}
@@ -609,18 +570,19 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu);
- for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) {
+ for (i = 0; i < KVM_MAX_NR_INTEL_GP_COUNTERS; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
pmu->gp_counters[i].current_config = 0;
}
- for (i = 0; i < KVM_PMC_MAX_FIXED; i++) {
+ for (i = 0; i < KVM_MAX_NR_INTEL_FIXED_COUTNERS; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
- pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
+ pmu->fixed_counters[i].idx = i + KVM_FIXED_PMC_BASE_IDX;
pmu->fixed_counters[i].current_config = 0;
+ pmu->fixed_counters[i].eventsel = intel_get_fixed_pmc_eventsel(i);
}
lbr_desc->records.nr = 0;
@@ -748,11 +710,8 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
struct kvm_pmc *pmc = NULL;
int bit, hw_idx;
- for_each_set_bit(bit, (unsigned long *)&pmu->global_ctrl,
- X86_PMC_IDX_MAX) {
- pmc = intel_pmc_idx_to_pmc(pmu, bit);
-
- if (!pmc || !pmc_speculative_in_use(pmc) ||
+ kvm_for_each_pmc(pmu, pmc, bit, (unsigned long *)&pmu->global_ctrl) {
+ if (!pmc_speculative_in_use(pmc) ||
!pmc_is_globally_enabled(pmc) || !pmc->perf_event)
continue;
@@ -767,11 +726,8 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu)
}
struct kvm_pmu_ops intel_pmu_ops __initdata = {
- .hw_event_available = intel_hw_event_available,
- .pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = intel_msr_idx_to_pmc,
- .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx,
.is_valid_msr = intel_is_valid_msr,
.get_msr = intel_pmu_get_msr,
.set_msr = intel_pmu_set_msr,
@@ -781,6 +737,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
.deliver_pmi = intel_pmu_deliver_pmi,
.cleanup = intel_pmu_cleanup,
.EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
- .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
+ .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS,
.MIN_NR_GP_COUNTERS = 1,
};
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index af662312fd07..ec08fa3caf43 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -107,7 +107,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
* handle task migration (@cpu != vcpu->cpu).
*/
new.ndst = dest;
- new.sn = 0;
+ __pi_clear_sn(&new);
/*
* Restore the notification vector; in the blocking case, the
@@ -157,7 +157,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
&per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
- WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
+ WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking");
old.control = READ_ONCE(pi_desc->control);
do {
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 26992076552e..ad9116a99bcc 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -2,97 +2,8 @@
#ifndef __KVM_X86_VMX_POSTED_INTR_H
#define __KVM_X86_VMX_POSTED_INTR_H
-#define POSTED_INTR_ON 0
-#define POSTED_INTR_SN 1
-
-#define PID_TABLE_ENTRY_VALID 1
-
-/* Posted-Interrupt Descriptor */
-struct pi_desc {
- u32 pir[8]; /* Posted interrupt requested */
- union {
- struct {
- /* bit 256 - Outstanding Notification */
- u16 on : 1,
- /* bit 257 - Suppress Notification */
- sn : 1,
- /* bit 271:258 - Reserved */
- rsvd_1 : 14;
- /* bit 279:272 - Notification Vector */
- u8 nv;
- /* bit 287:280 - Reserved */
- u8 rsvd_2;
- /* bit 319:288 - Notification Destination */
- u32 ndst;
- };
- u64 control;
- };
- u32 rsvd[6];
-} __aligned(64);
-
-static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
-{
- return test_and_set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
-{
- return test_and_clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
-{
- return test_and_clear_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
-{
- return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
-}
-
-static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
-{
- return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
-}
-
-static inline void pi_set_sn(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_set_on(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_on(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_sn(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_on(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_sn(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
+#include <linux/bitmap.h>
+#include <asm/posted_intr.h>
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
@@ -103,4 +14,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void vmx_pi_start_assignment(struct kvm *kvm);
+static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
+{
+ int vec;
+
+ vec = find_last_bit((unsigned long *)pi_desc->pir, 256);
+ return vec < 256 ? vec : -1;
+}
+
#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 6fef01e0536e..9961e07cf071 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -4,12 +4,11 @@
#include <asm/sgx.h>
-#include "cpuid.h"
+#include "x86.h"
#include "kvm_cache_regs.h"
#include "nested.h"
#include "sgx.h"
#include "vmx.h"
-#include "x86.h"
bool __read_mostly enable_sgx = 1;
module_param_named(sgx, enable_sgx, bool, 0444);
@@ -38,7 +37,7 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
fault = true;
} else if (likely(is_64_bit_mode(vcpu))) {
*gva = vmx_get_untagged_addr(vcpu, *gva, 0);
- fault = is_noncanonical_address(*gva, vcpu);
+ fault = is_noncanonical_address(*gva, vcpu, 0);
} else {
*gva &= 0xffffffff;
fault = (s.unusable) ||
@@ -123,7 +122,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
* likely than a bad userspace address.
*/
if ((trapnr == PF_VECTOR || !boot_cpu_has(X86_FEATURE_SGX2)) &&
- guest_cpuid_has(vcpu, X86_FEATURE_SGX2)) {
+ guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2)) {
memset(&ex, 0, sizeof(ex));
ex.vector = PF_VECTOR;
ex.error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
@@ -274,7 +273,7 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
* simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
* enforce restriction of access to the PROVISIONKEY.
*/
- contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL);
if (!contents)
return -ENOMEM;
@@ -366,7 +365,7 @@ static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf)
return true;
if (leaf >= EAUG && leaf <= EMODT)
- return guest_cpuid_has(vcpu, X86_FEATURE_SGX2);
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2);
return false;
}
@@ -382,8 +381,8 @@ int handle_encls(struct kvm_vcpu *vcpu)
{
u32 leaf = (u32)kvm_rax_read(vcpu);
- if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX) ||
- !guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+ if (!enable_sgx || !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) ||
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) {
kvm_queue_exception(vcpu, UD_VECTOR);
} else if (!encls_leaf_enabled_in_guest(vcpu, leaf) ||
!sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) {
@@ -480,15 +479,15 @@ void vmx_write_encls_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (!cpu_has_vmx_encls_vmexit())
return;
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX) &&
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) &&
sgx_enabled_in_guest_bios(vcpu)) {
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX1)) {
bitmap &= ~GENMASK_ULL(ETRACK, ECREATE);
if (sgx_intercept_encls_ecreate(vcpu))
bitmap |= (1 << ECREATE);
}
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX2))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2))
bitmap &= ~GENMASK_ULL(EMODT, EAUG);
/*
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 7c1996b433e2..b25625314658 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -140,6 +140,11 @@ static inline bool is_nm_fault(u32 intr_info)
return is_exception_n(intr_info, NM_VECTOR);
}
+static inline bool is_ve_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, VE_VECTOR);
+}
+
/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 01936013428b..56fd150a6f24 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -188,12 +188,13 @@ struct __packed vmcs12 {
};
/*
- * VMCS12_REVISION is an arbitrary id that should be changed if the content or
- * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
- * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ * VMCS12_REVISION is KVM's arbitrary ID for the layout of struct vmcs12. KVM
+ * enumerates this value to L1 via MSR_IA32_VMX_BASIC, and checks the revision
+ * ID during nested VMPTRLD to verify that L1 is loading a VMCS that adhere's
+ * to KVM's virtual CPU definition.
*
- * IMPORTANT: Changing this value will break save/restore compatibility with
- * older kvm releases.
+ * DO NOT change this value, as it will break save/restore compatibility with
+ * older KVM releases.
*/
#define VMCS12_REVISION 0x11e57ed0
@@ -206,7 +207,8 @@ struct __packed vmcs12 {
#define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE
/*
- * For save/restore compatibility, the vmcs12 field offsets must not change.
+ * For save/restore compatibility, the vmcs12 field offsets must not change,
+ * although appending fields and/or filling gaps is obviously allowed.
*/
#define CHECK_OFFSET(field, loc) \
ASSERT_STRUCT_OFFSET(struct vmcs12, field, loc)
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 2bfbf758d061..f6986dee6f8c 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -275,6 +275,8 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL)
call vmx_spec_ctrl_restore_host
+ CLEAR_BRANCH_HISTORY_VMEXIT
+
/* Put return value in AX */
mov %_ASM_BX, %_ASM_AX
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 88a4ff200d04..3b92f893b239 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -38,6 +38,7 @@
#include <asm/desc.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
+#include <asm/fred.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
@@ -49,6 +50,8 @@
#include <asm/spec-ctrl.h>
#include <asm/vmx.h>
+#include <trace/events/ipi.h>
+
#include "capabilities.h"
#include "cpuid.h"
#include "hyperv.h"
@@ -65,10 +68,13 @@
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
+#include "x86_ops.h"
#include "smm.h"
#include "vmx_onhyperv.h"
+#include "posted_intr.h"
MODULE_AUTHOR("Qumranet");
+MODULE_DESCRIPTION("KVM support for VMX (Intel VT-x) extensions");
MODULE_LICENSE("GPL");
#ifdef MODULE
@@ -159,7 +165,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
/*
* List of MSRs that can be directly passed to the guest.
- * In addition to these x2apic and PT MSRs are handled specially.
+ * In addition to these x2apic, PT and LBR MSRs are handled specially.
*/
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
MSR_IA32_SPEC_CTRL,
@@ -211,9 +217,13 @@ module_param(ple_window_shrink, uint, 0444);
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
-/* Default is SYSTEM mode, 1 for host-guest mode */
+/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
+#ifdef CONFIG_BROKEN
module_param(pt_mode, int, S_IRUGO);
+#endif
+
+struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
@@ -252,7 +262,7 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
return 0;
}
- if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
+ if (kvm_host.arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
return 0;
}
@@ -397,7 +407,7 @@ static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
* and VM-Exit.
*/
vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
- (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
+ (kvm_host.arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
!boot_cpu_has_bug(X86_BUG_MDS) &&
!boot_cpu_has_bug(X86_BUG_TAA);
@@ -473,10 +483,9 @@ noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
ext, vpid, gva);
}
-noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
+noinline void invept_error(unsigned long ext, u64 eptp)
{
- vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
- ext, eptp, gpa);
+ vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx\n", ext, eptp);
}
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
@@ -517,16 +526,10 @@ static const struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
-static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
-{
- vmx->segment_cache.bitmask = 0;
-}
static unsigned long host_idt_base;
#if IS_ENABLED(CONFIG_HYPERV)
-static struct kvm_x86_ops vmx_x86_ops __initdata;
-
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
@@ -576,9 +579,8 @@ static __init void hv_init_evmcs(void)
}
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_l2_tlb_flush
+ vt_x86_ops.enable_l2_tlb_flush
= hv_enable_l2_tlb_flush;
-
} else {
enlightened_vmcs = false;
}
@@ -667,25 +669,14 @@ static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
-static int possible_passthrough_msr_slot(u32 msr)
+static int vmx_get_passthrough_msr_slot(u32 msr)
{
- u32 i;
-
- for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
- if (vmx_possible_passthrough_msrs[i] == msr)
- return i;
-
- return -ENOENT;
-}
-
-static bool is_valid_passthrough_msr(u32 msr)
-{
- bool r;
+ int i;
switch (msr) {
case 0x800 ... 0x8ff:
/* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
- return true;
+ return -ENOENT;
case MSR_IA32_RTIT_STATUS:
case MSR_IA32_RTIT_OUTPUT_BASE:
case MSR_IA32_RTIT_OUTPUT_MASK:
@@ -700,14 +691,16 @@ static bool is_valid_passthrough_msr(u32 msr)
case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
/* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
- return true;
+ return -ENOENT;
}
- r = possible_passthrough_msr_slot(msr) != -ENOENT;
-
- WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
+ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
+ if (vmx_possible_passthrough_msrs[i] == msr)
+ return i;
+ }
- return r;
+ WARN(1, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
+ return -ENOENT;
}
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
@@ -759,7 +752,7 @@ fault:
return -EIO;
}
-static void vmx_emergency_disable(void)
+void vmx_emergency_disable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
@@ -878,6 +871,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
/*
+ * #VE isn't used for VMX. To test against unexpected changes
+ * related to #VE for VMX, intercept unexpected #VE and warn on it.
+ */
+ if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ eb |= 1u << VE_VECTOR;
+ /*
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
@@ -1122,12 +1121,12 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
* atomically, since it's faster than switching it manually.
*/
if (cpu_has_load_ia32_efer() ||
- (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
+ (enable_ept && ((vmx->vcpu.arch.efer ^ kvm_host.efer) & EFER_NX))) {
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
- if (guest_efer != host_efer)
+ if (guest_efer != kvm_host.efer)
add_atomic_switch_msr(vmx, MSR_EFER,
- guest_efer, host_efer, false);
+ guest_efer, kvm_host.efer, false);
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
@@ -1140,7 +1139,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx)
clear_atomic_switch_msr(vmx, MSR_EFER);
guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
+ guest_efer |= kvm_host.efer & ignore_bits;
vmx->guest_uret_msrs[i].data = guest_efer;
vmx->guest_uret_msrs[i].mask = ~ignore_bits;
@@ -1290,8 +1289,6 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
u16 fs_sel, gs_sel;
int i;
- vmx->req_immediate_exit = false;
-
/*
* Note that guest MSRs to be saved/restored can also be changed
* when guest state is loaded. This happens when guest transitions
@@ -1412,6 +1409,38 @@ static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
}
#endif
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __grow_ple_window(old, ple_window,
+ ple_window_grow,
+ ple_window_max);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned int old = vmx->ple_window;
+
+ vmx->ple_window = __shrink_ple_window(old, ple_window,
+ ple_window_shrink,
+ ple_window);
+
+ if (vmx->ple_window != old) {
+ vmx->ple_window_dirty = true;
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ vmx->ple_window, old);
+ }
+}
+
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy)
{
@@ -1483,18 +1512,17 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
*/
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
+ shrink_ple_window(vcpu);
vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
vmx_vcpu_pi_load(vcpu, cpu);
-
- vmx->host_debugctlmsr = get_debugctlmsr();
}
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);
@@ -1553,7 +1581,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmx->emulation_required = vmx_emulation_required(vcpu);
}
-static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}
@@ -1604,7 +1632,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
* result in a #GP unless the same write also clears TraceEn.
*/
if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
- ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
+ (data & RTIT_CTL_TRACEEN) &&
+ data != vmx->pt_desc.guest.ctl)
return 1;
/*
@@ -1659,8 +1688,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
/*
* Emulation of instructions in SGX enclaves is impossible as RIP does
@@ -1673,6 +1702,12 @@ static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
kvm_queue_exception(vcpu, UD_VECTOR);
return X86EMUL_PROPAGATE_FAULT;
}
+
+ /* Check that emulation is possible during event vectoring */
+ if ((to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ !kvm_can_emulate_event_vectoring(emul_type))
+ return X86EMUL_UNHANDLEABLE_VECTORING;
+
return X86EMUL_CONTINUE;
}
@@ -1744,7 +1779,7 @@ rip_updated:
* Recognizes a pending MTF VM-exit and records the nested state for later
* delivery.
*/
-static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1775,7 +1810,7 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
}
}
-static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
vmx_update_emulated_instruction(vcpu);
return skip_emulated_instruction(vcpu);
@@ -1794,7 +1829,7 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
-static void vmx_inject_exception(struct kvm_vcpu *vcpu)
+void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
@@ -1876,8 +1911,8 @@ static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));
vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
- guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(&vmx->vcpu, X86_FEATURE_RDPID));
/*
* hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
@@ -1915,12 +1950,12 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
return kvm_caps.default_tsc_scaling_ratio;
}
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}
-static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
}
@@ -1963,15 +1998,15 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
return !(msr->data & ~valid_bits);
}
-static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+int vmx_get_feature_msr(u32 msr, u64 *data)
{
- switch (msr->index) {
+ switch (msr) {
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested)
return 1;
- return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
default:
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
}
@@ -1980,7 +2015,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
@@ -2030,7 +2065,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
@@ -2046,13 +2081,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
return 1;
msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
break;
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
- if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return 1;
if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data))
@@ -2065,7 +2100,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* sanity checking and refuse to boot. Filter all unsupported
* features out.
*/
- if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
+ if (!msr_info->host_initiated && guest_cpu_cap_has_evmcs(vcpu))
nested_evmcs_filter_control_msr(vcpu, msr_info->index,
&msr_info->data);
#endif
@@ -2135,7 +2170,7 @@ static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
u64 data)
{
#ifdef CONFIG_X86_64
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
return (u32)data;
#endif
return (unsigned long)data;
@@ -2146,7 +2181,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
u64 debugctl = 0;
if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
- (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
+ (host_initiated || guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;
if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
@@ -2161,7 +2196,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
@@ -2250,9 +2285,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_MPX)))
return 1;
- if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
+ if (is_noncanonical_msr_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD))
return 1;
@@ -2352,7 +2387,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* behavior, but it's close enough.
*/
if (!msr_info->host_initiated &&
- (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+ (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC) ||
((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
!(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
return 1;
@@ -2362,7 +2397,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
- if (!guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_RTIT_CTL:
@@ -2417,7 +2452,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
if (index >= 2 * vmx->pt_desc.num_address_ranges)
return 1;
- if (is_noncanonical_address(data, vcpu))
+ if (is_noncanonical_msr_address(data, vcpu))
return 1;
if (index % 2)
vmx->pt_desc.guest.addr_b[index / 2] = data;
@@ -2425,8 +2460,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx->pt_desc.guest.addr_a[index / 2] = data;
break;
case MSR_IA32_PERF_CAPABILITIES:
- if (data && !vcpu_to_pmu(vcpu)->version)
- return 1;
if (data & PMU_CAP_LBR_FMT) {
if ((data & PMU_CAP_LBR_FMT) !=
(kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
@@ -2438,9 +2471,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((data & PERF_CAP_PEBS_MASK) !=
(kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
return 1;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DS))
return 1;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_DTES64))
return 1;
if (!cpuid_model_is_consistent(vcpu))
return 1;
@@ -2464,7 +2497,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return ret;
}
-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
unsigned long guest_owned_bits;
@@ -2518,30 +2551,6 @@ static bool cpu_has_sgx(void)
return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
}
-/*
- * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
- * can't be used due to errata where VM Exit may incorrectly clear
- * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the
- * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
- */
-static bool cpu_has_perf_global_ctrl_bug(void)
-{
- if (boot_cpu_data.x86 == 0x6) {
- switch (boot_cpu_data.x86_model) {
- case INTEL_FAM6_NEHALEM_EP: /* AAK155 */
- case INTEL_FAM6_NEHALEM: /* AAP115 */
- case INTEL_FAM6_WESTMERE: /* AAT100 */
- case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */
- case INTEL_FAM6_NEHALEM_EX: /* BA97 */
- return true;
- default:
- break;
- }
- }
-
- return false;
-}
-
static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
@@ -2572,13 +2581,13 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
struct vmx_capability *vmx_cap)
{
- u32 vmx_msr_low, vmx_msr_high;
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
u64 _cpu_based_3rd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
+ u64 basic_msr;
u64 misc_msr;
int i;
@@ -2612,6 +2621,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_cpu_based_2nd_exec_control))
return -EIO;
}
+ if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
@@ -2636,6 +2648,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmx_cap->ept = 0;
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
vmx_cap->vpid) {
@@ -2697,29 +2710,50 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
_vmexit_control &= ~x_ctrl;
}
- rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+ /*
+ * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
+ * can't be used due to an errata where VM Exit may incorrectly clear
+ * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
+ * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
+ */
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_NEHALEM_EP: /* AAK155 */
+ case INTEL_NEHALEM: /* AAP115 */
+ case INTEL_WESTMERE: /* AAT100 */
+ case INTEL_WESTMERE_EP: /* BC86,AAY89,BD102 */
+ case INTEL_NEHALEM_EX: /* BA97 */
+ _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ "does not work properly. Using workaround\n");
+ break;
+ default:
+ break;
+ }
+
+ rdmsrl(MSR_IA32_VMX_BASIC, basic_msr);
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
- if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
return -EIO;
#ifdef CONFIG_X86_64
- /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
- if (vmx_msr_high & (1u<<16))
+ /*
+ * KVM expects to be able to shove all legal physical addresses into
+ * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
+ * 0 for processors that support Intel 64 architecture".
+ */
+ if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
return -EIO;
#endif
/* Require Write-Back (WB) memory type for VMCS accesses. */
- if (((vmx_msr_high >> 18) & 15) != 6)
+ if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
return -EIO;
rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
- vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
-
- vmcs_conf->revision_id = vmx_msr_low;
-
+ vmcs_conf->basic = basic_msr;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
@@ -2765,7 +2799,7 @@ static bool kvm_is_vmx_supported(void)
return supported;
}
-static int vmx_check_processor_compat(void)
+int vmx_check_processor_compat(void)
{
int cpu = raw_smp_processor_id();
struct vmcs_config vmcs_conf;
@@ -2807,7 +2841,7 @@ fault:
return -EFAULT;
}
-static int vmx_hardware_enable(void)
+int vmx_enable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2831,9 +2865,6 @@ static int vmx_hardware_enable(void)
return r;
}
- if (enable_ept)
- ept_sync_global();
-
return 0;
}
@@ -2847,7 +2878,7 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-static void vmx_hardware_disable(void)
+void vmx_disable_virtualization_cpu(void)
{
vmclear_local_loaded_vmcss();
@@ -2869,13 +2900,13 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
if (!pages)
return NULL;
vmcs = page_address(pages);
- memset(vmcs, 0, vmcs_config.size);
+ memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
/* KVM supports Enlightened VMCS v1 only */
if (kvm_is_using_evmcs())
vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
else
- vmcs->hdr.revision_id = vmcs_config.revision_id;
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
if (shadow)
vmcs->hdr.shadow_vmcs = 1;
@@ -2968,7 +2999,7 @@ static __init int alloc_kvm_area(void)
* physical CPU.
*/
if (kvm_is_using_evmcs())
- vmcs->hdr.revision_id = vmcs_config.revision_id;
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
per_cpu(vmxarea, cpu) = vmcs;
}
@@ -3161,7 +3192,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
#endif
-static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3186,12 +3217,12 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu))
+ if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu)))
return nested_get_vpid02(vcpu);
return to_vmx(vcpu)->vpid;
}
-static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
u64 root_hpa = mmu->root.hpa;
@@ -3207,7 +3238,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
* vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
@@ -3216,7 +3247,7 @@ static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
-static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
* vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
@@ -3261,7 +3292,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
CPU_BASED_CR3_STORE_EXITING)
-static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
if (is_guest_mode(vcpu))
return nested_guest_cr0_valid(vcpu, cr0);
@@ -3382,8 +3413,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
return eptp;
}
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
- int root_level)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
@@ -3412,8 +3442,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
vmcs_writel(GUEST_CR3, guest_cr3);
}
-
-static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
/*
* We operate under the default treatment of SMM, so VMX cannot be
@@ -3529,7 +3558,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
var->g = (ar >> 15) & 1;
}
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_segment s;
@@ -3540,16 +3569,29 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
}
-int vmx_get_cpl(struct kvm_vcpu *vcpu)
+static int __vmx_get_cpl(struct kvm_vcpu *vcpu, bool no_cache)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ar;
if (unlikely(vmx->rmode.vm86_active))
return 0;
- else {
- int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
- return VMX_AR_DPL(ar);
- }
+
+ if (no_cache)
+ ar = vmcs_read32(GUEST_SS_AR_BYTES);
+ else
+ ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
+ return VMX_AR_DPL(ar);
+}
+
+int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ return __vmx_get_cpl(vcpu, false);
+}
+
+int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu)
+{
+ return __vmx_get_cpl(vcpu, true);
}
static u32 vmx_segment_access_rights(struct kvm_segment *var)
@@ -3606,14 +3648,14 @@ void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
}
-static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
__vmx_set_segment(vcpu, var, seg);
to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
}
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
@@ -3621,25 +3663,25 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
*l = (ar >> 13) & 1;
}
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
dt->address = vmcs_readl(GUEST_IDTR_BASE);
}
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
vmcs_writel(GUEST_IDTR_BASE, dt->address);
}
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
dt->address = vmcs_readl(GUEST_GDTR_BASE);
}
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
vmcs_writel(GUEST_GDTR_BASE, dt->address);
@@ -3963,6 +4005,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ int idx;
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -3972,16 +4015,13 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
/*
* Mark the desired intercept state in shadow bitmap, this is needed
* for resync when the MSR filters change.
- */
- if (is_valid_passthrough_msr(msr)) {
- int idx = possible_passthrough_msr_slot(msr);
-
- if (idx != -ENOENT) {
- if (type & MSR_TYPE_R)
- clear_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- clear_bit(idx, vmx->shadow_msr_intercept.write);
- }
+ */
+ idx = vmx_get_passthrough_msr_slot(msr);
+ if (idx >= 0) {
+ if (type & MSR_TYPE_R)
+ clear_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ clear_bit(idx, vmx->shadow_msr_intercept.write);
}
if ((type & MSR_TYPE_R) &&
@@ -4007,6 +4047,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ int idx;
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -4016,16 +4057,13 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
/*
* Mark the desired intercept state in shadow bitmap, this is needed
* for resync when the MSR filter changes.
- */
- if (is_valid_passthrough_msr(msr)) {
- int idx = possible_passthrough_msr_slot(msr);
-
- if (idx != -ENOENT) {
- if (type & MSR_TYPE_R)
- set_bit(idx, vmx->shadow_msr_intercept.read);
- if (type & MSR_TYPE_W)
- set_bit(idx, vmx->shadow_msr_intercept.write);
- }
+ */
+ idx = vmx_get_passthrough_msr_slot(msr);
+ if (idx >= 0) {
+ if (type & MSR_TYPE_R)
+ set_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ set_bit(idx, vmx->shadow_msr_intercept.write);
}
if (type & MSR_TYPE_R)
@@ -4111,31 +4149,14 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- void *vapic_page;
- u32 vppr;
- int rvi;
-
- if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
- !nested_cpu_has_vid(get_vmcs12(vcpu)) ||
- WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
- return false;
-
- rvi = vmx_get_rvi();
-
- vapic_page = vmx->nested.virtual_apic_map.hva;
- vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
-
- return ((rvi & 0xf0) > (vppr & 0xf0));
-}
-
-static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 i;
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
/*
* Redo intercept permissions for MSRs that KVM is passing through to
* the guest. Disabling interception will check the new MSR filter and
@@ -4208,6 +4229,13 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+ * and freed, and must not be accessed outside of vcpu->mutex. The
+ * vCPU's cached PI NV is valid if and only if posted interrupts
+ * enabled in its vmcs12, i.e. checking the vector also checks that
+ * L1 has enabled posted interrupts for L2.
+ */
if (is_guest_mode(vcpu) &&
vector == vmx->nested.posted_intr_nv) {
/*
@@ -4272,8 +4300,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
return 0;
}
-static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
- int trig_mode, int vector)
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
{
struct kvm_vcpu *vcpu = apic->vcpu;
@@ -4357,7 +4385,7 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
}
if (cpu_has_load_ia32_efer())
- vmcs_write64(HOST_IA32_EFER, host_efer);
+ vmcs_write64(HOST_IA32_EFER, kvm_host.efer);
}
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
@@ -4406,9 +4434,6 @@ static u32 vmx_vmentry_ctrl(void)
VM_ENTRY_LOAD_IA32_EFER |
VM_ENTRY_IA32E_MODE);
- if (cpu_has_perf_global_ctrl_bug())
- vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
-
return vmentry_ctrl;
}
@@ -4426,16 +4451,12 @@ static u32 vmx_vmexit_ctrl(void)
if (vmx_pt_mode_is_system())
vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
VM_EXIT_CLEAR_IA32_RTIT_CTL);
-
- if (cpu_has_perf_global_ctrl_bug())
- vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
-
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
return vmexit_ctrl &
~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
}
-static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4545,7 +4566,8 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
* Update the nested MSR settings so that a nested VMM can/can't set
* controls for features that are/aren't exposed to the guest.
*/
- if (nested) {
+ if (nested &&
+ kvm_check_has_quirk(vmx->vcpu.kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
/*
* All features that can be added or removed to VMX MSRs must
* be supported in the first place for nested virtualization.
@@ -4571,10 +4593,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
bool __enabled; \
\
if (cpu_has_vmx_##name()) { \
- if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \
- __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \
- else \
- __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \
+ __enabled = guest_cpu_cap_has(__vcpu, X86_FEATURE_##feat_name); \
vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
__enabled, exiting); \
} \
@@ -4601,6 +4620,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
enable_unrestricted_guest = 0;
}
if (!enable_unrestricted_guest)
@@ -4649,8 +4669,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
*/
if (cpu_has_vmx_rdtscp()) {
bool rdpid_or_rdtscp_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) ||
- guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID);
vmx_adjust_secondary_exec_control(vmx, &exec_control,
SECONDARY_EXEC_ENABLE_RDTSCP,
@@ -4699,7 +4719,7 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
return 0;
}
-static int vmx_vcpu_precreate(struct kvm *kvm)
+int vmx_vcpu_precreate(struct kvm *kvm)
{
return vmx_alloc_ipiv_pid_table(kvm);
}
@@ -4724,8 +4744,12 @@ static void init_vmcs(struct vcpu_vmx *vmx)
exec_controls_set(vmx, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls())
+ if (cpu_has_secondary_exec_ctrls()) {
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS,
+ __pa(vmx->ve_info));
+ }
if (cpu_has_tertiary_exec_ctrls())
tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
@@ -4796,7 +4820,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (enable_pml) {
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
vmx_write_encls_bitmap(&vmx->vcpu, NULL);
@@ -4830,7 +4854,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
init_vmcs(vmx);
- if (nested)
+ if (nested &&
+ kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
vcpu_setup_sgx_lepubkeyhash(vcpu);
@@ -4843,7 +4868,8 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
#endif
- vcpu->arch.microcode_version = 0x100000000ULL;
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS))
+ vcpu->arch.microcode_version = 0x100000000ULL;
vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
/*
@@ -4851,10 +4877,10 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
* or POSTED_INTR_WAKEUP_VECTOR.
*/
vmx->pi_desc.nv = POSTED_INTR_VECTOR;
- vmx->pi_desc.sn = 1;
+ __pi_set_sn(&vmx->pi_desc);
}
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4869,9 +4895,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx->hv_deadline_tsc = -1;
kvm_set_cr8(vcpu, 0);
- vmx_segment_cache_clear(vmx);
- kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
-
seg_setup(VCPU_SREG_CS);
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
@@ -4898,6 +4921,9 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_writel(GUEST_IDTR_BASE, 0);
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
+ vmx_segment_cache_clear(vmx);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
+
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
@@ -4913,12 +4939,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_update_fb_clear_dis(vcpu, vmx);
}
-static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
-static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
{
if (!enable_vnmi ||
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
@@ -4929,7 +4955,7 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
uint32_t intr;
@@ -4957,7 +4983,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
vmx_clear_hlt(vcpu);
}
-static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5035,7 +5061,7 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
GUEST_INTR_STATE_NMI));
}
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
@@ -5047,17 +5073,22 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !vmx_nmi_blocked(vcpu);
}
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
return false;
- return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+ return __vmx_interrupt_blocked(vcpu);
}
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
@@ -5072,7 +5103,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !vmx_interrupt_blocked(vcpu);
}
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
void __user *ret;
@@ -5092,7 +5123,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
return init_rmode_tss(kvm, ret);
}
-static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
return 0;
@@ -5213,6 +5244,16 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
if (is_invalid_opcode(intr_info))
return handle_ud(vcpu);
+ if (WARN_ON_ONCE(is_ve_fault(intr_info))) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+
+ WARN_ONCE(ve_info->exit_reason != EXIT_REASON_EPT_VIOLATION,
+ "Unexpected #VE on VM-Exit reason 0x%x", ve_info->exit_reason);
+ dump_vmcs(vcpu);
+ kvm_mmu_print_sptes(vcpu, ve_info->guest_physical_address, "#VE");
+ return 1;
+ }
+
error_code = 0;
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
@@ -5378,8 +5419,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
return kvm_fast_pio(vcpu, size, port, in);
}
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
/*
* Patch in the VMCALL instruction:
@@ -5575,10 +5615,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
if (exit_qualification & TYPE_MOV_FROM_DR) {
- unsigned long val;
-
- kvm_get_dr(vcpu, dr, &val);
- kvm_register_write(vcpu, reg, val);
+ kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr));
err = 0;
} else {
err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
@@ -5588,7 +5625,7 @@ out:
return kvm_complete_insn_gp(vcpu, err);
}
-static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
@@ -5607,7 +5644,13 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
-static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ lockdep_assert_irqs_disabled();
+ set_debugreg(vcpu->arch.dr6, 6);
+}
+
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
}
@@ -5777,10 +5820,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
? PFERR_PRESENT_MASK : 0;
- error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
- PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
-
- vcpu->arch.exit_qualification = exit_qualification;
+ if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+ error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
/*
* Check that the GPA doesn't exceed physical memory limits, as that is
@@ -5878,7 +5920,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1;
}
-static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
if (vmx_emulation_required_with_pending_exception(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
@@ -5888,38 +5930,6 @@ static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
return 1;
}
-static void grow_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned int old = vmx->ple_window;
-
- vmx->ple_window = __grow_ple_window(old, ple_window,
- ple_window_grow,
- ple_window_max);
-
- if (vmx->ple_window != old) {
- vmx->ple_window_dirty = true;
- trace_kvm_ple_window_update(vcpu->vcpu_id,
- vmx->ple_window, old);
- }
-}
-
-static void shrink_ple_window(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned int old = vmx->ple_window;
-
- vmx->ple_window = __shrink_ple_window(old, ple_window,
- ple_window_shrink,
- ple_window);
-
- if (vmx->ple_window != old) {
- vmx->ple_window_dirty = true;
- trace_kvm_ple_window_update(vcpu->vcpu_id,
- vmx->ple_window, old);
- }
-}
-
/*
* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -5955,7 +5965,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu)
} operand;
int gpr_index;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_INVPCID)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -6000,28 +6010,52 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
-static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
+static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!vmx->req_immediate_exit &&
- !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
- kvm_lapic_expired_hv_timer(vcpu);
+ /*
+ * In the *extremely* unlikely scenario that this is a spurious VM-Exit
+ * due to the timer expiring while it was "soft" disabled, just eat the
+ * exit and re-enter the guest.
+ */
+ if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
return EXIT_FASTPATH_REENTER_GUEST;
- }
- return EXIT_FASTPATH_NONE;
+ /*
+ * If the timer expired because KVM used it to force an immediate exit,
+ * then mission accomplished.
+ */
+ if (force_immediate_exit)
+ return EXIT_FASTPATH_EXIT_HANDLED;
+
+ /*
+ * If L2 is active, go down the slow path as emulating the guest timer
+ * expiration likely requires synthesizing a nested VM-Exit.
+ */
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
+
+ kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
}
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
- handle_fastpath_preemption_timer(vcpu);
+ /*
+ * This non-fastpath handler is reached if and only if the preemption
+ * timer was being used to emulate a guest timer while L2 is active.
+ * All other scenarios are supposed to be handled in the fastpath.
+ */
+ WARN_ON_ONCE(!is_guest_mode(vcpu));
+ kvm_lapic_expired_hv_timer(vcpu);
return 1;
}
/*
* When nested=0, all VMX instruction VM Exits filter here. The handlers
- * are overwritten by nested_vmx_setup() when nested=1.
+ * are overwritten by nested_vmx_hardware_setup() when nested=1.
*/
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
@@ -6142,9 +6176,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
- u64 *info1, u64 *info2,
- u32 *intr_info, u32 *error_code)
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6164,6 +6197,15 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
}
}
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code)
+{
+ *intr_info = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
+ else
+ *error_code = 0;
+}
+
static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
{
if (vmx->pml_pg) {
@@ -6175,32 +6217,40 @@ static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u16 pml_idx, pml_tail_index;
u64 *pml_buf;
- u16 pml_idx;
+ int i;
pml_idx = vmcs_read16(GUEST_PML_INDEX);
/* Do nothing if PML buffer is empty */
- if (pml_idx == (PML_ENTITY_NUM - 1))
+ if (pml_idx == PML_HEAD_INDEX)
return;
+ /*
+ * PML index always points to the next available PML buffer entity
+ * unless PML log has just overflowed.
+ */
+ pml_tail_index = (pml_idx >= PML_LOG_NR_ENTRIES) ? 0 : pml_idx + 1;
- /* PML index always points to next available PML buffer entity */
- if (pml_idx >= PML_ENTITY_NUM)
- pml_idx = 0;
- else
- pml_idx++;
-
+ /*
+ * PML log is written backwards: the CPU first writes the entry 511
+ * then the entry 510, and so on.
+ *
+ * Read the entries in the same order they were written, to ensure that
+ * the dirty ring is filled in the same order the CPU wrote them.
+ */
pml_buf = page_address(vmx->pml_pg);
- for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+
+ for (i = PML_HEAD_INDEX; i >= pml_tail_index; i--) {
u64 gpa;
- gpa = pml_buf[pml_idx];
+ gpa = pml_buf[i];
WARN_ON(gpa & (PAGE_SIZE - 1));
kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
}
/* reset PML index */
- vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+ vmcs_write16(GUEST_PML_INDEX, PML_HEAD_INDEX);
}
static void vmx_dump_sel(char *name, uint32_t sel)
@@ -6402,6 +6452,24 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
pr_err("Virtual processor ID = 0x%04x\n",
vmcs_read16(VIRTUAL_PROCESSOR_ID));
+ if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+ u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
+
+ /*
+ * If KVM is dumping the VMCS, then something has gone wrong
+ * already. Derefencing an address from the VMCS, which could
+ * very well be corrupted, is a terrible idea. The virtual
+ * address is known so use it.
+ */
+ pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
+ ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
+ pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
+ ve_info->exit_reason, ve_info->delivery,
+ ve_info->exit_qualification,
+ ve_info->guest_linear_address,
+ ve_info->guest_physical_address, ve_info->eptp_index);
+ }
}
/*
@@ -6498,33 +6566,15 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
return 0;
}
- /*
- * Note:
- * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
- * delivery event since it indicates guest is accessing MMIO.
- * The vm-exit can be triggered again after return to guest that
- * will cause infinite loop.
- */
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
(exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
exit_reason.basic != EXIT_REASON_PML_FULL &&
exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
- exit_reason.basic != EXIT_REASON_NOTIFY)) {
- int ndata = 3;
-
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
- vcpu->run->internal.data[0] = vectoring_info;
- vcpu->run->internal.data[1] = exit_reason.full;
- vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
- if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
- vcpu->run->internal.data[ndata++] =
- vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- }
- vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
- vcpu->run->internal.ndata = ndata;
+ exit_reason.basic != EXIT_REASON_NOTIFY &&
+ exit_reason.basic != EXIT_REASON_EPT_MISCONFIG)) {
+ kvm_prepare_event_vectoring_exit(vcpu, INVALID_GPA);
return 0;
}
@@ -6552,7 +6602,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
goto unexpected_vmexit;
-#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_MITIGATION_RETPOLINE
if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
return kvm_emulate_wrmsr(vcpu);
else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
@@ -6587,7 +6637,7 @@ unexpected_vmexit:
return 0;
}
-static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
int ret = __vmx_handle_exit(vcpu, exit_fastpath);
@@ -6627,9 +6677,10 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
bool flush_l1d;
/*
- * Clear the per-vcpu flush bit, it gets set again
- * either from vcpu_run() or from one of the unsafe
- * VMEXIT handlers.
+ * Clear the per-vcpu flush bit, it gets set again if the vCPU
+ * is reloaded, i.e. if the vCPU is scheduled out or if KVM
+ * exits to userspace, or if KVM reaches one of the unsafe
+ * VMEXIT handlers, e.g. if KVM calls into the emulator.
*/
flush_l1d = vcpu->arch.l1tf_flush_l1d;
vcpu->arch.l1tf_flush_l1d = false;
@@ -6675,7 +6726,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
: "eax", "ebx", "ecx", "edx");
}
-static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
int tpr_threshold;
@@ -6745,14 +6796,16 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
vmx_update_msr_bitmap_x2apic(vcpu);
}
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *slot;
+ struct page *refcounted_page;
unsigned long mmu_seq;
kvm_pfn_t pfn;
+ bool writable;
/* Defer reload until vmcs01 is the current VMCS. */
if (is_guest_mode(vcpu)) {
@@ -6788,37 +6841,58 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
* controls the APIC-access page memslot, and only deletes the memslot
* if APICv is permanently inhibited, i.e. the memslot won't reappear.
*/
- pfn = gfn_to_pfn_memslot(slot, gfn);
+ pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
if (is_error_noslot_pfn(pfn))
return;
read_lock(&vcpu->kvm->mmu_lock);
- if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
+ if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn))
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- read_unlock(&vcpu->kvm->mmu_lock);
- goto out;
- }
+ else
+ vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
- vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn));
- read_unlock(&vcpu->kvm->mmu_lock);
+ /*
+ * Do not pin the APIC access page in memory so that it can be freely
+ * migrated, the MMU notifier will call us again if it is migrated or
+ * swapped out. KVM backs the memslot with anonymous memory, the pfn
+ * should always point at a refcounted page (if the pfn is valid).
+ */
+ if (!WARN_ON_ONCE(!refcounted_page))
+ kvm_release_page_clean(refcounted_page);
/*
* No need for a manual TLB flush at this point, KVM has already done a
* flush if there were SPTEs pointing at the previous page.
*/
-out:
- /*
- * Do not pin apic access page in memory, the MMU notifier
- * will call us again if it is migrated or swapped out.
- */
- kvm_release_pfn_clean(pfn);
+ read_unlock(&vcpu->kvm->mmu_lock);
}
-static void vmx_hwapic_isr_update(int max_isr)
+void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
u16 status;
u8 old;
+ /*
+ * If L2 is active, defer the SVI update until vmcs01 is loaded, as SVI
+ * is only relevant for if and only if Virtual Interrupt Delivery is
+ * enabled in vmcs12, and if VID is enabled then L2 EOIs affect L2's
+ * vAPIC, not L1's vAPIC. KVM must update vmcs01 on the next nested
+ * VM-Exit, otherwise L1 with run with a stale SVI.
+ */
+ if (is_guest_mode(vcpu)) {
+ /*
+ * KVM is supposed to forward intercepted L2 EOIs to L1 if VID
+ * is enabled in vmcs12; as above, the EOIs affect L2's vAPIC.
+ * Note, userspace can stuff state while L2 is active; assert
+ * that VID is disabled if and only if the vCPU is in KVM_RUN
+ * to avoid false positives if userspace is setting APIC state.
+ */
+ WARN_ON_ONCE(vcpu->wants_to_run &&
+ nested_cpu_has_vid(get_vmcs12(vcpu)));
+ to_vmx(vcpu)->nested.update_vmcs01_hwapic_isr = true;
+ return;
+ }
+
if (max_isr == -1)
max_isr = 0;
@@ -6848,21 +6922,7 @@ static void vmx_set_rvi(int vector)
}
}
-static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
-{
- /*
- * When running L2, updating RVI is only relevant when
- * vmcs12 virtual-interrupt-delivery enabled.
- * However, it can be enabled only when L1 also
- * intercepts external-interrupts and in that case
- * we should not update vmcs02 RVI but instead intercept
- * interrupt. Therefore, do nothing when running L2.
- */
- if (!is_guest_mode(vcpu))
- vmx_set_rvi(max_irr);
-}
-
-static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
@@ -6908,7 +6968,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
return max_irr;
}
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
return;
@@ -6919,7 +6979,7 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
-static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
+void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6950,39 +7010,39 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
-static void handle_exception_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
{
- u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
-
/* if exit due to PF check for async PF */
if (is_page_fault(intr_info))
- vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
/* if exit due to NM, handle before interrupts are enabled */
else if (is_nm_fault(intr_info))
- handle_nm_fault_irqoff(&vmx->vcpu);
+ handle_nm_fault_irqoff(vcpu);
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
}
-static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
+ u32 intr_info)
{
- u32 intr_info = vmx_get_intr_info(vcpu);
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
- gate_desc *desc = (gate_desc *)host_idt_base + vector;
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
"unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
- vmx_do_interrupt_irqoff(gate_offset(desc));
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
+ else
+ vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base + vector));
kvm_after_interrupt(vcpu);
vcpu->arch.at_instruction_boundary = true;
}
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6990,16 +7050,16 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
return;
if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
- handle_external_interrupt_irqoff(vcpu);
+ handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
- handle_exception_irqoff(vmx);
+ handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
}
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
*/
-static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
@@ -7122,7 +7182,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
IDT_VECTORING_ERROR_CODE);
}
-static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
__vmx_complete_interrupts(vcpu,
vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
@@ -7155,13 +7215,13 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host, false);
}
-static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl;
u32 delta_tsc;
- if (vmx->req_immediate_exit) {
+ if (force_immediate_exit) {
vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
vmx->loaded_vmcs->hv_timer_soft_disabled = false;
} else if (vmx->hv_deadline_tsc != -1) {
@@ -7214,13 +7274,24 @@ void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
barrier_nospec();
}
-static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
+ bool force_immediate_exit)
{
+ /*
+ * If L2 is active, some VMX preemption timer exits can be handled in
+ * the fastpath even, all other exits must use the slow path.
+ */
+ if (is_guest_mode(vcpu) &&
+ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
+ return EXIT_FASTPATH_NONE;
+
switch (to_vmx(vcpu)->exit_reason.basic) {
case EXIT_REASON_MSR_WRITE:
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
- return handle_fastpath_preemption_timer(vcpu);
+ return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+ case EXIT_REASON_HLT:
+ return handle_fastpath_hlt(vcpu);
default:
return EXIT_FASTPATH_NONE;
}
@@ -7272,7 +7343,10 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
is_nmi(vmx_get_intr_info(vcpu))) {
kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
- vmx_do_nmi_irqoff();
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
+ else
+ vmx_do_nmi_irqoff();
kvm_after_interrupt(vcpu);
}
@@ -7280,7 +7354,7 @@ out:
guest_state_exit_irqoff();
}
-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7307,7 +7381,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_NONE;
}
- trace_kvm_entry(vcpu);
+ trace_kvm_entry(vcpu, force_immediate_exit);
if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
@@ -7345,10 +7419,6 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
- if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
- set_debugreg(vcpu->arch.dr6, 6);
-
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -7366,7 +7436,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_passthrough_lbr_msrs(vcpu);
if (enable_preemption_timer)
- vmx_update_hv_timer(vcpu);
+ vmx_update_hv_timer(vcpu, force_immediate_exit);
+ else if (force_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
kvm_wait_lapic_expire(vcpu);
@@ -7382,8 +7454,8 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
}
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
- if (vmx->host_debugctlmsr)
- update_debugctlmsr(vmx->host_debugctlmsr);
+ if (vcpu->arch.host_debugctl)
+ update_debugctlmsr(vcpu->arch.host_debugctl);
#ifndef CONFIG_X86_64
/*
@@ -7430,13 +7502,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
- if (is_guest_mode(vcpu))
- return EXIT_FASTPATH_NONE;
-
- return vmx_exit_handlers_fastpath(vcpu);
+ return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
-static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
+void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7445,9 +7514,10 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
free_vpid(vmx->vpid);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
+ free_page((unsigned long)vmx->ve_info);
}
-static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
+int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
@@ -7538,6 +7608,20 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
goto free_vmcs;
}
+ err = -ENOMEM;
+ if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct page *page;
+
+ BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
+
+ /* ve_info must be page aligned. */
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!page)
+ goto free_vmcs;
+
+ vmx->ve_info = page_to_virt(page);
+ }
+
if (vmx_can_use_ipiv(vcpu))
WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
__pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
@@ -7556,7 +7640,7 @@ free_vpid:
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
-static int vmx_vm_init(struct kvm *kvm)
+int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
kvm->arch.pause_in_guest = true;
@@ -7587,41 +7671,25 @@ static int vmx_vm_init(struct kvm *kvm)
return 0;
}
-static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
- * memory aliases with conflicting memory types and sometimes MCEs.
- * We have to be careful as to what are honored and when.
- *
- * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
- * UC. The effective memory type is UC or WC depending on guest PAT.
- * This was historically the source of MCEs and we want to be
- * conservative.
- *
- * When there is no need to deal with noncoherent DMA (e.g., no VT-d
- * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
- * EPT memory type is set to WB. The effective memory type is forced
- * WB.
- *
- * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The
- * EPT memory type is used to emulate guest CD/MTRR.
+ /*
+ * Force UC for host MMIO regions, as allowing the guest to access MMIO
+ * with cacheable accesses will result in Machine Checks.
*/
-
if (is_mmio)
return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
+ /*
+ * Force WB and ignore guest PAT if the VM does NOT have a non-coherent
+ * device attached. Letting the guest control memory types on Intel
+ * CPUs may result in unexpected behavior, and so KVM's ABI is to trust
+ * the guest to behave only as a last resort.
+ */
if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
- if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
- if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
- return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
- else
- return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) |
- VMX_EPT_IPAT_BIT;
- }
-
- return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT);
}
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
@@ -7759,7 +7827,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
-static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7768,12 +7836,8 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
* to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
* set if and only if XSAVE is supported.
*/
- if (boot_cpu_has(X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
-
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
- kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM);
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVE))
+ guest_cpu_cap_clear(vcpu, X86_FEATURE_XSAVES);
vmx_setup_uret_msrs(vmx);
@@ -7781,7 +7845,7 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmcs_set_secondary_exec_control(vmx,
vmx_secondary_exec_control(vmx));
- if (guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
@@ -7790,25 +7854,25 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
- if (guest_can_use(vcpu, X86_FEATURE_VMX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
nested_vmx_cr_fixed1_bits_update(vcpu);
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
- guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
+ guest_cpu_cap_has(vcpu, X86_FEATURE_INTEL_PT))
update_intel_pt_cfg(vcpu);
if (boot_cpu_has(X86_FEATURE_RTM)) {
struct vmx_uret_msr *msr;
msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (msr) {
- bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
+ bool enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_RTM);
vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
}
}
if (kvm_cpu_cap_has(X86_FEATURE_XFD))
vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
- !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD));
if (boot_cpu_has(X86_FEATURE_IBPB))
vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
@@ -7816,17 +7880,17 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
- !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D));
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX))
vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED;
else
vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
- if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX_LC))
vmx->msr_ia32_feature_control_valid_bits |=
FEAT_CTL_SGX_LC_ENABLED;
else
@@ -7837,10 +7901,9 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vmx_update_exception_bitmap(vcpu);
}
-static u64 vmx_get_perf_capabilities(void)
+static __init u64 vmx_get_perf_capabilities(void)
{
u64 perf_cap = PMU_CAP_FW_WRITES;
- struct x86_pmu_lbr lbr;
u64 host_perf_cap = 0;
if (!enable_pmu)
@@ -7850,15 +7913,43 @@ static u64 vmx_get_perf_capabilities(void)
rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
- x86_perf_get_lbr(&lbr);
- if (lbr.nr)
+ x86_perf_get_lbr(&vmx_lbr_caps);
+
+ /*
+ * KVM requires LBR callstack support, as the overhead due to
+ * context switching LBRs without said support is too high.
+ * See intel_pmu_create_guest_lbr_event() for more info.
+ */
+ if (!vmx_lbr_caps.has_callstack)
+ memset(&vmx_lbr_caps, 0, sizeof(vmx_lbr_caps));
+ else if (vmx_lbr_caps.nr)
perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT;
}
if (vmx_pebs_supported()) {
perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK;
- if ((perf_cap & PERF_CAP_PEBS_FORMAT) < 4)
- perf_cap &= ~PERF_CAP_PEBS_BASELINE;
+
+ /*
+ * Disallow adaptive PEBS as it is functionally broken, can be
+ * used by the guest to read *host* LBRs, and can be used to
+ * bypass userspace event filters. To correctly and safely
+ * support adaptive PEBS, KVM needs to:
+ *
+ * 1. Account for the ADAPTIVE flag when (re)programming fixed
+ * counters.
+ *
+ * 2. Gain support from perf (or take direct control of counter
+ * programming) to support events without adaptive PEBS
+ * enabled for the hardware counter.
+ *
+ * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
+ * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
+ *
+ * 4. Document which PMU events are effectively exposed to the
+ * guest via adaptive PEBS, and make adaptive PEBS mutually
+ * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
+ */
+ perf_cap &= ~PERF_CAP_PEBS_BASELINE;
}
return perf_cap;
@@ -7893,6 +7984,7 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
kvm_cpu_cap_clear(X86_FEATURE_SGX1);
kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
}
if (vmx_umip_emulated())
@@ -7913,11 +8005,6 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}
-static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
-{
- to_vmx(vcpu)->req_immediate_exit = true;
-}
-
static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info)
{
@@ -7952,10 +8039,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
}
-static int vmx_check_intercept(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info,
- enum x86_intercept_stage stage,
- struct x86_exception *exception)
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -8035,8 +8122,8 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
return 0;
}
-static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
- bool *expired)
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
{
struct vcpu_vmx *vmx;
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
@@ -8075,18 +8162,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
return 0;
}
-static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif
-static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
-{
- if (!kvm_pause_in_guest(vcpu->kvm))
- shrink_ple_window(vcpu);
-}
-
void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8110,7 +8191,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
-static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
@@ -8121,7 +8202,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
}
#ifdef CONFIG_KVM_SMM
-static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
/* we need a nested vmexit to enter SMM, postpone if run is pending */
if (to_vmx(vcpu)->nested.nested_run_pending)
@@ -8129,7 +8210,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !is_smm(vcpu);
}
-static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8150,7 +8231,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
return 0;
}
-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -8171,18 +8252,18 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
return 0;
}
-static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
#endif
-static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
}
-static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
+void vmx_migrate_timers(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
@@ -8192,7 +8273,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
}
}
-static void vmx_hardware_unsetup(void)
+void vmx_hardware_unsetup(void)
{
kvm_set_posted_intr_wakeup_handler(NULL);
@@ -8202,18 +8283,7 @@ static void vmx_hardware_unsetup(void)
free_kvm_area();
}
-#define VMX_REQUIRED_APICV_INHIBITS \
-( \
- BIT(APICV_INHIBIT_REASON_DISABLE)| \
- BIT(APICV_INHIBIT_REASON_ABSENT) | \
- BIT(APICV_INHIBIT_REASON_HYPERV) | \
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
- BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
-)
-
-static void vmx_vm_destroy(struct kvm *kvm)
+void vmx_vm_destroy(struct kvm *kvm)
{
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
@@ -8264,150 +8334,6 @@ gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags
return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
}
-static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .name = KBUILD_MODNAME,
-
- .check_processor_compatibility = vmx_check_processor_compat,
-
- .hardware_unsetup = vmx_hardware_unsetup,
-
- .hardware_enable = vmx_hardware_enable,
- .hardware_disable = vmx_hardware_disable,
- .has_emulated_msr = vmx_has_emulated_msr,
-
- .vm_size = sizeof(struct kvm_vmx),
- .vm_init = vmx_vm_init,
- .vm_destroy = vmx_vm_destroy,
-
- .vcpu_precreate = vmx_vcpu_precreate,
- .vcpu_create = vmx_vcpu_create,
- .vcpu_free = vmx_vcpu_free,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .update_exception_bitmap = vmx_update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .is_valid_cr0 = vmx_is_valid_cr0,
- .set_cr0 = vmx_set_cr0,
- .is_valid_cr4 = vmx_is_valid_cr4,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .set_dr7 = vmx_set_dr7,
- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
- .get_if_flag = vmx_get_if_flag,
-
- .flush_tlb_all = vmx_flush_tlb_all,
- .flush_tlb_current = vmx_flush_tlb_current,
- .flush_tlb_gva = vmx_flush_tlb_gva,
- .flush_tlb_guest = vmx_flush_tlb_guest,
-
- .vcpu_pre_run = vmx_vcpu_pre_run,
- .vcpu_run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = vmx_skip_emulated_instruction,
- .update_emulated_instruction = vmx_update_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .inject_irq = vmx_inject_irq,
- .inject_nmi = vmx_inject_nmi,
- .inject_exception = vmx_inject_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = vmx_enable_nmi_window,
- .enable_irq_window = vmx_enable_irq_window,
- .update_cr8_intercept = vmx_update_cr8_intercept,
- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vmx_load_eoi_exitmap,
- .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
- .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
- .hwapic_irr_update = vmx_hwapic_irr_update,
- .hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
- .sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_interrupt = vmx_deliver_interrupt,
- .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
-
- .set_tss_addr = vmx_set_tss_addr,
- .set_identity_map_addr = vmx_set_identity_map_addr,
- .get_mt_mask = vmx_get_mt_mask,
-
- .get_exit_info = vmx_get_exit_info,
-
- .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
- .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
- .write_tsc_offset = vmx_write_tsc_offset,
- .write_tsc_multiplier = vmx_write_tsc_multiplier,
-
- .load_mmu_pgd = vmx_load_mmu_pgd,
-
- .check_intercept = vmx_check_intercept,
- .handle_exit_irqoff = vmx_handle_exit_irqoff,
-
- .request_immediate_exit = vmx_request_immediate_exit,
-
- .sched_in = vmx_sched_in,
-
- .cpu_dirty_log_size = PML_ENTITY_NUM,
- .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
-
- .nested_ops = &vmx_nested_ops,
-
- .pi_update_irte = vmx_pi_update_irte,
- .pi_start_assignment = vmx_pi_start_assignment,
-
-#ifdef CONFIG_X86_64
- .set_hv_timer = vmx_set_hv_timer,
- .cancel_hv_timer = vmx_cancel_hv_timer,
-#endif
-
- .setup_mce = vmx_setup_mce,
-
-#ifdef CONFIG_KVM_SMM
- .smi_allowed = vmx_smi_allowed,
- .enter_smm = vmx_enter_smm,
- .leave_smm = vmx_leave_smm,
- .enable_smi_window = vmx_enable_smi_window,
-#endif
-
- .check_emulate_instruction = vmx_check_emulate_instruction,
- .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
- .migrate_timers = vmx_migrate_timers,
-
- .msr_filter_changed = vmx_msr_filter_changed,
- .complete_emulated_msr = kvm_complete_insn_gp,
-
- .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
-
- .get_untagged_addr = vmx_get_untagged_addr,
-};
-
static unsigned int vmx_handle_intel_pt_intr(void)
{
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
@@ -8453,18 +8379,16 @@ static void __init vmx_setup_me_spte_mask(void)
u64 me_mask = 0;
/*
- * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use
- * the former to avoid exposing shadow_phys_bits.
- *
* On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
- * shadow_phys_bits. On MKTME and/or TDX capable systems,
+ * kvm_host.maxphyaddr. On MKTME and/or TDX capable systems,
* boot_cpu_data.x86_phys_bits holds the actual physical address
- * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR
- * reported by CPUID. Those bits between are KeyID bits.
+ * w/o the KeyID bits, and kvm_host.maxphyaddr equals to
+ * MAXPHYADDR reported by CPUID. Those bits between are KeyID bits.
*/
- if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
+ if (boot_cpu_data.x86_phys_bits != kvm_host.maxphyaddr)
me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits,
- kvm_get_shadow_phys_bits() - 1);
+ kvm_host.maxphyaddr - 1);
+
/*
* Unlike SME, host kernel doesn't support setting up any
* MKTME KeyID on Intel platforms. No memory encryption
@@ -8473,9 +8397,7 @@ static void __init vmx_setup_me_spte_mask(void)
kvm_mmu_set_me_spte_mask(0, me_mask);
}
-static struct kvm_x86_init_ops vmx_init_ops __initdata;
-
-static __init int hardware_setup(void)
+__init int vmx_hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
@@ -8489,10 +8411,6 @@ static __init int hardware_setup(void)
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO;
- if (cpu_has_perf_global_ctrl_bug())
- pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
- "does not work properly. Using workaround\n");
-
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
@@ -8544,16 +8462,16 @@ static __init int hardware_setup(void)
* using the APIC_ACCESS_ADDR VMCS field.
*/
if (!flexpriority_enabled)
- vmx_x86_ops.set_apic_access_page_addr = NULL;
+ vt_x86_ops.set_apic_access_page_addr = NULL;
if (!cpu_has_vmx_tpr_shadow())
- vmx_x86_ops.update_cr8_intercept = NULL;
+ vt_x86_ops.update_cr8_intercept = NULL;
#if IS_ENABLED(CONFIG_HYPERV)
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
&& enable_ept) {
- vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
- vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
+ vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
#endif
@@ -8568,7 +8486,7 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_apicv())
enable_apicv = 0;
if (!enable_apicv)
- vmx_x86_ops.sync_pir_to_irr = NULL;
+ vt_x86_ops.sync_pir_to_irr = NULL;
if (!enable_apicv || !cpu_has_vmx_ipiv())
enable_ipiv = false;
@@ -8604,7 +8522,7 @@ static __init int hardware_setup(void)
enable_pml = 0;
if (!enable_pml)
- vmx_x86_ops.cpu_dirty_log_size = 0;
+ vt_x86_ops.cpu_dirty_log_size = 0;
if (!cpu_has_vmx_preemption_timer())
enable_preemption_timer = false;
@@ -8613,7 +8531,7 @@ static __init int hardware_setup(void)
u64 use_timer_freq = 5000ULL * 1000 * 1000;
cpu_preemption_timer_multi =
- vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ vmx_misc_preemption_timer_rate(vmcs_config.misc);
if (tsc_khz)
use_timer_freq = (u64)tsc_khz * 1000;
@@ -8629,9 +8547,8 @@ static __init int hardware_setup(void)
}
if (!enable_preemption_timer) {
- vmx_x86_ops.set_hv_timer = NULL;
- vmx_x86_ops.cancel_hv_timer = NULL;
- vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
+ vt_x86_ops.set_hv_timer = NULL;
+ vt_x86_ops.cancel_hv_timer = NULL;
}
kvm_caps.supported_mce_cap |= MCG_LMCE_P;
@@ -8642,9 +8559,9 @@ static __init int hardware_setup(void)
if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
if (pt_mode == PT_MODE_HOST_GUEST)
- vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
else
- vmx_init_ops.handle_intel_pt_intr = NULL;
+ vt_init_ops.handle_intel_pt_intr = NULL;
setup_default_sgx_lepubkeyhash();
@@ -8667,14 +8584,6 @@ static __init int hardware_setup(void)
return r;
}
-static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- .hardware_setup = hardware_setup,
- .handle_intel_pt_intr = NULL,
-
- .runtime_ops = &vmx_x86_ops,
- .pmu_ops = &intel_pmu_ops,
-};
-
static void vmx_cleanup_l1d_flush(void)
{
if (vmx_l1d_flush_pages) {
@@ -8689,17 +8598,15 @@ static void __vmx_exit(void)
{
allow_smaller_maxphyaddr = false;
- cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
-
vmx_cleanup_l1d_flush();
}
-static void vmx_exit(void)
+static void __exit vmx_exit(void)
{
kvm_exit();
+ __vmx_exit();
kvm_x86_vendor_exit();
- __vmx_exit();
}
module_exit(vmx_exit);
@@ -8716,7 +8623,7 @@ static int __init vmx_init(void)
*/
hv_init_evmcs();
- r = kvm_x86_vendor_init(&vmx_init_ops);
+ r = kvm_x86_vendor_init(&vt_init_ops);
if (r)
return r;
@@ -8737,8 +8644,6 @@ static int __init vmx_init(void)
pi_init_cpu(cpu);
}
- cpu_emergency_register_virt_callback(vmx_emergency_disable);
-
vmx_check_vmcs12_offsets();
/*
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index e3b0985bb74a..951e44dc9d0e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -7,18 +7,15 @@
#include <asm/kvm.h>
#include <asm/intel_pt.h>
#include <asm/perf_event.h>
+#include <asm/posted_intr.h>
#include "capabilities.h"
#include "../kvm_cache_regs.h"
-#include "posted_intr.h"
#include "vmcs.h"
#include "vmx_ops.h"
#include "../cpuid.h"
#include "run_flags.h"
-
-#define MSR_TYPE_R 1
-#define MSR_TYPE_W 2
-#define MSR_TYPE_RW 3
+#include "../mmu.h"
#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
@@ -109,6 +106,8 @@ struct lbr_desc {
bool msr_passthrough;
};
+extern struct x86_pmu_lbr vmx_lbr_caps;
+
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -177,6 +176,7 @@ struct nested_vmx {
bool reload_vmcs01_apic_access_page;
bool update_vmcs01_cpu_dirty_logging;
bool update_vmcs01_apicv_status;
+ bool update_vmcs01_hwapic_isr;
/*
* Enlightened VMCS has been enabled. It does not mean that L1 has to
@@ -201,8 +201,6 @@ struct nested_vmx {
struct kvm_host_map virtual_apic_map;
struct kvm_host_map pi_desc_map;
- struct kvm_host_map msr_bitmap_map;
-
struct pi_desc *pi_desc;
bool pi_pending;
u16 posted_intr_nv;
@@ -332,17 +330,16 @@ struct vcpu_vmx {
unsigned int ple_window;
bool ple_window_dirty;
- bool req_immediate_exit;
-
/* Support for PML */
-#define PML_ENTITY_NUM 512
+#define PML_LOG_NR_ENTRIES 512
+ /* PML is written backwards: this is the first entry written by the CPU */
+#define PML_HEAD_INDEX (PML_LOG_NR_ENTRIES-1)
+
struct page *pml_pg;
/* apic deadline value in host tsc */
u64 hv_deadline_tsc;
- unsigned long host_debugctlmsr;
-
/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
* msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
@@ -364,6 +361,9 @@ struct vcpu_vmx {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
} shadow_msr_intercept;
+
+ /* ve_info must be page aligned. */
+ struct vmx_ve_information *ve_info;
};
struct kvm_vmx {
@@ -385,6 +385,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
unsigned long fs_base, unsigned long gs_base);
int vmx_get_cpl(struct kvm_vcpu *vcpu);
+int vmx_get_cpl_no_cache(struct kvm_vcpu *vcpu);
bool vmx_emulation_required(struct kvm_vcpu *vcpu);
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -402,6 +403,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
+bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
@@ -576,7 +578,8 @@ static inline u8 vmx_get_rvi(void)
SECONDARY_EXEC_ENABLE_VMFUNC | \
SECONDARY_EXEC_BUS_LOCK_DETECTION | \
SECONDARY_EXEC_NOTIFY_VM_EXITING | \
- SECONDARY_EXEC_ENCLS_EXITING)
+ SECONDARY_EXEC_ENCLS_EXITING | \
+ SECONDARY_EXEC_EPT_VIOLATION_VE)
#define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \
@@ -721,7 +724,8 @@ static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
if (!enable_ept)
return true;
- return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
+ return allow_smaller_maxphyaddr &&
+ cpuid_maxphyaddr(vcpu) < kvm_host.maxphyaddr;
}
static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
@@ -749,4 +753,9 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && enable_ipiv;
}
+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h
index eb48153bfd73..cdf8cbb69209 100644
--- a/arch/x86/kvm/vmx/vmx_onhyperv.h
+++ b/arch/x86/kvm/vmx/vmx_onhyperv.h
@@ -3,7 +3,7 @@
#ifndef __ARCH_X86_KVM_VMX_ONHYPERV_H__
#define __ARCH_X86_KVM_VMX_ONHYPERV_H__
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
#include <asm/mshyperv.h>
#include <linux/jump_label.h>
@@ -104,6 +104,14 @@ static inline void evmcs_load(u64 phys_addr)
struct hv_vp_assist_page *vp_ap =
hv_get_vp_assist_page(smp_processor_id());
+ /*
+ * When enabling eVMCS, KVM verifies that every CPU has a valid hv_vp_assist_page()
+ * and aborts enabling the feature otherwise. CPU onlining path is also checked in
+ * vmx_hardware_enable().
+ */
+ if (KVM_BUG_ON(!vp_ap, kvm_get_running_vcpu()->kvm))
+ return;
+
if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
vp_ap->nested_control.features.directhypercall = 1;
vp_ap->current_nested_vmcs = phys_addr;
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 8060e5fc6dbd..633c87e2fd92 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -15,7 +15,7 @@ void vmwrite_error(unsigned long field, unsigned long value);
void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
void invvpid_error(unsigned long ext, u16 vpid, gva_t gva);
-void invept_error(unsigned long ext, u64 eptp, gpa_t gpa);
+void invept_error(unsigned long ext, u64 eptp);
#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/*
@@ -47,7 +47,7 @@ static __always_inline void vmcs_check16(unsigned long field)
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
"16-bit accessor invalid for 64-bit high field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
- "16-bit accessor invalid for 32-bit high field");
+ "16-bit accessor invalid for 32-bit field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
"16-bit accessor invalid for natural width field");
}
@@ -312,13 +312,13 @@ static inline void __invvpid(unsigned long ext, u16 vpid, gva_t gva)
vmx_asm2(invvpid, "r"(ext), "m"(operand), ext, vpid, gva);
}
-static inline void __invept(unsigned long ext, u64 eptp, gpa_t gpa)
+static inline void __invept(unsigned long ext, u64 eptp)
{
struct {
- u64 eptp, gpa;
- } operand = {eptp, gpa};
-
- vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
+ u64 eptp;
+ u64 reserved_0;
+ } operand = { eptp, 0 };
+ vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp);
}
static inline void vpid_sync_vcpu_single(int vpid)
@@ -355,13 +355,13 @@ static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr)
static inline void ept_sync_global(void)
{
- __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0);
}
static inline void ept_sync_context(u64 eptp)
{
if (cpu_has_vmx_invept_context())
- __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp);
else
ept_sync_global();
}
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
new file mode 100644
index 000000000000..430773a5ef8e
--- /dev/null
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_X86_OPS_H
+#define __KVM_X86_VMX_X86_OPS_H
+
+#include <linux/kvm_host.h>
+
+#include "x86.h"
+
+__init int vmx_hardware_setup(void);
+
+extern struct kvm_x86_ops vt_x86_ops __initdata;
+extern struct kvm_x86_init_ops vt_init_ops __initdata;
+
+void vmx_hardware_unsetup(void);
+int vmx_check_processor_compat(void);
+int vmx_enable_virtualization_cpu(void);
+void vmx_disable_virtualization_cpu(void);
+void vmx_emergency_disable_virtualization_cpu(void);
+int vmx_vm_init(struct kvm *kvm);
+void vmx_vm_destroy(struct kvm *kvm);
+int vmx_vcpu_precreate(struct kvm *kvm);
+int vmx_vcpu_create(struct kvm_vcpu *vcpu);
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu);
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+void vmx_vcpu_free(struct kvm_vcpu *vcpu);
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_put(struct kvm_vcpu *vcpu);
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath);
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu);
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu);
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+#ifdef CONFIG_KVM_SMM
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram);
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu);
+#endif
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception);
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu);
+void vmx_migrate_timers(struct kvm_vcpu *vcpu);
+void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
+void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu);
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector);
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index);
+void vmx_msr_filter_changed(struct kvm_vcpu *vcpu);
+void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
+int vmx_get_feature_msr(u32 msr, u64 *data);
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
+void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int vmx_get_cpl(struct kvm_vcpu *vcpu);
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val);
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
+void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr);
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu);
+void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
+u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall);
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected);
+void vmx_inject_nmi(struct kvm_vcpu *vcpu);
+void vmx_inject_exception(struct kvm_vcpu *vcpu);
+void vmx_cancel_injection(struct kvm_vcpu *vcpu);
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu);
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu);
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr);
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu);
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr);
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
+void vmx_get_entry_info(struct kvm_vcpu *vcpu, u32 *intr_info, u32 *error_code);
+
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu);
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_X86_64
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired);
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
+#endif
+void vmx_setup_mce(struct kvm_vcpu *vcpu);
+
+#endif /* __KVM_X86_VMX_X86_OPS_H */