summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2024-09-14 09:56:06 -0400
committerPaolo Bonzini <pbonzini@redhat.com>2024-09-17 12:41:23 -0400
commit3f8df6285271d9d8f17d733433e5213a63b83a0b (patch)
tree38010518e32047a3d798b4c3cd226810e0c8ab4b /arch/x86/kvm/vmx
parent55e6f8f29d6ac76126ad1c8000b4c3626cf4b176 (diff)
parentf3009482512eb057e7161214a068c6bd7bae83a4 (diff)
Merge tag 'kvm-x86-vmx-6.12' of https://github.com/kvm-x86/linux into HEAD
KVM VMX changes for 6.12: - Set FINAL/PAGE in the page fault error code for EPT Violations if and only if the GVA is valid. If the GVA is NOT valid, there is no guest-side page table walk and so stuffing paging related metadata is nonsensical. - Fix a bug where KVM would incorrectly synthesize a nested VM-Exit instead of emulating posted interrupt delivery to L2. - Add a lockdep assertion to detect unsafe accesses of vmcs12 structures. - Harden eVMCS loading against an impossible NULL pointer deref (really truly should be impossible). - Minor SGX fix and a cleanup.
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r--arch/x86/kvm/vmx/nested.c72
-rw-r--r--arch/x86/kvm/vmx/nested.h6
-rw-r--r--arch/x86/kvm/vmx/sgx.c2
-rw-r--r--arch/x86/kvm/vmx/vmx.c17
-rw-r--r--arch/x86/kvm/vmx/vmx.h5
-rw-r--r--arch/x86/kvm/vmx/vmx_onhyperv.h8
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h2
7 files changed, 86 insertions, 26 deletions
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 97ecc2722c8f..a8e7bc04d9bf 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -981,7 +981,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
__func__, i, e.index, e.reserved);
goto fail;
}
- if (kvm_set_msr(vcpu, e.index, e.value)) {
+ if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
pr_debug_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
__func__, i, e.index, e.value);
@@ -1017,7 +1017,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
}
}
- if (kvm_get_msr(vcpu, msr_index, data)) {
+ if (kvm_get_msr_with_filter(vcpu, msr_index, data)) {
pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
msr_index);
return false;
@@ -1112,9 +1112,9 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
/*
* Emulated VMEntry does not fail here. Instead a less
* accurate value will be returned by
- * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
- * instead of reading the value from the vmcs02 VMExit
- * MSR-store area.
+ * nested_vmx_get_vmexit_msr_value() by reading KVM's
+ * internal MSR state instead of reading the value from
+ * the vmcs02 VMExit MSR-store area.
*/
pr_warn_ratelimited(
"Not enough msr entries in msr_autostore. Can't add msr %x\n",
@@ -2341,10 +2341,12 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
/* Posted interrupts setting is only taken from vmcs12. */
vmx->nested.pi_pending = false;
- if (nested_cpu_has_posted_intr(vmcs12))
+ if (nested_cpu_has_posted_intr(vmcs12)) {
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- else
+ } else {
+ vmx->nested.posted_intr_nv = -1;
exec_control &= ~PIN_BASED_POSTED_INTR;
+ }
pin_controls_set(vmx, exec_control);
/*
@@ -2494,6 +2496,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
@@ -2531,7 +2534,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
- vmx->segment_cache.bitmask = 0;
+ vmx_segment_cache_clear(vmx);
}
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -4308,11 +4311,52 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
}
if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
+ int irq;
+
if (block_nested_events)
return -EBUSY;
if (!nested_exit_on_intr(vcpu))
goto no_vmexit;
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+
+ if (!nested_exit_intr_ack_set(vcpu)) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+ return 0;
+ }
+
+ irq = kvm_cpu_get_extint(vcpu);
+ if (irq != -1) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+ return 0;
+ }
+
+ irq = kvm_apic_has_interrupt(vcpu);
+ if (WARN_ON_ONCE(irq < 0))
+ goto no_vmexit;
+
+ /*
+ * If the IRQ is L2's PI notification vector, process posted
+ * interrupts for L2 instead of injecting VM-Exit, as the
+ * detection/morphing architecturally occurs when the IRQ is
+ * delivered to the CPU. Note, only interrupts that are routed
+ * through the local APIC trigger posted interrupt processing,
+ * and enabling posted interrupts requires ACK-on-exit.
+ */
+ if (irq == vmx->nested.posted_intr_nv) {
+ vmx->nested.pi_pending = true;
+ kvm_apic_clear_irr(vcpu, irq);
+ goto no_vmexit;
+ }
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+
+ /*
+ * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
+ * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
+ * if APICv is active.
+ */
+ kvm_apic_ack_interrupt(vcpu, irq);
return 0;
}
@@ -4830,7 +4874,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
goto vmabort;
}
- if (kvm_set_msr(vcpu, h.index, h.value)) {
+ if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) {
pr_debug_ratelimited(
"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
__func__, j, h.index, h.value);
@@ -4993,14 +5037,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (likely(!vmx->fail)) {
- if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
- nested_exit_intr_ack_set(vcpu)) {
- int irq = kvm_cpu_get_interrupt(vcpu);
- WARN_ON(irq < 0);
- vmcs12->vm_exit_intr_info = irq |
- INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
- }
-
if (vm_exit_reason != -1)
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
vmcs12->exit_qualification,
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 0782fe599757..2c296b6abb8c 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -39,11 +39,17 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+ !refcount_read(&vcpu->kvm->users_count));
+
return to_vmx(vcpu)->nested.cached_vmcs12;
}
static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+ !refcount_read(&vcpu->kvm->users_count));
+
return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
}
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 6fef01e0536e..a3c3d2a51f47 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -274,7 +274,7 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
* simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
* enforce restriction of access to the PROVISIONKEY.
*/
- contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL);
if (!contents)
return -ENOMEM;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 9cfcfebd5f99..c67e448c6ebd 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -525,10 +525,6 @@ static const struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
-static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
-{
- vmx->segment_cache.bitmask = 0;
-}
static unsigned long host_idt_base;
@@ -4219,6 +4215,13 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+ * and freed, and must not be accessed outside of vcpu->mutex. The
+ * vCPU's cached PI NV is valid if and only if posted interrupts
+ * enabled in its vmcs12, i.e. checking the vector also checks that
+ * L1 has enabled posted interrupts for L2.
+ */
if (is_guest_mode(vcpu) &&
vector == vmx->nested.posted_intr_nv) {
/*
@@ -5804,8 +5807,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
? PFERR_PRESENT_MASK : 0;
- error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
- PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+ if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+ error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
/*
* Check that the GPA doesn't exceed physical memory limits, as that is
@@ -7969,6 +7973,7 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
kvm_cpu_cap_clear(X86_FEATURE_SGX1);
kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
}
if (vmx_umip_emulated())
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 3839afb921e2..2325f773a20b 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -752,4 +752,9 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && enable_ipiv;
}
+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h
index eb48153bfd73..bba24ed99ee6 100644
--- a/arch/x86/kvm/vmx/vmx_onhyperv.h
+++ b/arch/x86/kvm/vmx/vmx_onhyperv.h
@@ -104,6 +104,14 @@ static inline void evmcs_load(u64 phys_addr)
struct hv_vp_assist_page *vp_ap =
hv_get_vp_assist_page(smp_processor_id());
+ /*
+ * When enabling eVMCS, KVM verifies that every CPU has a valid hv_vp_assist_page()
+ * and aborts enabling the feature otherwise. CPU onlining path is also checked in
+ * vmx_hardware_enable().
+ */
+ if (KVM_BUG_ON(!vp_ap, kvm_get_running_vcpu()->kvm))
+ return;
+
if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
vp_ap->nested_control.features.directhypercall = 1;
vp_ap->current_nested_vmcs = phys_addr;
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 8060e5fc6dbd..93e020dc88f6 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -47,7 +47,7 @@ static __always_inline void vmcs_check16(unsigned long field)
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
"16-bit accessor invalid for 64-bit high field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
- "16-bit accessor invalid for 32-bit high field");
+ "16-bit accessor invalid for 32-bit field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
"16-bit accessor invalid for natural width field");
}