author		Paolo Bonzini <pbonzini@redhat.com>	2025-03-19 09:04:48 -0400
committer	Paolo Bonzini <pbonzini@redhat.com>	2025-03-19 09:04:48 -0400
commit		4d9a677596afccb91464bb9d76e2bac0717dad1f (patch)
tree		cc301456bc7b0725cd7104dddfe0d11f323d1e4e
parent		4286a3ec2595697f398ce1dc895a2f5b006dae99 (diff)
parent		e6c8728a8e2d20b262209c70a8ca67719a628833 (diff)
Merge tag 'kvm-x86-misc-6.15' of https://github.com/kvm-x86/linux into HEAD
KVM x86 misc changes for 6.15:
- Fix a bug in PIC emulation that caused KVM to emit a spurious KVM_REQ_EVENT.
- Add a helper to consolidate handling of mp_state transitions, and use it to
  clear pv_unhalted whenever a vCPU is made RUNNABLE (see the sketch after
  this list).
- Defer runtime CPUID updates until KVM emulates a CPUID instruction, to
coalesce updates when multiple pieces of vCPU state are changing, e.g. as
part of a nested transition.
- Fix a variety of nested emulation bugs, and add VMX support for synthesizing
nested VM-Exit on interception (instead of injecting #UD into L2).
- Drop "support" for PV Async #PF with proctected guests without SEND_ALWAYS,
as KVM can't get the current CPL.
- Misc cleanups
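
For reference, the mp_state helper named in the second bullet is added to
arch/x86/kvm/x86.h (see the x86.h hunk below). In essence:

	/*
	 * All mp_state writers funnel through one helper, so pv_unhalted is
	 * reliably cleared on every transition to RUNNABLE instead of at a
	 * subset of call sites.
	 */
	static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state)
	{
		vcpu->arch.mp_state = mp_state;
		if (mp_state == KVM_MP_STATE_RUNNABLE)
			vcpu->arch.pv.pv_unhalted = false;
	}

Call sites such as nested_svm_vmexit() and nested_vmx_vmexit() then collapse
to kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE).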
-rw-r--r--  arch/x86/include/asm/kvm_host.h                          |    9
-rw-r--r--  arch/x86/kvm/cpuid.c                                     |   54
-rw-r--r--  arch/x86/kvm/cpuid.h                                     |    9
-rw-r--r--  arch/x86/kvm/emulate.c                                   |    5
-rw-r--r--  arch/x86/kvm/i8259.c                                     |    2
-rw-r--r--  arch/x86/kvm/kvm_emulate.h                               |    7
-rw-r--r--  arch/x86/kvm/lapic.c                                     |   17
-rw-r--r--  arch/x86/kvm/smm.c                                       |    2
-rw-r--r--  arch/x86/kvm/svm/nested.c                                |    2
-rw-r--r--  arch/x86/kvm/svm/sev.c                                   |    7
-rw-r--r--  arch/x86/kvm/svm/svm.c                                   |   17
-rw-r--r--  arch/x86/kvm/trace.h                                     |   14
-rw-r--r--  arch/x86/kvm/vmx/nested.c                                |   18
-rw-r--r--  arch/x86/kvm/vmx/nested.h                                |   22
-rw-r--r--  arch/x86/kvm/vmx/vmx.c                                   |  118
-rw-r--r--  arch/x86/kvm/x86.c                                       |  136
-rw-r--r--  arch/x86/kvm/x86.h                                       |    7
-rw-r--r--  arch/x86/kvm/xen.c                                       |    4
-rw-r--r--  tools/testing/selftests/kvm/Makefile.kvm                 |    1
-rw-r--r--  tools/testing/selftests/kvm/x86/nested_emulation_test.c  |  146
20 files changed, 416 insertions, 181 deletions
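
A note on the CPUID deferral before the per-file diffs: rather than calling
kvm_update_cpuid_runtime() on every state change, writers below set
vcpu->arch.cpuid_dynamic_bits_dirty and the update runs lazily the next time
guest CPUID is read (kvm_cpuid()) or saved (KVM_GET_CPUID2). A minimal sketch
of the pattern; example_set_xcr0() and example_cpuid_read() are hypothetical
stand-ins for the real call sites (__kvm_set_xcr(), vmx_set_cr4(), etc.):

	/* Writers only mark the dynamic CPUID bits stale... */
	static void example_set_xcr0(struct kvm_vcpu *vcpu, u64 xcr0)
	{
		vcpu->arch.xcr0 = xcr0;
		vcpu->arch.cpuid_dynamic_bits_dirty = true;	/* no recompute here */
	}

	/* ...and readers recompute at most once per burst of changes. */
	static void example_cpuid_read(struct kvm_vcpu *vcpu)
	{
		if (vcpu->arch.cpuid_dynamic_bits_dirty)
			kvm_update_cpuid_runtime(vcpu);		/* coalesced update */
		/* ...then perform the entry lookup as before... */
	}

This is why a nested transition that toggles, say, CR4.OSXSAVE, XCR0, and XSS
pays for a single runtime update instead of three.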
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 667b93071252..9c3d57c5f679 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -881,6 +881,7 @@ struct kvm_vcpu_arch {
 	int cpuid_nent;
 	struct kvm_cpuid_entry2 *cpuid_entries;
+	bool cpuid_dynamic_bits_dirty;
 	bool is_amd_compatible;
 
 	/*
@@ -998,8 +999,8 @@ struct kvm_vcpu_arch {
 		u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */
 		u16 vec;
 		u32 id;
-		bool send_user_only;
 		u32 host_apf_flags;
+		bool send_always;
 		bool delivery_as_pf_vmexit;
 		bool pageready_pending;
 	} apf;
@@ -1355,8 +1356,6 @@ struct kvm_arch {
 	u64 shadow_mmio_value;
 
-	struct iommu_domain *iommu_domain;
-	bool iommu_noncoherent;
 #define __KVM_HAVE_ARCH_NONCOHERENT_DMA
 	atomic_t noncoherent_dma_count;
 #define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
@@ -2166,8 +2165,8 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload);
-void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
-void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
+			   bool has_error_code, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
 void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 121edf1f2a79..f863a6f93a1a 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -58,25 +58,24 @@ void __init kvm_init_xstate_sizes(void)
 u32 xstate_required_size(u64 xstate_bv, bool compacted)
 {
-	int feature_bit = 0;
 	u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+	int i;
 
 	xstate_bv &= XFEATURE_MASK_EXTEND;
-	while (xstate_bv) {
-		if (xstate_bv & 0x1) {
-			struct cpuid_xstate_sizes *xs = &xstate_sizes[feature_bit];
-			u32 offset;
-
-			/* ECX[1]: 64B alignment in compacted form */
-			if (compacted)
-				offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret;
-			else
-				offset = xs->ebx;
-			ret = max(ret, offset + xs->eax);
-		}
+	for (i = XFEATURE_YMM; i < ARRAY_SIZE(xstate_sizes) && xstate_bv; i++) {
+		struct cpuid_xstate_sizes *xs = &xstate_sizes[i];
+		u32 offset;
+
+		if (!(xstate_bv & BIT_ULL(i)))
+			continue;
 
-		xstate_bv >>= 1;
-		feature_bit++;
+		/* ECX[1]: 64B alignment in compacted form */
+		if (compacted)
+			offset = (xs->ecx & 0x2) ? ALIGN(ret, 64) : ret;
+		else
+			offset = xs->ebx;
+		ret = max(ret, offset + xs->eax);
+		xstate_bv &= ~BIT_ULL(i);
 	}
 
 	return ret;
@@ -196,6 +195,7 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu)
 }
 
 static u32 kvm_apply_cpuid_pv_features_quirk(struct kvm_vcpu *vcpu);
+static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
 
 /*
  * Check whether the supplied CPUID data is equal to what is already set for
 * the vCPU.
 */
static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
@@ -300,10 +300,12 @@ static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu,
 	guest_cpu_cap_change(vcpu, x86_feature, has_feature);
 }
 
-void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
 
+	vcpu->arch.cpuid_dynamic_bits_dirty = false;
+
 	best = kvm_find_cpuid_entry(vcpu, 1);
 	if (best) {
 		kvm_update_feature_runtime(vcpu, best, X86_FEATURE_OSXSAVE,
@@ -333,7 +335,6 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
 			cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
 		best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 }
-EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
 
 static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu)
 {
@@ -646,6 +647,9 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 	if (cpuid->nent < vcpu->arch.cpuid_nent)
 		return -E2BIG;
 
+	if (vcpu->arch.cpuid_dynamic_bits_dirty)
+		kvm_update_cpuid_runtime(vcpu);
+
 	if (copy_to_user(entries, vcpu->arch.cpuid_entries,
 			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
 		return -EFAULT;
@@ -1704,7 +1708,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		phys_as = entry->eax & 0xff;
 		g_phys_as = phys_as;
 		if (kvm_mmu_get_max_tdp_level() < 5)
-			g_phys_as = min(g_phys_as, 48);
+			g_phys_as = min(g_phys_as, 48U);
 
 		entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16);
@@ -1769,13 +1773,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 		cpuid_entry_override(entry, CPUID_8000_0022_EAX);
 
-		if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
-			ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp;
-		else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE))
-			ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE;
-		else
-			ebx.split.num_core_pmc = AMD64_NUM_COUNTERS;
-
+		ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp;
 		entry->ebx = ebx.full;
 		break;
 	}
@@ -1985,6 +1983,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
 	struct kvm_cpuid_entry2 *entry;
 	bool exact, used_max_basic = false;
 
+	if (vcpu->arch.cpuid_dynamic_bits_dirty)
+		kvm_update_cpuid_runtime(vcpu);
+
 	entry = kvm_find_cpuid_entry_index(vcpu, function, index);
 	exact = !!entry;
 
@@ -2000,7 +2001,8 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
 		*edx = entry->edx;
 		if (function == 7 && index == 0) {
 			u64 data;
-			if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
+			if ((*ebx & (feature_bit(RTM) | feature_bit(HLE))) &&
+			    !__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
 			    (data & TSX_CTRL_CPUID_CLEAR))
 				*ebx &= ~(feature_bit(RTM) | feature_bit(HLE));
 		} else if (function == 0x80000007) {
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 67d80aa72d50..d2884162a46a 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -11,7 +11,6 @@ extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly;
 
 void kvm_set_cpu_caps(void);
 void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
-void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
 						    u32 function, u32 index);
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
@@ -232,6 +231,14 @@ static __always_inline bool guest_cpu_cap_has(struct kvm_vcpu *vcpu,
 {
 	unsigned int x86_leaf = __feature_leaf(x86_feature);
 
+	/*
+	 * Except for MWAIT, querying dynamic feature bits is disallowed, so
+	 * that KVM can defer runtime updates until the next CPUID emulation.
+	 */
+	BUILD_BUG_ON(x86_feature == X86_FEATURE_APIC ||
+		     x86_feature == X86_FEATURE_OSXSAVE ||
+		     x86_feature == X86_FEATURE_OSPKE);
+
 	return vcpu->arch.cpu_caps[x86_leaf] & __feature_bit(x86_feature);
 }
 
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 60986f67c35a..1349e278cd2a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -477,8 +477,11 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
 		.dst_val    = ctxt->dst.val64,
 		.src_bytes  = ctxt->src.bytes,
 		.dst_bytes  = ctxt->dst.bytes,
+		.src_type   = ctxt->src.type,
+		.dst_type   = ctxt->dst.type,
 		.ad_bytes   = ctxt->ad_bytes,
-		.next_rip   = ctxt->eip,
+		.rip        = ctxt->eip,
+		.next_rip   = ctxt->_eip,
 	};
 
 	return ctxt->ops->intercept(ctxt, &info, stage);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 8dec646e764b..a8fb19940975 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -567,7 +567,7 @@ static void pic_irq_request(struct kvm *kvm, int level)
 {
 	struct kvm_pic *s = kvm->arch.vpic;
 
-	if (!s->output)
+	if (!s->output && level)
 		s->wakeup_needed = true;
 	s->output = level;
 }
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 73072585e164..c1df5acfacaf 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -44,7 +44,10 @@ struct x86_instruction_info {
 	u64 dst_val;	/* value of destination operand */
 	u8 src_bytes;	/* size of source operand */
 	u8 dst_bytes;	/* size of destination operand */
+	u8 src_type;	/* type of source operand */
+	u8 dst_type;	/* type of destination operand */
 	u8 ad_bytes;	/* size of src/dst address */
+	u64 rip;	/* rip of the instruction */
 	u64 next_rip;	/* rip following the instruction */
 };
 
@@ -272,8 +275,10 @@ struct operand {
 	};
 };
 
+#define X86_MAX_INSTRUCTION_LENGTH 15
+
 struct fetch_cache {
-	u8 data[15];
+	u8 data[X86_MAX_INSTRUCTION_LENGTH];
 	u8 *ptr;
 	u8 *end;
 };
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a009c94c26c2..9dbc0f5d9865 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -221,13 +221,6 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
 	}
 }
 
-static void kvm_apic_map_free(struct rcu_head *rcu)
-{
-	struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu);
-
-	kvfree(map);
-}
-
 static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
 				    struct kvm_vcpu *vcpu,
 				    bool *xapic_id_mismatch)
@@ -489,7 +482,7 @@ out:
 	mutex_unlock(&kvm->arch.apic_map_lock);
 
 	if (old)
-		call_rcu(&old->rcu, kvm_apic_map_free);
+		kvfree_rcu(old, rcu);
 
 	kvm_make_scan_ioapic_request(kvm);
 }
@@ -2593,7 +2586,7 @@ static void __kvm_apic_set_base(struct kvm_vcpu *vcpu, u64 value)
 	vcpu->arch.apic_base = value;
 
 	if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
-		kvm_update_cpuid_runtime(vcpu);
+		vcpu->arch.cpuid_dynamic_bits_dirty = true;
 
 	if (!apic)
 		return;
@@ -3397,9 +3390,9 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
 	if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
 		kvm_vcpu_reset(vcpu, true);
 		if (kvm_vcpu_is_bsp(apic->vcpu))
-			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+			kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 		else
-			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+			kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
 	}
 	if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) {
 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
@@ -3408,7 +3401,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
 			sipi_vector = apic->sipi_vector;
 			kvm_x86_call(vcpu_deliver_sipi_vector)(vcpu, sipi_vector);
-			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+			kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 		}
 	}
 	return 0;
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index e0ab7df27b66..699e551ec93b 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -358,7 +358,7 @@ void enter_smm(struct kvm_vcpu *vcpu)
 		goto error;
 #endif
 
-	kvm_update_cpuid_runtime(vcpu);
+	vcpu->arch.cpuid_dynamic_bits_dirty = true;
 	kvm_mmu_reset_context(vcpu);
 	return;
 error:
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 04c375bf1ac2..834b67672d50 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -994,7 +994,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
 	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
 	/* in case we halted in L2 */
-	svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 
 	/* Give the current vmcb to the guest */
 
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 661108d65ee7..596167d33544 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3271,7 +3271,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm)
 
 	if (kvm_ghcb_xcr0_is_valid(svm)) {
 		vcpu->arch.xcr0 = ghcb_get_xcr0(ghcb);
-		kvm_update_cpuid_runtime(vcpu);
+		vcpu->arch.cpuid_dynamic_bits_dirty = true;
 	}
 
 	/* Copy the GHCB exit information into the VMCB fields */
@@ -3855,7 +3855,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
 
 	/* Mark the vCPU as offline and not runnable */
 	vcpu->arch.pv.pv_unhalted = false;
-	vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+	kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
 
 	/* Clear use of the VMSA */
 	svm->vmcb->control.vmsa_pa = INVALID_PAGE;
@@ -3893,8 +3893,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
 	svm->vmcb->control.vmsa_pa = pfn_to_hpa(pfn);
 
 	/* Mark the vCPU as runnable */
-	vcpu->arch.pv.pv_unhalted = false;
-	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 
 	svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
 
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e67de787fc71..5176e2f6cc14 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1932,7 +1932,7 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
 
 	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
-		kvm_update_cpuid_runtime(vcpu);
+		vcpu->arch.cpuid_dynamic_bits_dirty = true;
 }
 
 static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -4137,20 +4137,23 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
 		vcpu->arch.nmi_injected = true;
 		svm->nmi_l1_to_l2 = nmi_l1_to_l2;
 		break;
-	case SVM_EXITINTINFO_TYPE_EXEPT:
+	case SVM_EXITINTINFO_TYPE_EXEPT: {
+		u32 error_code = 0;
+
 		/*
 		 * Never re-inject a #VC exception.
 		 */
 		if (vector == X86_TRAP_VC)
 			break;
 
-		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
-			u32 err = svm->vmcb->control.exit_int_info_err;
-			kvm_requeue_exception_e(vcpu, vector, err);
+		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR)
+			error_code = svm->vmcb->control.exit_int_info_err;
 
-		} else
-			kvm_requeue_exception(vcpu, vector);
+		kvm_requeue_exception(vcpu, vector,
+				      exitintinfo & SVM_EXITINTINFO_VALID_ERR,
+				      error_code);
 		break;
+	}
 	case SVM_EXITINTINFO_TYPE_INTR:
 		kvm_queue_interrupt(vcpu, vector, false);
 		break;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0b844cb97978..ccda95e53f62 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -830,12 +830,12 @@ TRACE_EVENT(kvm_emulate_insn,
 	TP_ARGS(vcpu, failed),
 
 	TP_STRUCT__entry(
-		__field(    __u64, rip		)
-		__field(    __u32, csbase	)
-		__field(    __u8,  len		)
-		__array(    __u8,  insn, 15	)
-		__field(    __u8,  flags	)
-		__field(    __u8,  failed	)
+		__field(    __u64, rip					)
+		__field(    __u32, csbase				)
+		__field(    __u8,  len					)
+		__array(    __u8,  insn, X86_MAX_INSTRUCTION_LENGTH	)
+		__field(    __u8,  flags				)
+		__field(    __u8,  failed				)
 	),
 
 	TP_fast_assign(
@@ -846,7 +846,7 @@ TRACE_EVENT(kvm_emulate_insn,
 		__entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len;
 		memcpy(__entry->insn,
 		       vcpu->arch.emulate_ctxt->fetch.data,
-		       15);
+		       X86_MAX_INSTRUCTION_LENGTH);
 		__entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt->mode);
 		__entry->failed = failed;
 	),
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ed8a3cb53961..d06e50d9c0e7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2970,7 +2970,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
 		case INTR_TYPE_SOFT_EXCEPTION:
 		case INTR_TYPE_SOFT_INTR:
 		case INTR_TYPE_PRIV_SW_EXCEPTION:
-			if (CC(vmcs12->vm_entry_instruction_len > 15) ||
+			if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) ||
 			    CC(vmcs12->vm_entry_instruction_len == 0 &&
 			    CC(!nested_cpu_has_zero_length_injection(vcpu))))
 				return -EINVAL;
@@ -3771,7 +3771,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 			break;
 		case GUEST_ACTIVITY_WAIT_SIPI:
 			vmx->nested.nested_run_pending = 0;
-			vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+			kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
 			break;
 		default:
 			break;
@@ -4618,7 +4618,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  */
 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 			   u32 vm_exit_reason, u32 exit_intr_info,
-			   unsigned long exit_qualification)
+			   unsigned long exit_qualification, u32 exit_insn_len)
 {
 	/* update exit information fields: */
 	vmcs12->vm_exit_reason = vm_exit_reason;
@@ -4646,7 +4646,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 					vm_exit_reason, exit_intr_info);
 
 		vmcs12->vm_exit_intr_info = exit_intr_info;
-		vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+		vmcs12->vm_exit_instruction_len = exit_insn_len;
 		vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 
 		/*
@@ -4930,8 +4930,9 @@ vmabort:
 * and modify vmcs12 to make it see what it would expect to see there if
 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
 */
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
-		       u32 exit_intr_info, unsigned long exit_qualification)
+void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+			 u32 exit_intr_info, unsigned long exit_qualification,
+			 u32 exit_insn_len)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -4981,7 +4982,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 
 	if (vm_exit_reason != -1)
 		prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
-			       exit_intr_info, exit_qualification);
+			       exit_intr_info, exit_qualification,
+			       exit_insn_len);
 
 	/*
 	 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
@@ -5071,7 +5073,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
 		vmx->nested.need_vmcs12_to_shadow_sync = true;
 
 	/* in case we halted in L2 */
-	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 
 	if (likely(!vmx->fail)) {
 		if (vm_exit_reason != -1)
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 2c296b6abb8c..6eedcfc91070 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -26,8 +26,26 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
 							bool from_vmentry);
 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu);
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
-		       u32 exit_intr_info, unsigned long exit_qualification);
+void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+			 u32 exit_intr_info, unsigned long exit_qualification,
+			 u32 exit_insn_len);
+
+static inline void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
+				     u32 exit_intr_info,
+				     unsigned long exit_qualification)
+{
+	u32 exit_insn_len;
+
+	if (to_vmx(vcpu)->fail || vm_exit_reason == -1 ||
+	    (vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+		exit_insn_len = 0;
+	else
+		exit_insn_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+
+	__nested_vmx_vmexit(vcpu, vm_exit_reason, exit_intr_info,
+			    exit_qualification, exit_insn_len);
+}
+
 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 3b92f893b239..1e30adc7837e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3519,7 +3519,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	vmcs_writel(GUEST_CR4, hw_cr4);
 
 	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
-		kvm_update_cpuid_runtime(vcpu);
+		vcpu->arch.cpuid_dynamic_bits_dirty = true;
 }
 
 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
@@ -7157,13 +7157,17 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
 	case INTR_TYPE_SOFT_EXCEPTION:
 		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
 		fallthrough;
-	case INTR_TYPE_HARD_EXCEPTION:
-		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
-			u32 err = vmcs_read32(error_code_field);
-			kvm_requeue_exception_e(vcpu, vector, err);
-		} else
-			kvm_requeue_exception(vcpu, vector);
+	case INTR_TYPE_HARD_EXCEPTION: {
+		u32 error_code = 0;
+
+		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK)
+			error_code = vmcs_read32(error_code_field);
+
+		kvm_requeue_exception(vcpu, vector,
+				      idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK,
+				      error_code);
 		break;
+	}
 	case INTR_TYPE_SOFT_INTR:
 		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
 		fallthrough;
@@ -8005,38 +8009,50 @@ static __init void vmx_set_cpu_caps(void)
 		kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
 }
 
-static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
-				  struct x86_instruction_info *info)
+static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu,
+				  struct x86_instruction_info *info,
+				  unsigned long *exit_qualification)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	unsigned short port;
-	bool intercept;
 	int size;
+	bool imm;
+
+	/*
+	 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+	 * VM-exits depend on the 'unconditional IO exiting' VM-execution
+	 * control.
+	 *
+	 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+	 */
+	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
 
 	if (info->intercept == x86_intercept_in ||
 	    info->intercept == x86_intercept_ins) {
 		port = info->src_val;
 		size = info->dst_bytes;
+		imm = info->src_type == OP_IMM;
 	} else {
 		port = info->dst_val;
 		size = info->src_bytes;
+		imm = info->dst_type == OP_IMM;
 	}
 
-	/*
-	 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
-	 * VM-exits depend on the 'unconditional IO exiting' VM-execution
-	 * control.
-	 *
-	 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
-	 */
-	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
-		intercept = nested_cpu_has(vmcs12,
-					   CPU_BASED_UNCOND_IO_EXITING);
-	else
-		intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
 
-	/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
-	return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+	*exit_qualification = ((unsigned long)port << 16) | (size - 1);
+
+	if (info->intercept == x86_intercept_ins ||
+	    info->intercept == x86_intercept_outs)
+		*exit_qualification |= BIT(4);
+
+	if (info->rep_prefix)
+		*exit_qualification |= BIT(5);
+
+	if (imm)
+		*exit_qualification |= BIT(6);
+
+	return nested_vmx_check_io_bitmaps(vcpu, port, size);
 }
 
 int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -8045,26 +8061,34 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu,
 			struct x86_exception *exception)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+	unsigned long exit_qualification = 0;
+	u32 vm_exit_reason;
+	u64 exit_insn_len;
 
 	switch (info->intercept) {
-	/*
-	 * RDPID causes #UD if disabled through secondary execution controls.
-	 * Because it is marked as EmulateOnUD, we need to intercept it here.
-	 * Note, RDPID is hidden behind ENABLE_RDTSCP.
-	 */
 	case x86_intercept_rdpid:
+		/*
+		 * RDPID causes #UD if not enabled through secondary execution
+		 * controls (ENABLE_RDTSCP). Note, the implicit MSR access to
+		 * TSC_AUX is NOT subject to interception, i.e. checking only
+		 * the dedicated execution control is architecturally correct.
+		 */
 		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
 			exception->vector = UD_VECTOR;
 			exception->error_code_valid = false;
 			return X86EMUL_PROPAGATE_FAULT;
 		}
-		break;
+		return X86EMUL_CONTINUE;
 
 	case x86_intercept_in:
 	case x86_intercept_ins:
 	case x86_intercept_out:
 	case x86_intercept_outs:
-		return vmx_check_intercept_io(vcpu, info);
+		if (!vmx_is_io_intercepted(vcpu, info, &exit_qualification))
+			return X86EMUL_CONTINUE;
+
+		vm_exit_reason = EXIT_REASON_IO_INSTRUCTION;
+		break;
 
 	case x86_intercept_lgdt:
 	case x86_intercept_lidt:
@@ -8077,7 +8101,24 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu,
 		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
 			return X86EMUL_CONTINUE;
 
-		/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+		if (info->intercept == x86_intercept_lldt ||
+		    info->intercept == x86_intercept_ltr ||
+		    info->intercept == x86_intercept_sldt ||
+		    info->intercept == x86_intercept_str)
+			vm_exit_reason = EXIT_REASON_LDTR_TR;
+		else
+			vm_exit_reason = EXIT_REASON_GDTR_IDTR;
+		/*
+		 * FIXME: Decode the ModR/M to generate the correct exit
+		 * qualification for memory operands.
+		 */
+		break;
+
+	case x86_intercept_hlt:
+		if (!nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING))
+			return X86EMUL_CONTINUE;
+
+		vm_exit_reason = EXIT_REASON_HLT;
 		break;
 
 	case x86_intercept_pause:
@@ -8090,17 +8131,24 @@ int vmx_check_intercept(struct kvm_vcpu *vcpu,
 		 * the PAUSE.
 		 */
 		if ((info->rep_prefix != REPE_PREFIX) ||
-		    !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING))
+		    !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
 			return X86EMUL_CONTINUE;
+		vm_exit_reason = EXIT_REASON_PAUSE_INSTRUCTION;
 		break;
 
 	/* TODO: check more intercepts... */
 	default:
-		break;
+		return X86EMUL_UNHANDLEABLE;
 	}
 
-	return X86EMUL_UNHANDLEABLE;
+	exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
+	if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
+		return X86EMUL_UNHANDLEABLE;
+
+	__nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification,
+			    exit_insn_len);
+	return X86EMUL_INTERCEPTED;
 }
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b64ab350bcd..82547da8faf3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -800,9 +800,9 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
 	ex->payload = payload;
 }
 
-static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
-				   unsigned nr, bool has_error, u32 error_code,
-				   bool has_payload, unsigned long payload, bool reinject)
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr,
+				   bool has_error, u32 error_code,
+				   bool has_payload, unsigned long payload)
 {
 	u32 prev_nr;
 	int class1, class2;
@@ -810,13 +810,10 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	/*
-	 * If the exception is destined for L2 and isn't being reinjected,
-	 * morph it to a VM-Exit if L1 wants to intercept the exception.  A
-	 * previously injected exception is not checked because it was checked
-	 * when it was original queued, and re-checking is incorrect if _L1_
-	 * injected the exception, in which case it's exempt from interception.
+	 * If the exception is destined for L2, morph it to a VM-Exit if L1
+	 * wants to intercept the exception.
 	 */
-	if (!reinject && is_guest_mode(vcpu) &&
+	if (is_guest_mode(vcpu) &&
 	    kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
 		kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
 					   has_payload, payload);
@@ -825,28 +822,9 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
 	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
 	queue:
-		if (reinject) {
-			/*
-			 * On VM-Entry, an exception can be pending if and only
-			 * if event injection was blocked by nested_run_pending.
-			 * In that case, however, vcpu_enter_guest() requests an
-			 * immediate exit, and the guest shouldn't proceed far
-			 * enough to need reinjection.
-			 */
-			WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
-			vcpu->arch.exception.injected = true;
-			if (WARN_ON_ONCE(has_payload)) {
-				/*
-				 * A reinjected event has already
-				 * delivered its payload.
-				 */
-				has_payload = false;
-				payload = 0;
-			}
-		} else {
-			vcpu->arch.exception.pending = true;
-			vcpu->arch.exception.injected = false;
-		}
+		vcpu->arch.exception.pending = true;
+		vcpu->arch.exception.injected = false;
+
 		vcpu->arch.exception.has_error_code = has_error;
 		vcpu->arch.exception.vector = nr;
 		vcpu->arch.exception.error_code = error_code;
@@ -887,30 +865,53 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
-	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
+	kvm_multiple_exception(vcpu, nr, false, 0, false, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 
-void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
-{
-	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception);
-
 void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
 			   unsigned long payload)
 {
-	kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
+	kvm_multiple_exception(vcpu, nr, false, 0, true, payload);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
 
 static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
 				    u32 error_code, unsigned long payload)
 {
-	kvm_multiple_exception(vcpu, nr, true, error_code,
-			       true, payload, false);
+	kvm_multiple_exception(vcpu, nr, true, error_code, true, payload);
 }
 
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
+			   bool has_error_code, u32 error_code)
+{
+
+	/*
+	 * On VM-Entry, an exception can be pending if and only if event
+	 * injection was blocked by nested_run_pending.  In that case, however,
+	 * vcpu_enter_guest() requests an immediate exit, and the guest
+	 * shouldn't proceed far enough to need reinjection.
+	 */
+	WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
+
+	/*
+	 * Do not check for interception when injecting an event for L2, as the
+	 * exception was checked for intercept when it was original queued, and
+	 * re-checking is incorrect if _L1_ injected the exception, in which
+	 * case it's exempt from interception.
+	 */
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+	vcpu->arch.exception.injected = true;
+	vcpu->arch.exception.has_error_code = has_error_code;
+	vcpu->arch.exception.vector = nr;
+	vcpu->arch.exception.error_code = error_code;
+	vcpu->arch.exception.has_payload = false;
+	vcpu->arch.exception.payload = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception);
+
 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 {
 	if (err)
@@ -980,16 +981,10 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
-	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
+	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 
-void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
-{
-	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
-
 /*
  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
  * a #GP and return false.
@@ -1264,7 +1259,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 	vcpu->arch.xcr0 = xcr0;
 
 	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
-		kvm_update_cpuid_runtime(vcpu);
+		vcpu->arch.cpuid_dynamic_bits_dirty = true;
 
 	return 0;
 }
@@ -2080,10 +2075,20 @@ EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
 
 static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
 {
-	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) &&
-	    !guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT))
+	bool enabled;
+
+	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS))
+		goto emulate_as_nop;
+
+	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT))
+		enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT);
+	else
+		enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT;
+
+	if (!enabled)
 		return kvm_handle_invalid_op(vcpu);
 
+emulate_as_nop:
 	pr_warn_once("%s instruction emulated as NOP!\n", insn);
 	return kvm_emulate_as_nop(vcpu);
 }
@@ -3544,7 +3549,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 					sizeof(u64)))
 		return 1;
 
-	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+	vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS);
 	vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
 
 	kvm_async_pf_wakeup_all(vcpu);
@@ -3889,7 +3894,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3))
 				return 1;
 			vcpu->arch.ia32_misc_enable_msr = data;
-			kvm_update_cpuid_runtime(vcpu);
+			vcpu->arch.cpuid_dynamic_bits_dirty = true;
 		} else {
 			vcpu->arch.ia32_misc_enable_msr = data;
 		}
@@ -3924,7 +3929,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (data & ~kvm_caps.supported_xss)
 			return 1;
 		vcpu->arch.ia32_xss = data;
-		kvm_update_cpuid_runtime(vcpu);
+		vcpu->arch.cpuid_dynamic_bits_dirty = true;
 		break;
 	case MSR_SMI_COUNT:
 		if (!msr_info->host_initiated)
@@ -11220,9 +11225,7 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
 		switch(vcpu->arch.mp_state) {
 		case KVM_MP_STATE_HALTED:
 		case KVM_MP_STATE_AP_RESET_HOLD:
-			vcpu->arch.pv.pv_unhalted = false;
-			vcpu->arch.mp_state =
-				KVM_MP_STATE_RUNNABLE;
+			kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 			fallthrough;
 		case KVM_MP_STATE_RUNNABLE:
 			vcpu->arch.apf.halted = false;
@@ -11299,9 +11302,8 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
 	++vcpu->stat.halt_exits;
 	if (lapic_in_kernel(vcpu)) {
 		if (kvm_vcpu_has_events(vcpu))
-			vcpu->arch.pv.pv_unhalted = false;
-		else
-			vcpu->arch.mp_state = state;
+			state = KVM_MP_STATE_RUNNABLE;
+		kvm_set_mp_state(vcpu, state);
 		return 1;
 	} else {
 		vcpu->run->exit_reason = reason;
@@ -11821,10 +11823,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 		goto out;
 
 	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
-		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+		kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED);
 		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
 	} else
-		vcpu->arch.mp_state = mp_state->mp_state;
+		kvm_set_mp_state(vcpu, mp_state->mp_state);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	ret = 0;
@@ -11951,7 +11953,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
 	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
 	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
 	    !is_protmode(vcpu))
-		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+		kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 
 	return 0;
 }
@@ -12254,9 +12256,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
 
 	if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
-		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+		kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 	else
-		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+		kvm_set_mp_state(vcpu, KVM_MP_STATE_UNINITIALIZED);
 
 	r = kvm_mmu_create(vcpu);
 	if (r < 0)
@@ -13383,8 +13385,8 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
 	if (!kvm_pv_async_pf_enabled(vcpu))
 		return false;
 
-	if (vcpu->arch.apf.send_user_only &&
-	    kvm_x86_call(get_cpl)(vcpu) == 0)
+	if (!vcpu->arch.apf.send_always &&
+	    (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu)))
 		return false;
 
 	if (is_guest_mode(vcpu)) {
@@ -13474,7 +13476,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 	}
 
 	vcpu->arch.apf.halted = false;
-	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+	kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 }
 
 void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 91e50a513100..5c1fd5230cee 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -121,6 +121,13 @@ static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
 	return vcpu->arch.last_vmentry_cpu != -1;
 }
 
+static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state)
+{
+	vcpu->arch.mp_state = mp_state;
+	if (mp_state == KVM_MP_STATE_RUNNABLE)
+		vcpu->arch.pv.pv_unhalted = false;
+}
+
 static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.exception.pending ||
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index a909b817b9c0..f65ca27888e9 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1480,7 +1480,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
 	set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);
 
 	if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
-		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+		kvm_set_mp_state(vcpu, KVM_MP_STATE_HALTED);
 
 		if (sched_poll.timeout)
 			mod_timer(&vcpu->arch.xen.poll_timer,
@@ -1491,7 +1491,7 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
 		if (sched_poll.timeout)
 			del_timer(&vcpu->arch.xen.poll_timer);
 
-		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+		kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
 	}
 
 	vcpu->arch.xen.poll_evtchn = 0;
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 4277b983cace..f773f8f99249 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -69,6 +69,7 @@ TEST_GEN_PROGS_x86 += x86/hyperv_tlb_flush
 TEST_GEN_PROGS_x86 += x86/kvm_clock_test
 TEST_GEN_PROGS_x86 += x86/kvm_pv_test
 TEST_GEN_PROGS_x86 += x86/monitor_mwait_test
+TEST_GEN_PROGS_x86 += x86/nested_emulation_test
 TEST_GEN_PROGS_x86 += x86/nested_exceptions_test
 TEST_GEN_PROGS_x86 += x86/platform_info_test
 TEST_GEN_PROGS_x86 += x86/pmu_counters_test
diff --git a/tools/testing/selftests/kvm/x86/nested_emulation_test.c b/tools/testing/selftests/kvm/x86/nested_emulation_test.c
new file mode 100644
index 000000000000..abc824dba04f
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/nested_emulation_test.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+#include "svm_util.h"
+
+enum {
+	SVM_F,
+	VMX_F,
+	NR_VIRTUALIZATION_FLAVORS,
+};
+
+struct emulated_instruction {
+	const char name[32];
+	uint8_t opcode[15];
+	uint32_t exit_reason[NR_VIRTUALIZATION_FLAVORS];
+};
+
+static struct emulated_instruction instructions[] = {
+	{
+		.name = "pause",
+		.opcode = { 0xf3, 0x90 },
+		.exit_reason = { SVM_EXIT_PAUSE,
+				 EXIT_REASON_PAUSE_INSTRUCTION, }
+	},
+	{
+		.name = "hlt",
+		.opcode = { 0xf4 },
+		.exit_reason = { SVM_EXIT_HLT,
+				 EXIT_REASON_HLT, }
+	},
+};
+
+static uint8_t kvm_fep[] = { 0x0f, 0x0b, 0x6b, 0x76, 0x6d };	/* ud2 ; .ascii "kvm" */
+static uint8_t l2_guest_code[sizeof(kvm_fep) + 15];
+static uint8_t *l2_instruction = &l2_guest_code[sizeof(kvm_fep)];
+
+static uint32_t get_instruction_length(struct emulated_instruction *insn)
+{
+	uint32_t i;
+
+	for (i = 0; i < ARRAY_SIZE(insn->opcode) && insn->opcode[i]; i++)
+		;
+
+	return i;
+}
+
+static void guest_code(void *test_data)
+{
+	int f = this_cpu_has(X86_FEATURE_SVM) ? SVM_F : VMX_F;
+	int i;
+
+	memcpy(l2_guest_code, kvm_fep, sizeof(kvm_fep));
+
+	if (f == SVM_F) {
+		struct svm_test_data *svm = test_data;
+		struct vmcb *vmcb = svm->vmcb;
+
+		generic_svm_setup(svm, NULL, NULL);
+		vmcb->save.idtr.limit = 0;
+		vmcb->save.rip = (u64)l2_guest_code;
+
+		vmcb->control.intercept |= BIT_ULL(INTERCEPT_SHUTDOWN) |
+					   BIT_ULL(INTERCEPT_PAUSE) |
+					   BIT_ULL(INTERCEPT_HLT);
+		vmcb->control.intercept_exceptions = 0;
+	} else {
+		GUEST_ASSERT(prepare_for_vmx_operation(test_data));
+		GUEST_ASSERT(load_vmcs(test_data));
+
+		prepare_vmcs(test_data, NULL, NULL);
+		GUEST_ASSERT(!vmwrite(GUEST_IDTR_LIMIT, 0));
+		GUEST_ASSERT(!vmwrite(GUEST_RIP, (u64)l2_guest_code));
+		GUEST_ASSERT(!vmwrite(EXCEPTION_BITMAP, 0));
+
+		vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmreadz(CPU_BASED_VM_EXEC_CONTROL) |
+						   CPU_BASED_PAUSE_EXITING |
+						   CPU_BASED_HLT_EXITING);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(instructions); i++) {
+		struct emulated_instruction *insn = &instructions[i];
+		uint32_t insn_len = get_instruction_length(insn);
+		uint32_t exit_insn_len;
+		u32 exit_reason;
+
+		/*
+		 * Copy the target instruction to the L2 code stream, and fill
+		 * the remaining bytes with INT3s so that a missed intercept
+		 * results in a consistent failure mode (SHUTDOWN).
+		 */
+		memcpy(l2_instruction, insn->opcode, insn_len);
+		memset(l2_instruction + insn_len, 0xcc, sizeof(insn->opcode) - insn_len);
+
+		if (f == SVM_F) {
+			struct svm_test_data *svm = test_data;
+			struct vmcb *vmcb = svm->vmcb;
+
+			run_guest(vmcb, svm->vmcb_gpa);
+			exit_reason = vmcb->control.exit_code;
+			exit_insn_len = vmcb->control.next_rip - vmcb->save.rip;
+			GUEST_ASSERT_EQ(vmcb->save.rip, (u64)l2_instruction);
+		} else {
+			GUEST_ASSERT_EQ(i ? vmresume() : vmlaunch(), 0);
+			exit_reason = vmreadz(VM_EXIT_REASON);
+			exit_insn_len = vmreadz(VM_EXIT_INSTRUCTION_LEN);
+			GUEST_ASSERT_EQ(vmreadz(GUEST_RIP), (u64)l2_instruction);
+		}
+
+		__GUEST_ASSERT(exit_reason == insn->exit_reason[f],
+			       "Wanted exit_reason '0x%x' for '%s', got '0x%x'",
+			       insn->exit_reason[f], insn->name, exit_reason);
+
+		__GUEST_ASSERT(exit_insn_len == insn_len,
+			       "Wanted insn_len '%u' for '%s', got '%u'",
+			       insn_len, insn->name, exit_insn_len);
+	}
+
+	GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+	vm_vaddr_t nested_test_data_gva;
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	TEST_REQUIRE(is_forced_emulation_enabled);
+	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM) || kvm_cpu_has(X86_FEATURE_VMX));
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+	vm_enable_cap(vm, KVM_CAP_EXCEPTION_PAYLOAD, -2ul);
+
+	if (kvm_cpu_has(X86_FEATURE_SVM))
+		vcpu_alloc_svm(vm, &nested_test_data_gva);
+	else
+		vcpu_alloc_vmx(vm, &nested_test_data_gva);
+
+	vcpu_args_set(vcpu, 1, nested_test_data_gva);
+
+	vcpu_run(vcpu);
+	TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_DONE);
+
+	kvm_vm_free(vm);
+}
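
A closing cross-reference: the exit_insn_len the selftest asserts on is the
value vmx_check_intercept() now derives from the emulator-provided RIPs (see
the vmx.c hunk above); in essence:

	exit_insn_len = abs_diff((s64)info->next_rip, (s64)info->rip);
	if (!exit_insn_len || exit_insn_len > X86_MAX_INSTRUCTION_LENGTH)
		return X86EMUL_UNHANDLEABLE;

Note the test requires forced emulation (is_forced_emulation_enabled, i.e.
KVM's force_emulation_prefix module parameter), since the ud2 + "kvm" prefix
is what pushes the intercepted PAUSE/HLT through the emulator in the first
place.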