Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--   arch/x86/kvm/x86.c   2367
1 file changed, 1381 insertions(+), 986 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2e713480933a..0c6d899d53dd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -59,7 +59,6 @@
#include <linux/sched/stat.h>
#include <linux/sched/isolation.h>
#include <linux/mem_encrypt.h>
-#include <linux/entry-kvm.h>
#include <linux/suspend.h>
#include <linux/smp.h>
@@ -90,7 +89,6 @@
#include "trace.h"
#define MAX_IO_MSRS 256
-#define KVM_MAX_MCE_BANKS 32
/*
* Note, kvm_caps fields should *never* have default values, all fields must be
@@ -98,10 +96,10 @@
* vendor module being reloaded with different module parameters.
*/
struct kvm_caps kvm_caps __read_mostly;
-EXPORT_SYMBOL_GPL(kvm_caps);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_caps);
struct kvm_host_values kvm_host __read_mostly;
-EXPORT_SYMBOL_GPL(kvm_host);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_host);
#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
@@ -119,8 +117,6 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif
-static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
-
#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
@@ -139,6 +135,9 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static DEFINE_MUTEX(vendor_module_lock);
+static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
+static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+
struct kvm_x86_ops kvm_x86_ops __read_mostly;
#define KVM_X86_OP(func) \
@@ -155,24 +154,18 @@ module_param(ignore_msrs, bool, 0644);
bool __read_mostly report_ignored_msrs = true;
module_param(report_ignored_msrs, bool, 0644);
-EXPORT_SYMBOL_GPL(report_ignored_msrs);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(report_ignored_msrs);
unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, 0644);
-static bool __read_mostly kvmclock_periodic_sync = true;
-module_param(kvmclock_periodic_sync, bool, 0444);
-
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, 0644);
-static bool __read_mostly vector_hashing = true;
-module_param(vector_hashing, bool, 0444);
-
bool __read_mostly enable_vmware_backdoor = false;
module_param(enable_vmware_backdoor, bool, 0444);
-EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_vmware_backdoor);
/*
* Flags to manipulate forced emulation behavior (any non-zero value will
@@ -187,7 +180,7 @@ module_param(pi_inject_timer, bint, 0644);
/* Enable/disable PMU virtualization */
bool __read_mostly enable_pmu = true;
-EXPORT_SYMBOL_GPL(enable_pmu);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu);
module_param(enable_pmu, bool, 0444);
bool __read_mostly eager_page_split = true;
@@ -214,20 +207,35 @@ struct kvm_user_return_msrs {
};
u32 __read_mostly kvm_nr_uret_msrs;
-EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_nr_uret_msrs);
static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
-static struct kvm_user_return_msrs __percpu *user_return_msrs;
+static DEFINE_PER_CPU(struct kvm_user_return_msrs, user_return_msrs);
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
| XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
+#define XFEATURE_MASK_CET_ALL (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL)
+/*
+ * Note, KVM supports exposing PT to the guest, but does not support context
+ * switching PT via XSTATE (KVM's PT virtualization relies on perf; swapping
+ * PT via guest XSTATE would clobber perf state), i.e. KVM doesn't support
+ * IA32_XSS[bit 8] (guests can/must use RDMSR/WRMSR to save/restore PT MSRs).
+ */
+#define KVM_SUPPORTED_XSS (XFEATURE_MASK_CET_ALL)
+
bool __read_mostly allow_smaller_maxphyaddr = 0;
-EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(allow_smaller_maxphyaddr);
bool __read_mostly enable_apicv = true;
-EXPORT_SYMBOL_GPL(enable_apicv);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_apicv);
+
+bool __read_mostly enable_ipiv = true;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_ipiv);
+
+bool __read_mostly enable_device_posted_irqs = true;
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_device_posted_irqs);
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
KVM_GENERIC_VM_STATS(),
@@ -332,7 +340,11 @@ static const u32 msrs_to_save_base[] = {
MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
MSR_IA32_UMWAIT_CONTROL,
- MSR_IA32_XFD, MSR_IA32_XFD_ERR,
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS,
+
+ MSR_IA32_U_CET, MSR_IA32_S_CET,
+ MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP,
+ MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB,
};
static const u32 msrs_to_save_pmu[] = {
@@ -364,6 +376,7 @@ static const u32 msrs_to_save_pmu[] = {
MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET,
};
static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
@@ -559,28 +572,30 @@ static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
vcpu->arch.apf.gfns[i] = ~0;
}
+static void kvm_destroy_user_return_msrs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ WARN_ON_ONCE(per_cpu(user_return_msrs, cpu).registered);
+
+ kvm_nr_uret_msrs = 0;
+}
+
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
struct kvm_user_return_msrs *msrs
= container_of(urn, struct kvm_user_return_msrs, urn);
struct kvm_user_return_msr_values *values;
- unsigned long flags;
- /*
- * Disabling irqs at this point since the following code could be
- * interrupted and executed through kvm_arch_disable_virtualization_cpu()
- */
- local_irq_save(flags);
- if (msrs->registered) {
- msrs->registered = false;
- user_return_notifier_unregister(urn);
- }
- local_irq_restore(flags);
+ msrs->registered = false;
+ user_return_notifier_unregister(urn);
+
for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
values = &msrs->values[slot];
if (values->host != values->curr) {
- wrmsrl(kvm_uret_msrs_list[slot], values->host);
+ wrmsrq(kvm_uret_msrs_list[slot], values->host);
values->curr = values->host;
}
}
@@ -592,10 +607,10 @@ static int kvm_probe_user_return_msr(u32 msr)
int ret;
preempt_disable();
- ret = rdmsrl_safe(msr, &val);
+ ret = rdmsrq_safe(msr, &val);
if (ret)
goto out;
- ret = wrmsrl_safe(msr, val);
+ ret = wrmsrq_safe(msr, val);
out:
preempt_enable();
return ret;
@@ -611,7 +626,7 @@ int kvm_add_user_return_msr(u32 msr)
kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
return kvm_nr_uret_msrs++;
}
-EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_add_user_return_msr);
int kvm_find_user_return_msr(u32 msr)
{
@@ -623,46 +638,57 @@ int kvm_find_user_return_msr(u32 msr)
}
return -1;
}
-EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_find_user_return_msr);
static void kvm_user_return_msr_cpu_online(void)
{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
u64 value;
int i;
for (i = 0; i < kvm_nr_uret_msrs; ++i) {
- rdmsrl_safe(kvm_uret_msrs_list[i], &value);
+ rdmsrq_safe(kvm_uret_msrs_list[i], &value);
msrs->values[i].host = value;
msrs->values[i].curr = value;
}
}
+static void kvm_user_return_register_notifier(struct kvm_user_return_msrs *msrs)
+{
+ if (!msrs->registered) {
+ msrs->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&msrs->urn);
+ msrs->registered = true;
+ }
+}
+
int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
int err;
value = (value & mask) | (msrs->values[slot].host & ~mask);
if (value == msrs->values[slot].curr)
return 0;
- err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
+ err = wrmsrq_safe(kvm_uret_msrs_list[slot], value);
if (err)
return 1;
msrs->values[slot].curr = value;
- if (!msrs->registered) {
- msrs->urn.on_user_return = kvm_on_user_return;
- user_return_notifier_register(&msrs->urn);
- msrs->registered = true;
- }
+ kvm_user_return_register_notifier(msrs);
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_user_return_msr);
+
+u64 kvm_get_user_return_msr(unsigned int slot)
+{
+ return this_cpu_ptr(&user_return_msrs)->values[slot].curr;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_user_return_msr);
static void drop_user_return_notifiers(void)
{
- struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(&user_return_msrs);
if (msrs->registered)
kvm_on_user_return(&msrs->urn);
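For context (not part of this patch): a minimal sketch of how a vendor module is expected to use the user-return MSR API modified above, based only on the signatures visible in this diff. The example_* names and the choice of MSR_TSC_AUX are illustrative assumptions, not code from the kernel.

/* Illustrative sketch only -- not part of this diff. */
static int example_slot;	/* hypothetical cache of the returned slot index */

static void example_hardware_setup(void)
{
	/* Register the MSR once; the return value indexes kvm_uret_msrs_list. */
	example_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
}

static void example_prepare_switch_to_guest(u64 guest_val)
{
	/*
	 * Write the guest value (skipped if it already matches ->curr) and
	 * register the user-return notifier; kvm_on_user_return() restores
	 * the host value when this CPU next returns to userspace.
	 */
	kvm_set_user_return_msr(example_slot, guest_val, -1ull);
}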
@@ -680,7 +706,7 @@ noinstr void kvm_spurious_fault(void)
/* Fault while not rebooting. We want the trace. */
BUG_ON(!kvm_rebooting);
}
-EXPORT_SYMBOL_GPL(kvm_spurious_fault);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spurious_fault);
#define EXCPT_BENIGN 0
#define EXCPT_CONTRIBUTORY 1
@@ -785,7 +811,7 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
ex->has_payload = false;
ex->payload = 0;
}
-EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_deliver_exception_payload);
static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
bool has_error_code, u32 error_code,
@@ -802,9 +828,9 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto
ex->payload = payload;
}
-static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
- unsigned nr, bool has_error, u32 error_code,
- bool has_payload, unsigned long payload, bool reinject)
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr,
+ bool has_error, u32 error_code,
+ bool has_payload, unsigned long payload)
{
u32 prev_nr;
int class1, class2;
@@ -812,13 +838,10 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
kvm_make_request(KVM_REQ_EVENT, vcpu);
/*
- * If the exception is destined for L2 and isn't being reinjected,
- * morph it to a VM-Exit if L1 wants to intercept the exception. A
- * previously injected exception is not checked because it was checked
- * when it was original queued, and re-checking is incorrect if _L1_
- * injected the exception, in which case it's exempt from interception.
+ * If the exception is destined for L2, morph it to a VM-Exit if L1
+ * wants to intercept the exception.
*/
- if (!reinject && is_guest_mode(vcpu) &&
+ if (is_guest_mode(vcpu) &&
kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
has_payload, payload);
@@ -827,28 +850,9 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
queue:
- if (reinject) {
- /*
- * On VM-Entry, an exception can be pending if and only
- * if event injection was blocked by nested_run_pending.
- * In that case, however, vcpu_enter_guest() requests an
- * immediate exit, and the guest shouldn't proceed far
- * enough to need reinjection.
- */
- WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
- vcpu->arch.exception.injected = true;
- if (WARN_ON_ONCE(has_payload)) {
- /*
- * A reinjected event has already
- * delivered its payload.
- */
- has_payload = false;
- payload = 0;
- }
- } else {
- vcpu->arch.exception.pending = true;
- vcpu->arch.exception.injected = false;
- }
+ vcpu->arch.exception.pending = true;
+ vcpu->arch.exception.injected = false;
+
vcpu->arch.exception.has_error_code = has_error;
vcpu->arch.exception.vector = nr;
vcpu->arch.exception.error_code = error_code;
@@ -889,30 +893,53 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
- kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0);
}
-EXPORT_SYMBOL_GPL(kvm_queue_exception);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception);
-void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
-{
- kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
-}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception);
void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
unsigned long payload)
{
- kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
+ kvm_multiple_exception(vcpu, nr, false, 0, true, payload);
}
-EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_p);
static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
u32 error_code, unsigned long payload)
{
- kvm_multiple_exception(vcpu, nr, true, error_code,
- true, payload, false);
+ kvm_multiple_exception(vcpu, nr, true, error_code, true, payload);
}
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned int nr,
+ bool has_error_code, u32 error_code)
+{
+
+ /*
+ * On VM-Entry, an exception can be pending if and only if event
+ * injection was blocked by nested_run_pending. In that case, however,
+ * vcpu_enter_guest() requests an immediate exit, and the guest
+ * shouldn't proceed far enough to need reinjection.
+ */
+ WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
+
+ /*
+ * Do not check for interception when injecting an event for L2, as the
+ * exception was checked for intercept when it was originally queued, and
+ * re-checking is incorrect if _L1_ injected the exception, in which
+ * case it's exempt from interception.
+ */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ vcpu->arch.exception.injected = true;
+ vcpu->arch.exception.has_error_code = has_error_code;
+ vcpu->arch.exception.vector = nr;
+ vcpu->arch.exception.error_code = error_code;
+ vcpu->arch.exception.has_payload = false;
+ vcpu->arch.exception.payload = 0;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_requeue_exception);
+
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
if (err)
@@ -922,7 +949,7 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
return 1;
}
-EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_complete_insn_gp);
static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
{
@@ -972,7 +999,7 @@ void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
fault_mmu->inject_page_fault(vcpu, fault);
}
-EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_emulated_page_fault);
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
@@ -982,15 +1009,9 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu)
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
-}
-EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
-
-void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
-{
- kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0);
}
-EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_queue_exception_e);
/*
* Checks if cpl <= required_cpl; if true, return true. Otherwise queue
@@ -1012,7 +1033,14 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
kvm_queue_exception(vcpu, UD_VECTOR);
return false;
}
-EXPORT_SYMBOL_GPL(kvm_require_dr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_require_dr);
+
+static bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
+{
+ u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
+
+ return (vcpu->arch.apf.msr_en_val & mask) == mask;
+}
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
@@ -1067,7 +1095,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
return 1;
}
-EXPORT_SYMBOL_GPL(load_pdptrs);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(load_pdptrs);
static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
@@ -1106,21 +1134,26 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
}
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_async_pf_hash_reset(vcpu);
-
/*
* Clearing CR0.PG is defined to flush the TLB from the guest's
* perspective.
*/
if (!(cr0 & X86_CR0_PG))
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ /*
+ * Check for async #PF completion events when enabling paging,
+ * as the vCPU may have previously encountered async #PFs (it's
+ * entirely legal for the guest to toggle paging on/off without
+ * waiting for the async #PF queue to drain).
+ */
+ else if (kvm_pv_async_pf_enabled(vcpu))
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
}
if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr0);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
@@ -1155,44 +1188,53 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
(is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
return 1;
+ if (!(cr0 & X86_CR0_WP) && kvm_is_cr4_bit_set(vcpu, X86_CR4_CET))
+ return 1;
+
kvm_x86_call(set_cr0)(vcpu, cr0);
kvm_post_set_cr0(vcpu, old_cr0, cr0);
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_cr0);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
-EXPORT_SYMBOL_GPL(kvm_lmsw);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw);
-void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_xfeatures(struct kvm_vcpu *vcpu, bool load_guest)
{
if (vcpu->arch.guest_state_protected)
return;
- if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
+ if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE))
+ return;
- if (vcpu->arch.xcr0 != kvm_host.xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+ if (vcpu->arch.xcr0 != kvm_host.xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK,
+ load_guest ? vcpu->arch.xcr0 : kvm_host.xcr0);
- if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != kvm_host.xss)
- wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
- }
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
+ vcpu->arch.ia32_xss != kvm_host.xss)
+ wrmsrq(MSR_IA32_XSS, load_guest ? vcpu->arch.ia32_xss : kvm_host.xss);
+}
+
+static void kvm_load_guest_pkru(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.guest_state_protected)
+ return;
if (cpu_feature_enabled(X86_FEATURE_PKU) &&
vcpu->arch.pkru != vcpu->arch.host_pkru &&
((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
- write_pkru(vcpu->arch.pkru);
+ wrpkru(vcpu->arch.pkru);
}
-EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
-void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
+static void kvm_load_host_pkru(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.guest_state_protected)
return;
@@ -1202,21 +1244,9 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
- write_pkru(vcpu->arch.host_pkru);
- }
-
- if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
-
- if (vcpu->arch.xcr0 != kvm_host.xcr0)
- xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
-
- if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
- vcpu->arch.ia32_xss != kvm_host.xss)
- wrmsrl(MSR_IA32_XSS, kvm_host.xss);
+ wrpkru(vcpu->arch.host_pkru);
}
-
}
-EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
#ifdef CONFIG_X86_64
static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
@@ -1225,7 +1255,7 @@ static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
}
#endif
-static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
u64 xcr0 = xcr;
u64 old_xcr0 = vcpu->arch.xcr0;
@@ -1266,9 +1296,10 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
vcpu->arch.xcr0 = xcr0;
if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
return 0;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_set_xcr);
int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
{
@@ -1281,19 +1312,7 @@ int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
-
-bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- if (cr4 & cr4_reserved_bits)
- return false;
-
- if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
- return false;
-
- return true;
-}
-EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_xsetbv);
static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
@@ -1341,7 +1360,7 @@ void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned lon
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_post_set_cr4);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
@@ -1366,13 +1385,16 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
}
+ if ((cr4 & X86_CR4_CET) && !kvm_is_cr0_bit_set(vcpu, X86_CR0_WP))
+ return 1;
+
kvm_x86_call(set_cr4)(vcpu, cr4);
kvm_post_set_cr4(vcpu, old_cr4, cr4);
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_cr4);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr4);
static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
{
@@ -1464,7 +1486,7 @@ handle_tlb_flush:
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_cr3);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr3);
int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
@@ -1476,7 +1498,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
vcpu->arch.cr8 = cr8;
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_cr8);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
@@ -1485,7 +1507,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
else
return vcpu->arch.cr8;
}
-EXPORT_SYMBOL_GPL(kvm_get_cr8);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_cr8);
static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
{
@@ -1510,16 +1532,16 @@ void kvm_update_dr7(struct kvm_vcpu *vcpu)
if (dr7 & DR7_BP_EN_MASK)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
-EXPORT_SYMBOL_GPL(kvm_update_dr7);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_update_dr7);
static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
{
u64 fixed = DR6_FIXED_1;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_RTM))
fixed |= DR6_RTM;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
fixed |= DR6_BUS_LOCK;
return fixed;
}
@@ -1551,7 +1573,7 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_dr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_dr);
unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
{
@@ -1568,14 +1590,14 @@ unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
return vcpu->arch.dr7;
}
}
-EXPORT_SYMBOL_GPL(kvm_get_dr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dr);
int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
{
- u32 ecx = kvm_rcx_read(vcpu);
+ u32 pmc = kvm_rcx_read(vcpu);
u64 data;
- if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
+ if (kvm_pmu_rdpmc(vcpu, pmc, &data)) {
kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -1584,7 +1606,7 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
kvm_rdx_write(vcpu, data >> 32);
return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdpmc);
/*
* Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
@@ -1603,7 +1625,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \
- ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO)
+ ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO)
static u64 kvm_get_arch_capabilities(void)
{
@@ -1637,6 +1659,8 @@ static u64 kvm_get_arch_capabilities(void)
data |= ARCH_CAP_MDS_NO;
if (!boot_cpu_has_bug(X86_BUG_RFDS))
data |= ARCH_CAP_RFDS_NO;
+ if (!boot_cpu_has_bug(X86_BUG_ITS))
+ data |= ARCH_CAP_ITS_NO;
if (!boot_cpu_has(X86_FEATURE_RTM)) {
/*
@@ -1679,7 +1703,7 @@ static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
*data = MSR_PLATFORM_INFO_CPUID_FAULT;
break;
case MSR_IA32_UCODE_REV:
- rdmsrl_safe(index, data);
+ rdmsrq_safe(index, data);
break;
default:
return kvm_x86_call(get_feature_msr)(index, data);
@@ -1695,20 +1719,20 @@ static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS))
+ if (efer & EFER_AUTOIBRS && !guest_cpu_cap_has(vcpu, X86_FEATURE_AUTOIBRS))
return false;
- if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
+ if (efer & EFER_FFXSR && !guest_cpu_cap_has(vcpu, X86_FEATURE_FXSR_OPT))
return false;
- if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ if (efer & EFER_SVME && !guest_cpu_cap_has(vcpu, X86_FEATURE_SVM))
return false;
if (efer & (EFER_LME | EFER_LMA) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
return false;
- if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
+ if (efer & EFER_NX && !guest_cpu_cap_has(vcpu, X86_FEATURE_NX))
return false;
return true;
@@ -1721,7 +1745,7 @@ bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
return __kvm_valid_efer(vcpu, efer);
}
-EXPORT_SYMBOL_GPL(kvm_valid_efer);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_valid_efer);
static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
@@ -1764,7 +1788,7 @@ void kvm_enable_efer_bits(u64 mask)
{
efer_reserved_bits &= ~mask;
}
-EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_efer_bits);
bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
{
@@ -1807,7 +1831,7 @@ out:
return allowed;
}
-EXPORT_SYMBOL_GPL(kvm_msr_allowed);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_msr_allowed);
/*
* Write @data into the MSR specified by @index. Select MSR specific fault
@@ -1850,8 +1874,8 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
return 1;
if (!host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID))
return 1;
/*
@@ -1868,6 +1892,44 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
data = (u32)data;
break;
+ case MSR_IA32_U_CET:
+ case MSR_IA32_S_CET:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT))
+ return KVM_MSR_RET_UNSUPPORTED;
+ if (!kvm_is_valid_u_s_cet(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ if (!host_initiated)
+ return 1;
+ fallthrough;
+ /*
+ * Note that the MSR emulation here is flawed when a vCPU
+ * doesn't support the Intel 64 architecture. The expected
+ * architectural behavior in this case is that the upper 32
+ * bits do not exist and should always read '0'. However,
+ * because the actual hardware on which the virtual CPU is
+ * running does support Intel 64, XRSTORS/XSAVES in the
+ * guest could observe behavior that violates the
+ * architecture. Intercepting XRSTORS/XSAVES for this
+ * special case isn't deemed worthwhile.
+ */
+ case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
+ return KVM_MSR_RET_UNSUPPORTED;
+ /*
+ * MSR_IA32_INT_SSP_TAB is not present on processors that do
+ * not support Intel 64 architecture.
+ */
+ if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM))
+ return KVM_MSR_RET_UNSUPPORTED;
+ if (is_noncanonical_msr_address(data, vcpu))
+ return 1;
+ /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */
+ if (index != MSR_IA32_INT_SSP_TAB && !IS_ALIGNED(data, 4))
+ return 1;
+ break;
}
msr.data = data;
@@ -1896,8 +1958,8 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
- bool host_initiated)
+static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
{
struct msr_data msr;
int ret;
@@ -1908,10 +1970,24 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
return 1;
if (!host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID))
return 1;
break;
+ case MSR_IA32_U_CET:
+ case MSR_IA32_S_CET:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT))
+ return KVM_MSR_RET_UNSUPPORTED;
+ break;
+ case MSR_KVM_INTERNAL_GUEST_SSP:
+ if (!host_initiated)
+ return 1;
+ fallthrough;
+ case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB:
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
+ return KVM_MSR_RET_UNSUPPORTED;
+ break;
}
msr.index = index;
@@ -1923,6 +1999,16 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
return ret;
}
+int kvm_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+ return __kvm_set_msr(vcpu, index, data, true);
+}
+
+int kvm_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+ return __kvm_get_msr(vcpu, index, data, true);
+}
+
static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
u32 index, u64 *data, bool host_initiated)
{
@@ -1930,33 +2016,36 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
__kvm_get_msr);
}
-int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+int __kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
- if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
- return KVM_MSR_RET_FILTERED;
return kvm_get_msr_ignored_check(vcpu, index, data, false);
}
-EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_read);
-int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
+int __kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
- if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
- return KVM_MSR_RET_FILTERED;
return kvm_set_msr_ignored_check(vcpu, index, data, false);
}
-EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_emulate_msr_write);
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+int kvm_emulate_msr_read(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
- return kvm_get_msr_ignored_check(vcpu, index, data, false);
+ if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
+ return KVM_MSR_RET_FILTERED;
+
+ return __kvm_emulate_msr_read(vcpu, index, data);
}
-EXPORT_SYMBOL_GPL(kvm_get_msr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_read);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+int kvm_emulate_msr_write(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
- return kvm_set_msr_ignored_check(vcpu, index, data, false);
+ if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
+ return KVM_MSR_RET_FILTERED;
+
+ return __kvm_emulate_msr_write(vcpu, index, data);
}
-EXPORT_SYMBOL_GPL(kvm_set_msr);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_msr_write);
+
static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
{
@@ -1988,6 +2077,15 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
return complete_fast_msr_access(vcpu);
}
+static int complete_fast_rdmsr_imm(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->run->msr.error)
+ kvm_register_write(vcpu, vcpu->arch.cui_rdmsr_imm_reg,
+ vcpu->run->msr.data);
+
+ return complete_fast_msr_access(vcpu);
+}
+
static u64 kvm_msr_reason(int r)
{
switch (r) {
@@ -2022,55 +2120,82 @@ static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
return 1;
}
-int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
+static int __kvm_emulate_rdmsr(struct kvm_vcpu *vcpu, u32 msr, int reg,
+ int (*complete_rdmsr)(struct kvm_vcpu *))
{
- u32 ecx = kvm_rcx_read(vcpu);
u64 data;
int r;
- r = kvm_get_msr_with_filter(vcpu, ecx, &data);
+ r = kvm_emulate_msr_read(vcpu, msr, &data);
if (!r) {
- trace_kvm_msr_read(ecx, data);
+ trace_kvm_msr_read(msr, data);
- kvm_rax_write(vcpu, data & -1u);
- kvm_rdx_write(vcpu, (data >> 32) & -1u);
+ if (reg < 0) {
+ kvm_rax_write(vcpu, data & -1u);
+ kvm_rdx_write(vcpu, (data >> 32) & -1u);
+ } else {
+ kvm_register_write(vcpu, reg, data);
+ }
} else {
/* MSR read failed? See if we should ask user space */
- if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
- complete_fast_rdmsr, r))
+ if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_RDMSR, 0,
+ complete_rdmsr, r))
return 0;
- trace_kvm_msr_read_ex(ecx);
+ trace_kvm_msr_read_ex(msr);
}
return kvm_x86_call(complete_emulated_msr)(vcpu, r);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
-int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
{
- u32 ecx = kvm_rcx_read(vcpu);
- u64 data = kvm_read_edx_eax(vcpu);
- int r;
+ return __kvm_emulate_rdmsr(vcpu, kvm_rcx_read(vcpu), -1,
+ complete_fast_rdmsr);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr);
+
+int kvm_emulate_rdmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
+{
+ vcpu->arch.cui_rdmsr_imm_reg = reg;
- r = kvm_set_msr_with_filter(vcpu, ecx, data);
+ return __kvm_emulate_rdmsr(vcpu, msr, reg, complete_fast_rdmsr_imm);
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_rdmsr_imm);
+
+static int __kvm_emulate_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int r;
+ r = kvm_emulate_msr_write(vcpu, msr, data);
if (!r) {
- trace_kvm_msr_write(ecx, data);
+ trace_kvm_msr_write(msr, data);
} else {
/* MSR write failed? See if we should ask user space */
- if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
+ if (kvm_msr_user_space(vcpu, msr, KVM_EXIT_X86_WRMSR, data,
complete_fast_msr_access, r))
return 0;
/* Signal all other negative errors to userspace */
if (r < 0)
return r;
- trace_kvm_msr_write_ex(ecx, data);
+ trace_kvm_msr_write_ex(msr, data);
}
return kvm_x86_call(complete_emulated_msr)(vcpu, r);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
+{
+ return __kvm_emulate_wrmsr(vcpu, kvm_rcx_read(vcpu),
+ kvm_read_edx_eax(vcpu));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr);
+
+int kvm_emulate_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
+{
+ return __kvm_emulate_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wrmsr_imm);
int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
{
@@ -2082,22 +2207,41 @@ int kvm_emulate_invd(struct kvm_vcpu *vcpu)
/* Treat an INVD instruction as a NOP and just skip it. */
return kvm_emulate_as_nop(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_invd);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_invd);
+
+fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_emulate_invd(vcpu))
+ return EXIT_FASTPATH_EXIT_USERSPACE;
+
+ return EXIT_FASTPATH_REENTER_GUEST;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_invd);
int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
{
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
-EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_invalid_op);
static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
{
- if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
+ bool enabled;
+
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS))
+ goto emulate_as_nop;
+
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT))
+ enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_MWAIT);
+ else
+ enabled = vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT;
+
+ if (!enabled)
return kvm_handle_invalid_op(vcpu);
+emulate_as_nop:
pr_warn_once("%s instruction emulated as NOP!\n", insn);
return kvm_emulate_as_nop(vcpu);
}
@@ -2105,13 +2249,13 @@ int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
{
return kvm_emulate_monitor_mwait(vcpu, "MWAIT");
}
-EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_mwait);
int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
{
return kvm_emulate_monitor_mwait(vcpu, "MONITOR");
}
-EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_monitor);
static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
{
@@ -2121,74 +2265,41 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending();
}
-/*
- * The fast path for frequent and performance sensitive wrmsr emulation,
- * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
- * the latency of virtual IPI by avoiding the expensive bits of transitioning
- * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
- * other cases which must be called after interrupts are enabled on the host.
- */
-static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
+static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
- if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
- return 1;
-
- if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
- ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
- ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
- ((u32)(data >> 32) != X2APIC_BROADCAST))
- return kvm_x2apic_icr_write(vcpu->arch.apic, data);
-
- return 1;
-}
-
-static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
-{
- if (!kvm_can_use_hv_timer(vcpu))
- return 1;
-
- kvm_set_lapic_tscdeadline_msr(vcpu, data);
- return 0;
-}
-
-fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
-{
- u32 msr = kvm_rcx_read(vcpu);
- u64 data;
- fastpath_t ret;
- bool handled;
-
- kvm_vcpu_srcu_read_lock(vcpu);
-
switch (msr) {
case APIC_BASE_MSR + (APIC_ICR >> 4):
- data = kvm_read_edx_eax(vcpu);
- handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) ||
+ kvm_x2apic_icr_write_fast(vcpu->arch.apic, data))
+ return EXIT_FASTPATH_NONE;
break;
case MSR_IA32_TSC_DEADLINE:
- data = kvm_read_edx_eax(vcpu);
- handled = !handle_fastpath_set_tscdeadline(vcpu, data);
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
break;
default:
- handled = false;
- break;
+ return EXIT_FASTPATH_NONE;
}
- if (handled) {
- if (!kvm_skip_emulated_instruction(vcpu))
- ret = EXIT_FASTPATH_EXIT_USERSPACE;
- else
- ret = EXIT_FASTPATH_REENTER_GUEST;
- trace_kvm_msr_write(msr, data);
- } else {
- ret = EXIT_FASTPATH_NONE;
- }
+ trace_kvm_msr_write(msr, data);
- kvm_vcpu_srcu_read_unlock(vcpu);
+ if (!kvm_skip_emulated_instruction(vcpu))
+ return EXIT_FASTPATH_EXIT_USERSPACE;
- return ret;
+ return EXIT_FASTPATH_REENTER_GUEST;
+}
+
+fastpath_t handle_fastpath_wrmsr(struct kvm_vcpu *vcpu)
+{
+ return __handle_fastpath_wrmsr(vcpu, kvm_rcx_read(vcpu),
+ kvm_read_edx_eax(vcpu));
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr);
+
+fastpath_t handle_fastpath_wrmsr_imm(struct kvm_vcpu *vcpu, u32 msr, int reg)
+{
+ return __handle_fastpath_wrmsr(vcpu, msr, kvm_register_read(vcpu, reg));
}
-EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_wrmsr_imm);
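For context (not part of this patch): a hedged sketch of how a caller might act on the fastpath_t values returned by handle_fastpath_wrmsr() above. The example_handle_wrmsr_exit() wrapper and its 1/0 return convention are assumptions for illustration; only the EXIT_FASTPATH_* constants, handle_fastpath_wrmsr() and kvm_emulate_wrmsr() come from this diff.

/* Illustrative sketch only -- not part of this diff. */
static int example_handle_wrmsr_exit(struct kvm_vcpu *vcpu)
{
	switch (handle_fastpath_wrmsr(vcpu)) {
	case EXIT_FASTPATH_REENTER_GUEST:
		return 1;	/* handled in the fastpath, resume the guest */
	case EXIT_FASTPATH_EXIT_USERSPACE:
		return 0;	/* instruction skip requires a userspace exit */
	case EXIT_FASTPATH_NONE:
	default:
		return kvm_emulate_wrmsr(vcpu);	/* fall back to full emulation */
	}
}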
/*
* Adapt set_msr() to msr_io()'s calling convention
@@ -2554,7 +2665,7 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
return vcpu->arch.l1_tsc_offset +
kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
}
-EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_l1_tsc);
u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
{
@@ -2569,7 +2680,7 @@ u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
nested_offset += l2_offset;
return nested_offset;
}
-EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_offset);
u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
{
@@ -2579,10 +2690,13 @@ u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
return l1_multiplier;
}
-EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_calc_nested_tsc_multiplier);
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
{
+ if (vcpu->arch.guest_tsc_protected)
+ return;
+
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
vcpu->arch.l1_tsc_offset,
l1_offset);
@@ -2640,12 +2754,18 @@ static inline bool kvm_check_tsc_unstable(void)
* participates in.
*/
static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
- u64 ns, bool matched)
+ u64 ns, bool matched, bool user_set_tsc)
{
struct kvm *kvm = vcpu->kvm;
lockdep_assert_held(&kvm->arch.tsc_write_lock);
+ if (vcpu->arch.guest_tsc_protected)
+ return;
+
+ if (user_set_tsc)
+ vcpu->kvm->arch.user_set_tsc = true;
+
/*
* We also track the most recent recorded KHZ, write and time to
* allow the matching interval to be extended at each write.
@@ -2731,8 +2851,6 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
}
}
- if (user_value)
- kvm->arch.user_set_tsc = true;
/*
* For a reliable TSC, we can match TSC offsets, and for an unstable
@@ -2752,7 +2870,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value)
matched = true;
}
- __kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
+ __kvm_synchronize_tsc(vcpu, offset, data, ns, matched, !!user_value);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
}
@@ -3130,15 +3248,17 @@ u64 get_kvmclock_ns(struct kvm *kvm)
return data.clock;
}
-static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
+static void kvm_setup_guest_pvclock(struct pvclock_vcpu_time_info *ref_hv_clock,
+ struct kvm_vcpu *vcpu,
struct gfn_to_pfn_cache *gpc,
- unsigned int offset,
- bool force_tsc_unstable)
+ unsigned int offset)
{
- struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info *guest_hv_clock;
+ struct pvclock_vcpu_time_info hv_clock;
unsigned long flags;
+ memcpy(&hv_clock, ref_hv_clock, sizeof(hv_clock));
+
read_lock_irqsave(&gpc->lock, flags);
while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
read_unlock_irqrestore(&gpc->lock, flags);
@@ -3158,52 +3278,34 @@ static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
* it is consistent.
*/
- guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
+ guest_hv_clock->version = hv_clock.version = (guest_hv_clock->version + 1) | 1;
smp_wmb();
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
- vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
-
- if (vcpu->pvclock_set_guest_stopped_request) {
- vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
- vcpu->pvclock_set_guest_stopped_request = false;
- }
+ hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
- memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
-
- if (force_tsc_unstable)
- guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT;
+ memcpy(guest_hv_clock, &hv_clock, sizeof(*guest_hv_clock));
smp_wmb();
- guest_hv_clock->version = ++vcpu->hv_clock.version;
+ guest_hv_clock->version = ++hv_clock.version;
kvm_gpc_mark_dirty_in_slot(gpc);
read_unlock_irqrestore(&gpc->lock, flags);
- trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+ trace_kvm_pvclock_update(vcpu->vcpu_id, &hv_clock);
}
-static int kvm_guest_time_update(struct kvm_vcpu *v)
+int kvm_guest_time_update(struct kvm_vcpu *v)
{
+ struct pvclock_vcpu_time_info hv_clock = {};
unsigned long flags, tgt_tsc_khz;
unsigned seq;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
u64 tsc_timestamp, host_tsc;
- u8 pvclock_flags;
bool use_master_clock;
-#ifdef CONFIG_KVM_XEN
- /*
- * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
- * explicitly told to use TSC as its clocksource Xen will not set this bit.
- * This default behaviour led to bugs in some guest kernels which cause
- * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
- */
- bool xen_pvclock_tsc_unstable =
- ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
-#endif
kernel_ns = 0;
host_tsc = 0;
@@ -3258,41 +3360,65 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* With all the info we got, fill in the values */
- if (kvm_caps.has_tsc_control)
+ if (kvm_caps.has_tsc_control) {
tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
v->arch.l1_tsc_scaling_ratio);
+ tgt_tsc_khz = tgt_tsc_khz ? : 1;
+ }
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
- &vcpu->hv_clock.tsc_shift,
- &vcpu->hv_clock.tsc_to_system_mul);
+ &vcpu->pvclock_tsc_shift,
+ &vcpu->pvclock_tsc_mul);
vcpu->hw_tsc_khz = tgt_tsc_khz;
- kvm_xen_update_tsc_info(v);
}
- vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
- vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+ hv_clock.tsc_shift = vcpu->pvclock_tsc_shift;
+ hv_clock.tsc_to_system_mul = vcpu->pvclock_tsc_mul;
+ hv_clock.tsc_timestamp = tsc_timestamp;
+ hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
/* If the host uses TSC clocksource, then it is stable */
- pvclock_flags = 0;
+ hv_clock.flags = 0;
if (use_master_clock)
- pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+ hv_clock.flags |= PVCLOCK_TSC_STABLE_BIT;
- vcpu->hv_clock.flags = pvclock_flags;
+ if (vcpu->pv_time.active) {
+ /*
+ * GUEST_STOPPED is only supported by kvmclock, and KVM's
+ * historic behavior is to only process the request if kvmclock
+ * is active/enabled.
+ */
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+ kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->pv_time, 0);
+
+ hv_clock.flags &= ~PVCLOCK_GUEST_STOPPED;
+ }
+
+ kvm_hv_setup_tsc_page(v->kvm, &hv_clock);
- if (vcpu->pv_time.active)
- kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
#ifdef CONFIG_KVM_XEN
+ /*
+ * For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT as unless
+ * explicitly told to use TSC as its clocksource Xen will not set this bit.
+ * This default behaviour led to bugs in some guest kernels which cause
+ * problems if they observe PVCLOCK_TSC_STABLE_BIT in the pvclock flags.
+ *
+ * Note! Clear TSC_STABLE only for Xen clocks, i.e. the order matters!
+ */
+ if (ka->xen.hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
+ hv_clock.flags &= ~PVCLOCK_TSC_STABLE_BIT;
+
if (vcpu->xen.vcpu_info_cache.active)
- kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
- offsetof(struct compat_vcpu_info, time),
- xen_pvclock_tsc_unstable);
+ kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_info_cache,
+ offsetof(struct compat_vcpu_info, time));
if (vcpu->xen.vcpu_time_info_cache.active)
- kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
- xen_pvclock_tsc_unstable);
+ kvm_setup_guest_pvclock(&hv_clock, v, &vcpu->xen.vcpu_time_info_cache, 0);
#endif
- kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -3375,27 +3501,17 @@ uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm)
/*
* kvmclock updates which are isolated to a given vcpu, such as
* vcpu->cpu migration, should not allow system_timestamp from
- * the rest of the vcpus to remain static. Otherwise ntp frequency
- * correction applies to one vcpu's system_timestamp but not
- * the others.
+ * the rest of the vcpus to remain static.
*
* So in those cases, request a kvmclock update for all vcpus.
- * We need to rate-limit these requests though, as they can
- * considerably slow guests that have a large number of vcpus.
- * The time for a remote vcpu to update its kvmclock is bound
- * by the delay we use to rate-limit the updates.
+ * The worst case for a remote vcpu to update its kvmclock
+ * is then bounded by maximum nohz sleep latency.
*/
-
-#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
-
-static void kvmclock_update_fn(struct work_struct *work)
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
{
unsigned long i;
- struct delayed_work *dwork = to_delayed_work(work);
- struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
- kvmclock_update_work);
- struct kvm *kvm = container_of(ka, struct kvm, arch);
struct kvm_vcpu *vcpu;
+ struct kvm *kvm = v->kvm;
kvm_for_each_vcpu(i, vcpu, kvm) {
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -3403,29 +3519,6 @@ static void kvmclock_update_fn(struct work_struct *work)
}
}
-static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
-{
- struct kvm *kvm = v->kvm;
-
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
- schedule_delayed_work(&kvm->arch.kvmclock_update_work,
- KVMCLOCK_UPDATE_DELAY);
-}
-
-#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
-
-static void kvmclock_sync_fn(struct work_struct *work)
-{
- struct delayed_work *dwork = to_delayed_work(work);
- struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
- kvmclock_sync_work);
- struct kvm *kvm = container_of(ka, struct kvm, arch);
-
- schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
- schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
- KVMCLOCK_SYNC_PERIOD);
-}
-
/* These helpers are safe iff @msr is known to be an MCx bank MSR. */
static bool is_mci_control_msr(u32 msr)
{
@@ -3520,13 +3613,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 0;
}
-static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
-{
- u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
-
- return (vcpu->arch.apf.msr_en_val & mask) == mask;
-}
-
static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
@@ -3558,7 +3644,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
sizeof(u64)))
return 1;
- vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+ vcpu->arch.apf.send_always = (data & KVM_ASYNC_PF_SEND_ALWAYS);
vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
kvm_async_pf_wakeup_all(vcpu);
@@ -3642,7 +3728,7 @@ void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
kvm_vcpu_flush_tlb_guest(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_service_local_tlb_flush_requests);
static void record_steal_time(struct kvm_vcpu *vcpu)
{
@@ -3742,12 +3828,78 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
+/*
+ * Returns true if the MSR in question is managed via XSTATE, i.e. is context
+ * switched with the rest of guest FPU state.
+ *
+ * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS.
+ */
+static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ if (!vcpu)
+ return false;
+
+ switch (msr) {
+ case MSR_IA32_U_CET:
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ||
+ guest_cpu_cap_has(vcpu, X86_FEATURE_IBT);
+ case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+ return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
+ default:
+ return false;
+ }
+}
+
+/*
+ * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an
+ * MSR that is managed via XSTATE. Note, the caller is responsible for doing
+ * the initial FPU load, this helper only ensures that guest state is resident
+ * in hardware (the kernel can load its FPU state in IRQ context).
+ *
+ * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the
+ * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only
+ * consumed when transitioning to lower privilege levels, i.e. are effectively
+ * only consumed by userspace as well.
+ */
+static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu,
+ struct msr_data *msr_info,
+ int access)
+{
+ BUILD_BUG_ON(access != MSR_TYPE_R && access != MSR_TYPE_W);
+
+ KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm);
+ KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm);
+
+ kvm_fpu_get();
+ if (access == MSR_TYPE_R)
+ rdmsrq(msr_info->index, msr_info->data);
+ else
+ wrmsrq(msr_info->index, msr_info->data);
+ kvm_fpu_put();
+}
+
+static void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W);
+}
+
+static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R);
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u32 msr = msr_info->index;
u64 data = msr_info->data;
- if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
+ /*
+ * Do not allow host-initiated writes to trigger the Xen hypercall
+ * page setup; it could incur locking paths which are not expected
+ * if userspace sets the MSR in an unusual location.
+ */
+ if (kvm_xen_is_hypercall_page_msr(vcpu->kvm, msr) &&
+ !msr_info->host_initiated)
return kvm_xen_write_hypercall_page(vcpu, data);
switch (msr) {
@@ -3767,13 +3919,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_ARCH_CAPABILITIES:
if (!msr_info->host_initiated ||
- !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
return KVM_MSR_RET_UNSUPPORTED;
vcpu->arch.arch_capabilities = data;
break;
case MSR_IA32_PERF_CAPABILITIES:
if (!msr_info->host_initiated ||
- !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
return KVM_MSR_RET_UNSUPPORTED;
if (data & ~kvm_caps.supported_perf_cap)
@@ -3797,11 +3949,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if ((!guest_has_pred_cmd_msr(vcpu)))
return 1;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_AMD_IBPB))
reserved_bits |= PRED_CMD_IBPB;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SBPB))
reserved_bits |= PRED_CMD_SBPB;
}
@@ -3817,12 +3969,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!data)
break;
- wrmsrl(MSR_IA32_PRED_CMD, data);
+ wrmsrq(MSR_IA32_PRED_CMD, data);
break;
}
case MSR_IA32_FLUSH_CMD:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D))
return 1;
if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH))
@@ -3830,7 +3982,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!data)
break;
- wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ wrmsrq(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
break;
case MSR_EFER:
return set_efer(vcpu, msr_info);
@@ -3873,7 +4025,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
kvm_set_lapic_tscdeadline_msr(vcpu, data);
break;
case MSR_IA32_TSC_ADJUST:
- if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
+ if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
if (!msr_info->host_initiated) {
s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
adjust_tsc_offset_guest(vcpu, adj);
@@ -3900,10 +4052,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_XMM3))
return 1;
vcpu->arch.ia32_misc_enable_msr = data;
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
} else {
vcpu->arch.ia32_misc_enable_msr = data;
}
@@ -3920,25 +4072,22 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC:
if (msr_info->host_initiated) {
kvm_synchronize_tsc(vcpu, &data);
- } else {
+ } else if (!vcpu->arch.guest_tsc_protected) {
u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
adjust_tsc_offset_guest(vcpu, adj);
vcpu->arch.ia32_tsc_adjust_msr += adj;
}
break;
case MSR_IA32_XSS:
- if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
- return 1;
- /*
- * KVM supports exposing PT to the guest, but does not support
- * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
- * XSAVES/XRSTORS to save/restore PT MSRs.
- */
- if (data & ~kvm_caps.supported_xss)
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return KVM_MSR_RET_UNSUPPORTED;
+
+ if (data & ~vcpu->arch.guest_supported_xss)
return 1;
+ if (vcpu->arch.ia32_xss == data)
+ break;
vcpu->arch.ia32_xss = data;
- kvm_update_cpuid_runtime(vcpu);
+ vcpu->arch.cpuid_dynamic_bits_dirty = true;
break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
@@ -3989,7 +4138,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
return 1;
if (data & 0x1) {
- vcpu->arch.apf.pageready_pending = false;
+ /*
+ * Pairs with the smp_mb__after_atomic() in
+ * kvm_arch_async_page_present_queued().
+ */
+ smp_store_mb(vcpu->arch.apf.pageready_pending, false);
+
kvm_check_async_pf_completion(vcpu);
}
break;
@@ -4077,12 +4231,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
kvm_pr_unimpl_wrmsr(vcpu, msr, data);
break;
case MSR_AMD64_OSVW_ID_LENGTH:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
return 1;
vcpu->arch.osvw.length = data;
break;
case MSR_AMD64_OSVW_STATUS:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
return 1;
vcpu->arch.osvw.status = data;
break;
@@ -4101,7 +4255,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
#ifdef CONFIG_X86_64
case MSR_IA32_XFD:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
return 1;
if (data & ~kvm_guest_supported_xfd(vcpu))
@@ -4111,7 +4265,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_XFD_ERR:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
return 1;
if (data & ~kvm_guest_supported_xfd(vcpu))
@@ -4120,6 +4274,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.guest_fpu.xfd_err = data;
break;
#endif
+ case MSR_IA32_U_CET:
+ case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+ kvm_set_xstate_msr(vcpu, msr_info);
+ break;
default:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
@@ -4128,7 +4286,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_msr_common);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_msr_common);
static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
@@ -4226,12 +4384,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vcpu->arch.microcode_version;
break;
case MSR_IA32_ARCH_CAPABILITIES:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->arch.arch_capabilities;
break;
case MSR_IA32_PERF_CAPABILITIES:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM))
return KVM_MSR_RET_UNSUPPORTED;
msr_info->data = vcpu->arch.perf_capabilities;
break;
@@ -4432,12 +4590,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0xbe702111;
break;
case MSR_AMD64_OSVW_ID_LENGTH:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
return 1;
msr_info->data = vcpu->arch.osvw.length;
break;
case MSR_AMD64_OSVW_STATUS:
- if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_OSVW))
return 1;
msr_info->data = vcpu->arch.osvw.status;
break;
@@ -4456,19 +4614,23 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
#ifdef CONFIG_X86_64
case MSR_IA32_XFD:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
return 1;
msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
break;
case MSR_IA32_XFD_ERR:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
+ !guest_cpu_cap_has(vcpu, X86_FEATURE_XFD))
return 1;
msr_info->data = vcpu->arch.guest_fpu.xfd_err;
break;
#endif
+ case MSR_IA32_U_CET:
+ case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+ kvm_get_xstate_msr(vcpu, msr_info);
+ break;
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
@@ -4477,7 +4639,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_get_msr_common);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_msr_common);
/*
* Read or write a bunch of msrs. All parameters are kernel addresses.
@@ -4489,11 +4651,25 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
int (*do_msr)(struct kvm_vcpu *vcpu,
unsigned index, u64 *data))
{
+ bool fpu_loaded = false;
int i;
- for (i = 0; i < msrs->nmsrs; ++i)
+ for (i = 0; i < msrs->nmsrs; ++i) {
+ /*
+ * If userspace is accessing one or more XSTATE-managed MSRs,
+ * temporarily load the guest's FPU state so that the guest's
+ * MSR values are resident in hardware and thus can be accessed
+ * via RDMSR/WRMSR.
+ */
+ if (!fpu_loaded && is_xstate_managed_msr(vcpu, entries[i].index)) {
+ kvm_load_guest_fpu(vcpu);
+ fpu_loaded = true;
+ }
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
+ }
+ if (fpu_loaded)
+ kvm_put_guest_fpu(vcpu);
return i;
}
@@ -4545,6 +4721,23 @@ static inline bool kvm_can_mwait_in_guest(void)
boot_cpu_has(X86_FEATURE_ARAT);
}
+static u64 kvm_get_allowed_disable_exits(void)
+{
+ u64 r = KVM_X86_DISABLE_EXITS_PAUSE;
+
+ if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+ r |= KVM_X86_DISABLE_EXITS_APERFMPERF;
+
+ if (!mitigate_smt_rsb) {
+ r |= KVM_X86_DISABLE_EXITS_HLT |
+ KVM_X86_DISABLE_EXITS_CSTATE;
+
+ if (kvm_can_mwait_in_guest())
+ r |= KVM_X86_DISABLE_EXITS_MWAIT;
+ }
+ return r;
+}
+
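
For reference, a minimal userspace sketch (not part of the patch) of how the bits returned by kvm_get_allowed_disable_exits() are consumed: the VMM queries KVM_CAP_X86_DISABLE_EXITS via KVM_CHECK_EXTENSION and enables a subset with KVM_ENABLE_CAP. Note that, per the enable-cap handling later in this patch, the cap must be enabled before any vCPU is created; vm_fd is an assumed VM file descriptor.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Sketch: opt the VM out of HLT exits if KVM allows it on this host. */
static int disable_hlt_exits(int vm_fd)
{
	struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_DISABLE_EXITS };
	int allowed = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_DISABLE_EXITS);

	if (allowed < 0 || !(allowed & KVM_X86_DISABLE_EXITS_HLT))
		return -1;

	cap.args[0] = KVM_X86_DISABLE_EXITS_HLT;
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
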
#ifdef CONFIG_KVM_HYPERV
static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid2 __user *cpuid_arg)
@@ -4573,6 +4766,11 @@ static bool kvm_is_vm_type_supported(unsigned long type)
return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
}
+static inline u64 kvm_sync_valid_fields(struct kvm *kvm)
+{
+ return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS;
+}
+
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r = 0;
@@ -4585,17 +4783,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_EXT_CPUID:
case KVM_CAP_EXT_EMUL_CPUID:
case KVM_CAP_CLOCKSOURCE:
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CAP_PIT:
+ case KVM_CAP_PIT2:
+ case KVM_CAP_PIT_STATE2:
+ case KVM_CAP_REINJECT_CONTROL:
+#endif
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
case KVM_CAP_USER_NMI:
- case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_IOEVENTFD_NO_LENGTH:
- case KVM_CAP_PIT2:
- case KVM_CAP_PIT_STATE2:
+
case KVM_CAP_SET_IDENTITY_MAP_ADDR:
case KVM_CAP_VCPU_EVENTS:
#ifdef CONFIG_KVM_HYPERV
@@ -4653,6 +4854,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_MEMORY_FAULT_INFO:
case KVM_CAP_X86_GUEST_MODE:
+ case KVM_CAP_ONE_REG:
r = 1;
break;
case KVM_CAP_PRE_FAULT_MEMORY:
@@ -4681,21 +4883,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
#endif
case KVM_CAP_SYNC_REGS:
- r = KVM_SYNC_X86_VALID_FIELDS;
+ r = kvm_sync_valid_fields(kvm);
break;
case KVM_CAP_ADJUST_CLOCK:
r = KVM_CLOCK_VALID_FLAGS;
break;
case KVM_CAP_X86_DISABLE_EXITS:
- r = KVM_X86_DISABLE_EXITS_PAUSE;
-
- if (!mitigate_smt_rsb) {
- r |= KVM_X86_DISABLE_EXITS_HLT |
- KVM_X86_DISABLE_EXITS_CSTATE;
-
- if (kvm_can_mwait_in_guest())
- r |= KVM_X86_DISABLE_EXITS_MWAIT;
- }
+ r = kvm_get_allowed_disable_exits();
break;
case KVM_CAP_X86_SMM:
if (!IS_ENABLED(CONFIG_KVM_SMM))
@@ -4716,6 +4910,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
+ if (kvm)
+ r = kvm->max_vcpus;
break;
case KVM_CAP_MAX_VCPU_ID:
r = KVM_MAX_VCPU_IDS;
@@ -4771,7 +4967,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
break;
case KVM_CAP_DISABLE_QUIRKS2:
- r = KVM_X86_VALID_QUIRKS;
+ r = kvm_caps.supported_quirks;
break;
case KVM_CAP_X86_NOTIFY_VMEXIT:
r = kvm_caps.has_notify_vmexit;
@@ -4942,21 +5138,18 @@ out:
return r;
}
-static void wbinvd_ipi(void *garbage)
-{
- wbinvd();
-}
-
static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
return kvm_arch_has_noncoherent_dma(vcpu->kvm);
}
+static DEFINE_PER_CPU(struct kvm_vcpu *, last_vcpu);
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- vcpu->arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
if (vcpu->scheduled_out && pmu->version && pmu->event_count) {
pmu->need_cleanup = true;
@@ -4968,12 +5161,24 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (kvm_x86_call(has_wbinvd_exit)())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
- smp_call_function_single(vcpu->cpu,
- wbinvd_ipi, NULL, 1);
+ wbinvd_on_cpu(vcpu->cpu);
}
kvm_x86_call(vcpu_load)(vcpu, cpu);
+ if (vcpu != per_cpu(last_vcpu, cpu)) {
+ /*
+ * Flush the branch predictor when switching vCPUs on the same
+ * physical CPU, as each vCPU needs its own branch prediction
+ * domain. No IBPB is needed when switching between L1 and L2
+ * on the same vCPU unless IBRS is advertised to the vCPU; that
+ * is handled on the nested VM-Exit path.
+ */
+ if (static_branch_likely(&switch_vcpu_ibpb))
+ indirect_branch_prediction_barrier();
+ per_cpu(last_vcpu, cpu) = vcpu;
+ }
+
/* Save host pkru register if supported */
vcpu->arch.host_pkru = read_pkru();
@@ -4994,7 +5199,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
u64 offset = kvm_compute_l1_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_vcpu_write_tsc_offset(vcpu, offset);
- vcpu->arch.tsc_catchup = 1;
+ if (!vcpu->arch.guest_tsc_protected)
+ vcpu->arch.tsc_catchup = 1;
}
if (kvm_lapic_hv_timer_in_use(vcpu))
@@ -5093,6 +5299,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
+ if (vcpu->arch.apic->guest_apic_protected)
+ return -EINVAL;
+
kvm_x86_call(sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
@@ -5103,6 +5312,9 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
{
int r;
+ if (vcpu->arch.apic->guest_apic_protected)
+ return -EINVAL;
+
r = kvm_apic_set_state(vcpu, s);
if (r)
return r;
@@ -5424,12 +5636,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
(events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
return -EINVAL;
- /* INITs are latched while in SMM */
- if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
- (events->smi.smm || events->smi.pending) &&
- vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
- return -EINVAL;
-
process_nmi(vcpu);
/*
@@ -5733,8 +5939,7 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
ns = get_kvmclock_base_ns();
- kvm->arch.user_set_tsc = true;
- __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
+ __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched, true);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
r = 0;
@@ -5822,15 +6027,140 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
vcpu->arch.pv_cpuid.enforce = cap->args[0];
- if (vcpu->arch.pv_cpuid.enforce)
- kvm_update_pv_runtime(vcpu);
-
return 0;
default:
return -EINVAL;
}
}
+struct kvm_x86_reg_id {
+ __u32 index;
+ __u8 type;
+ __u8 rsvd1;
+ __u8 rsvd2:4;
+ __u8 size:4;
+ __u8 x86;
+};
+
+static int kvm_translate_kvm_reg(struct kvm_vcpu *vcpu,
+ struct kvm_x86_reg_id *reg)
+{
+ switch (reg->index) {
+ case KVM_REG_GUEST_SSP:
+ /*
+ * FIXME: If host-initiated accesses are ever exempted from
+ * ignore_msrs (in kvm_do_msr_access()), drop this manual check
+ * and rely on KVM's standard checks to reject accesses to regs
+ * that don't exist.
+ */
+ if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK))
+ return -EINVAL;
+
+ reg->type = KVM_X86_REG_TYPE_MSR;
+ reg->index = MSR_KVM_INTERNAL_GUEST_SSP;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val)
+{
+ u64 val;
+
+ if (do_get_msr(vcpu, msr, &val))
+ return -EINVAL;
+
+ if (put_user(val, user_val))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_set_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val)
+{
+ u64 val;
+
+ if (get_user(val, user_val))
+ return -EFAULT;
+
+ if (do_set_msr(vcpu, msr, &val))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl,
+ void __user *argp)
+{
+ struct kvm_one_reg one_reg;
+ struct kvm_x86_reg_id *reg;
+ u64 __user *user_val;
+ bool load_fpu;
+ int r;
+
+ if (copy_from_user(&one_reg, argp, sizeof(one_reg)))
+ return -EFAULT;
+
+ if ((one_reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86)
+ return -EINVAL;
+
+ reg = (struct kvm_x86_reg_id *)&one_reg.id;
+ if (reg->rsvd1 || reg->rsvd2)
+ return -EINVAL;
+
+ if (reg->type == KVM_X86_REG_TYPE_KVM) {
+ r = kvm_translate_kvm_reg(vcpu, reg);
+ if (r)
+ return r;
+ }
+
+ if (reg->type != KVM_X86_REG_TYPE_MSR)
+ return -EINVAL;
+
+ if ((one_reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64)
+ return -EINVAL;
+
+ guard(srcu)(&vcpu->kvm->srcu);
+
+ load_fpu = is_xstate_managed_msr(vcpu, reg->index);
+ if (load_fpu)
+ kvm_load_guest_fpu(vcpu);
+
+ user_val = u64_to_user_ptr(one_reg.addr);
+ if (ioctl == KVM_GET_ONE_REG)
+ r = kvm_get_one_msr(vcpu, reg->index, user_val);
+ else
+ r = kvm_set_one_msr(vcpu, reg->index, user_val);
+
+ if (load_fpu)
+ kvm_put_guest_fpu(vcpu);
+ return r;
+}
+
+static int kvm_get_reg_list(struct kvm_vcpu *vcpu,
+ struct kvm_reg_list __user *user_list)
+{
+ u64 nr_regs = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ? 1 : 0;
+ u64 user_nr_regs;
+
+ if (get_user(user_nr_regs, &user_list->n))
+ return -EFAULT;
+
+ if (put_user(nr_regs, &user_list->n))
+ return -EFAULT;
+
+ if (user_nr_regs < nr_regs)
+ return -E2BIG;
+
+ if (nr_regs &&
+ put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0]))
+ return -EFAULT;
+
+ return 0;
+}
+
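
A minimal userspace sketch (not part of the patch) of the new x86 KVM_GET_ONE_REG path added above, reading the guest shadow-stack pointer. It assumes the KVM_X86_REG_KVM() and KVM_REG_GUEST_SSP UAPI definitions introduced elsewhere in this series; vcpu_fd is an assumed vCPU file descriptor.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

/* Sketch: fetch the guest SSP; rejected if SHSTK isn't exposed to the vCPU. */
static int read_guest_ssp(int vcpu_fd, uint64_t *ssp)
{
	struct kvm_one_reg reg = {
		.id   = KVM_X86_REG_KVM(KVM_REG_GUEST_SSP),
		.addr = (uint64_t)(unsigned long)ssp,
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}
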
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5947,6 +6277,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
+ case KVM_GET_ONE_REG:
+ case KVM_SET_ONE_REG:
+ r = kvm_get_set_one_reg(vcpu, ioctl, argp);
+ break;
+ case KVM_GET_REG_LIST:
+ r = kvm_get_reg_list(vcpu, argp);
+ break;
case KVM_TPR_ACCESS_REPORTING: {
struct kvm_tpr_access_ctl tac;
@@ -6127,6 +6464,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
u32 user_tsc_khz;
r = -EINVAL;
+
+ if (vcpu->arch.guest_tsc_protected)
+ goto out;
+
user_tsc_khz = (u32)arg;
if (kvm_caps.has_tsc_control &&
@@ -6284,6 +6625,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_SET_DEVICE_ATTR:
r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
break;
+ case KVM_MEMORY_ENCRYPT_OP:
+ r = -ENOTTY;
+ if (!kvm_x86_ops.vcpu_mem_enc_ioctl)
+ goto out;
+ r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp);
+ break;
default:
r = -EINVAL;
}
@@ -6330,135 +6677,6 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- memcpy(&chip->chip.pic, &pic->pics[0],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- memcpy(&chip->chip.pic, &pic->pics[1],
- sizeof(struct kvm_pic_state));
- break;
- case KVM_IRQCHIP_IOAPIC:
- kvm_get_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- return r;
-}
-
-static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
-{
- struct kvm_pic *pic = kvm->arch.vpic;
- int r;
-
- r = 0;
- switch (chip->chip_id) {
- case KVM_IRQCHIP_PIC_MASTER:
- spin_lock(&pic->lock);
- memcpy(&pic->pics[0], &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic->lock);
- break;
- case KVM_IRQCHIP_PIC_SLAVE:
- spin_lock(&pic->lock);
- memcpy(&pic->pics[1], &chip->chip.pic,
- sizeof(struct kvm_pic_state));
- spin_unlock(&pic->lock);
- break;
- case KVM_IRQCHIP_IOAPIC:
- kvm_set_ioapic(kvm, &chip->chip.ioapic);
- break;
- default:
- r = -EINVAL;
- break;
- }
- kvm_pic_update_irq(pic);
- return r;
-}
-
-static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
-
- BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
-
- mutex_lock(&kps->lock);
- memcpy(ps, &kps->channels, sizeof(*ps));
- mutex_unlock(&kps->lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
-{
- int i;
- struct kvm_pit *pit = kvm->arch.vpit;
-
- mutex_lock(&pit->pit_state.lock);
- memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
- for (i = 0; i < 3; i++)
- kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
- mutex_unlock(&pit->pit_state.lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- mutex_lock(&kvm->arch.vpit->pit_state.lock);
- memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
- sizeof(ps->channels));
- ps->flags = kvm->arch.vpit->pit_state.flags;
- mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- memset(&ps->reserved, 0, sizeof(ps->reserved));
- return 0;
-}
-
-static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
-{
- int start = 0;
- int i;
- u32 prev_legacy, cur_legacy;
- struct kvm_pit *pit = kvm->arch.vpit;
-
- mutex_lock(&pit->pit_state.lock);
- prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
- cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
- if (!prev_legacy && cur_legacy)
- start = 1;
- memcpy(&pit->pit_state.channels, &ps->channels,
- sizeof(pit->pit_state.channels));
- pit->pit_state.flags = ps->flags;
- for (i = 0; i < 3; i++)
- kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
- start && i == 0);
- mutex_unlock(&pit->pit_state.lock);
- return 0;
-}
-
-static int kvm_vm_ioctl_reinject(struct kvm *kvm,
- struct kvm_reinject_control *control)
-{
- struct kvm_pit *pit = kvm->arch.vpit;
-
- /* pit->pit_state.lock was overloaded to prevent userspace from getting
- * an inconsistent state after running multiple KVM_REINJECT_CONTROL
- * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
- */
- mutex_lock(&pit->pit_state.lock);
- kvm_pit_set_reinject(pit, control->pit_reinject);
- mutex_unlock(&pit->pit_state.lock);
-
- return 0;
-}
-
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
@@ -6471,25 +6689,13 @@ void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
struct kvm_vcpu *vcpu;
unsigned long i;
- if (!kvm_x86_ops.cpu_dirty_log_size)
+ if (!kvm->arch.cpu_dirty_log_size)
return;
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vcpu_kick(vcpu);
}
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
- bool line_status)
-{
- if (!irqchip_in_kernel(kvm))
- return -ENXIO;
-
- irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event->irq, irq_event->level,
- line_status);
- return 0;
-}
-
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
@@ -6501,11 +6707,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
switch (cap->cap) {
case KVM_CAP_DISABLE_QUIRKS2:
r = -EINVAL;
- if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
+ if (cap->args[0] & ~kvm_caps.supported_quirks)
break;
fallthrough;
case KVM_CAP_DISABLE_QUIRKS:
- kvm->arch.disabled_quirks = cap->args[0];
+ kvm->arch.disabled_quirks |= cap->args[0] & kvm_caps.supported_quirks;
r = 0;
break;
case KVM_CAP_SPLIT_IRQCHIP: {
@@ -6542,30 +6748,26 @@ split_irqchip_unlock:
break;
case KVM_CAP_X86_DISABLE_EXITS:
r = -EINVAL;
- if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
+ if (cap->args[0] & ~kvm_get_allowed_disable_exits())
break;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
- kvm->arch.pause_in_guest = true;
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus)
+ goto disable_exits_unlock;
#define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \
"KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests."
- if (!mitigate_smt_rsb) {
- if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() &&
- (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE))
- pr_warn_once(SMT_RSB_MSG);
-
- if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
- kvm_can_mwait_in_guest())
- kvm->arch.mwait_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
- kvm->arch.hlt_in_guest = true;
- if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
- kvm->arch.cstate_in_guest = true;
- }
+ if (!mitigate_smt_rsb && boot_cpu_has_bug(X86_BUG_SMT_RSB) &&
+ cpu_smt_possible() &&
+ (cap->args[0] & ~(KVM_X86_DISABLE_EXITS_PAUSE |
+ KVM_X86_DISABLE_EXITS_APERFMPERF)))
+ pr_warn_once(SMT_RSB_MSG);
+ kvm_disable_exits(kvm, cap->args[0]);
r = 0;
+disable_exits_unlock:
+ mutex_unlock(&kvm->lock);
break;
case KVM_CAP_MSR_PLATFORM_INFO:
kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
@@ -6848,7 +7050,11 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
kvm_free_msr_filter(old_filter);
- kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
+ /*
+ * Recalc MSR intercepts as userspace may want to intercept accesses to
+ * MSRs that KVM would otherwise pass through to the guest.
+ */
+ kvm_make_all_cpus_request(kvm, KVM_REQ_RECALC_INTERCEPTS);
return 0;
}
@@ -6914,23 +7120,15 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
unsigned long i;
- int ret = 0;
-
- mutex_lock(&kvm->lock);
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!vcpu->arch.pv_time.active)
- continue;
- ret = kvm_set_guest_paused(vcpu);
- if (ret) {
- kvm_err("Failed to pause guest VCPU%d: %d\n",
- vcpu->vcpu_id, ret);
- break;
- }
- }
- mutex_unlock(&kvm->lock);
+ /*
+ * Ignore the return value; marking the guest paused only "fails" if the
+ * vCPU isn't using kvmclock, and continuing on is correct and desirable.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ (void)kvm_set_guest_paused(vcpu);
- return ret ? NOTIFY_BAD : NOTIFY_DONE;
+ return NOTIFY_DONE;
}
int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
@@ -7002,14 +7200,29 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
return 0;
}
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ struct kvm_vcpu *vcpu = filp->private_data;
+ void __user *argp = (void __user *)arg;
+
+ if (ioctl == KVM_MEMORY_ENCRYPT_OP &&
+ kvm_x86_ops.vcpu_mem_enc_unlocked_ioctl)
+ return kvm_x86_call(vcpu_mem_enc_unlocked_ioctl)(vcpu, argp);
+
+ return -ENOIOCTLCMD;
+}
+
int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r = -ENOTTY;
+
+#ifdef CONFIG_KVM_IOAPIC
/*
* This union makes it completely explicit to gcc-3.x
- * that these two variables' stack usage should be
+ * that these three variables' stack usage should be
* combined, not added together.
*/
union {
@@ -7017,6 +7230,7 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
struct kvm_pit_state2 ps2;
struct kvm_pit_config pit_config;
} u;
+#endif
switch (ioctl) {
case KVM_SET_TSS_ADDR:
@@ -7040,6 +7254,7 @@ set_identity_unlock:
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
break;
+#ifdef CONFIG_KVM_IOAPIC
case KVM_CREATE_IRQCHIP: {
mutex_lock(&kvm->lock);
@@ -7047,6 +7262,15 @@ set_identity_unlock:
if (irqchip_in_kernel(kvm))
goto create_irqchip_unlock;
+ /*
+ * Disallow an in-kernel I/O APIC if the VM has protected EOIs,
+ * i.e. if KVM can't intercept EOIs and thus can't properly
+ * emulate level-triggered interrupts.
+ */
+ r = -ENOTTY;
+ if (kvm->arch.has_protected_eoi)
+ goto create_irqchip_unlock;
+
r = -EINVAL;
if (kvm->created_vcpus)
goto create_irqchip_unlock;
@@ -7061,7 +7285,7 @@ set_identity_unlock:
goto create_irqchip_unlock;
}
- r = kvm_setup_default_irq_routing(kvm);
+ r = kvm_setup_default_ioapic_and_pic_routing(kvm);
if (r) {
kvm_ioapic_destroy(kvm);
kvm_pic_destroy(kvm);
@@ -7109,7 +7333,7 @@ set_identity_unlock:
}
r = -ENXIO;
- if (!irqchip_kernel(kvm))
+ if (!irqchip_full(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -7133,7 +7357,7 @@ set_identity_unlock:
}
r = -ENXIO;
- if (!irqchip_kernel(kvm))
+ if (!irqchip_full(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
set_irqchip_out:
@@ -7206,6 +7430,7 @@ set_pit2_out:
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
+#endif
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
@@ -7276,23 +7501,25 @@ set_pit2_out:
if (user_tsc_khz == 0)
user_tsc_khz = tsc_khz;
- WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
- r = 0;
-
+ mutex_lock(&kvm->lock);
+ if (!kvm->created_vcpus) {
+ WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+ r = 0;
+ }
+ mutex_unlock(&kvm->lock);
goto out;
}
case KVM_GET_TSC_KHZ: {
r = READ_ONCE(kvm->arch.default_tsc_khz);
goto out;
}
- case KVM_MEMORY_ENCRYPT_OP: {
+ case KVM_MEMORY_ENCRYPT_OP:
r = -ENOTTY;
if (!kvm_x86_ops.mem_enc_ioctl)
goto out;
r = kvm_x86_call(mem_enc_ioctl)(kvm, argp);
break;
- }
case KVM_MEMORY_ENCRYPT_REG_REGION: {
struct kvm_enc_region region;
@@ -7431,6 +7658,7 @@ static void kvm_probe_msr_to_save(u32 msr_index)
case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET:
if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2))
return;
break;
@@ -7443,6 +7671,24 @@ static void kvm_probe_msr_to_save(u32 msr_index)
if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR))
return;
break;
+ case MSR_IA32_XSS:
+ if (!kvm_caps.supported_xss)
+ return;
+ break;
+ case MSR_IA32_U_CET:
+ case MSR_IA32_S_CET:
+ if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
+ !kvm_cpu_cap_has(X86_FEATURE_IBT))
+ return;
+ break;
+ case MSR_IA32_INT_SSP_TAB:
+ if (!kvm_cpu_cap_has(X86_FEATURE_LM))
+ return;
+ fallthrough;
+ case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP:
+ if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK))
+ return;
+ break;
default:
break;
}
@@ -7562,7 +7808,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_read);
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
@@ -7573,7 +7819,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
access |= PFERR_WRITE_MASK;
return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
}
-EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_write);
/* uses this to access any guest's mapped memory without checking CPL */
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
@@ -7659,7 +7905,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
exception);
}
-EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_virt);
static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
@@ -7726,12 +7972,12 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
unsigned int bytes, struct x86_exception *exception)
{
/* kvm_write_guest_virt_system can pull in tons of pages. */
- vcpu->arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
}
-EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_virt_system);
static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len)
@@ -7765,7 +8011,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
return kvm_emulate_instruction(vcpu, emul_type);
}
-EXPORT_SYMBOL_GPL(handle_ud);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_ud);
static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t gpa, bool write)
@@ -7986,7 +8232,7 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
return rc;
if (!vcpu->mmio_nr_fragments)
- return rc;
+ return X86EMUL_CONTINUE;
gpa = vcpu->mmio_fragments[0].gpa;
@@ -8231,8 +8477,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
- on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
- wbinvd_ipi, NULL, 1);
+ wbinvd_on_cpus_mask(vcpu->arch.wbinvd_dirty_mask);
put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
} else
@@ -8245,7 +8490,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
kvm_emulate_wbinvd_noskip(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_wbinvd);
@@ -8432,7 +8677,7 @@ static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt,
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int r;
- r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
+ r = kvm_emulate_msr_read(vcpu, msr_index, pdata);
if (r < 0)
return X86EMUL_UNHANDLEABLE;
@@ -8455,7 +8700,7 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int r;
- r = kvm_set_msr_with_filter(vcpu, msr_index, data);
+ r = kvm_emulate_msr_write(vcpu, msr_index, data);
if (r < 0)
return X86EMUL_UNHANDLEABLE;
@@ -8475,7 +8720,16 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 *pdata)
{
- return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+ /*
+ * Treat emulator accesses to the current shadow stack pointer as host-
+ * initiated, as they aren't true MSR accesses (SSP is "just a reg"),
+ * and this API is used only for implicit accesses, i.e. not RDMSR, and
+ * so the index is fully KVM-controlled.
+ */
+ if (unlikely(msr_index == MSR_KVM_INTERNAL_GUEST_SSP))
+ return kvm_msr_read(emul_to_vcpu(ctxt), msr_index, pdata);
+
+ return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata);
}
static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc)
@@ -8511,17 +8765,17 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
{
- return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
+ return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
}
static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
{
- return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
+ return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
}
static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
{
- return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
+ return guest_cpu_cap_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
}
static bool emulator_guest_cpuid_is_intel_compatible(struct x86_emulate_ctxt *ctxt)
@@ -8549,11 +8803,6 @@ static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt)
return is_smm(emul_to_vcpu(ctxt));
}
-static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt)
-{
- return is_guest_mode(emul_to_vcpu(ctxt));
-}
-
#ifndef CONFIG_KVM_SMM
static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
{
@@ -8567,6 +8816,14 @@ static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
}
+static int emulator_get_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 *xcr)
+{
+ if (index != XCR_XFEATURE_ENABLED_MASK)
+ return 1;
+ *xcr = emul_to_vcpu(ctxt)->arch.xcr0;
+ return 0;
+}
+
static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
{
return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
@@ -8637,9 +8894,9 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_cpuid_is_intel_compatible = emulator_guest_cpuid_is_intel_compatible,
.set_nmi_mask = emulator_set_nmi_mask,
.is_smm = emulator_is_smm,
- .is_guest_mode = emulator_is_guest_mode,
.leave_smm = emulator_leave_smm,
.triple_fault = emulator_triple_fault,
+ .get_xcr = emulator_get_xcr,
.set_xcr = emulator_set_xcr,
.get_untagged_addr = emulator_get_untagged_addr,
.is_canonical_addr = emulator_is_canonical_addr,
@@ -8740,7 +8997,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
kvm_set_rflags(vcpu, ctxt->eflags);
}
}
-EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_inject_realmode_interrupt);
static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
u8 ndata, u8 *insn_bytes, u8 insn_size)
@@ -8805,13 +9062,47 @@ void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
{
prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
}
-EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_prepare_emulation_failure_exit);
void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
{
__kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
}
-EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_emulation_failure_exit);
+
+void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ u32 reason, intr_info, error_code;
+ struct kvm_run *run = vcpu->run;
+ u64 info1, info2;
+ int ndata = 0;
+
+ kvm_x86_call(get_exit_info)(vcpu, &reason, &info1, &info2,
+ &intr_info, &error_code);
+
+ run->internal.data[ndata++] = info2;
+ run->internal.data[ndata++] = reason;
+ run->internal.data[ndata++] = info1;
+ run->internal.data[ndata++] = gpa;
+ run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
+
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
+ run->internal.ndata = ndata;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_event_vectoring_exit);
+
+void kvm_prepare_unexpected_reason_exit(struct kvm_vcpu *vcpu, u64 exit_reason)
+{
+ vcpu_unimpl(vcpu, "unexpected exit reason 0x%llx\n", exit_reason);
+
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = exit_reason;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prepare_unexpected_reason_exit);
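
Both helpers above fill the existing KVM_EXIT_INTERNAL_ERROR layout in the run structure; a hedged sketch of how a VMM might report it, using only the standard kvm_run UAPI fields:

#include <linux/kvm.h>
#include <stdio.h>

/* Sketch: print the suberror and data words stashed by the helpers above. */
static void dump_internal_error(const struct kvm_run *run)
{
	__u32 i;

	fprintf(stderr, "KVM internal error, suberror %u\n",
		run->internal.suberror);
	for (i = 0; i < run->internal.ndata; i++)
		fprintf(stderr, "  data[%u] = 0x%llx\n", i,
			(unsigned long long)run->internal.data[i]);
}
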
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
@@ -8921,7 +9212,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (unlikely(!r))
return 0;
- kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
+ kvm_pmu_instruction_retired(vcpu);
/*
* rflags is the old, "raw" value of the flags. The new value has
@@ -8935,7 +9226,7 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
r = kvm_vcpu_do_singlestep(vcpu);
return r;
}
-EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_skip_emulated_instruction);
static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
{
@@ -9041,6 +9332,23 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
return false;
}
+static bool is_soft_int_instruction(struct x86_emulate_ctxt *ctxt,
+ int emulation_type)
+{
+ u8 vector = EMULTYPE_GET_SOFT_INT_VECTOR(emulation_type);
+
+ switch (ctxt->b) {
+ case 0xcc:
+ return vector == BP_VECTOR;
+ case 0xcd:
+ return vector == ctxt->src.val;
+ case 0xce:
+ return vector == OF_VECTOR;
+ default:
+ return false;
+ }
+}
+
/*
* Decode an instruction for emulation. The caller is responsible for handling
* code breakpoints. Note, manually detecting code breakpoints is unnecessary
@@ -9066,7 +9374,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
return r;
}
-EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(x86_decode_emulated_instruction);
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len)
@@ -9085,11 +9393,20 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
return 1;
+ if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
+ emulation_type))
+ return 1;
+
+ if (r == X86EMUL_UNHANDLEABLE_VECTORING) {
+ kvm_prepare_event_vectoring_exit(vcpu, cr2_or_gpa);
+ return 0;
+ }
+
WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE);
return handle_emulation_failure(vcpu, emulation_type);
}
- vcpu->arch.l1tf_flush_l1d = true;
+ kvm_request_l1tf_flush_l1d();
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
kvm_clear_exception_queue(vcpu);
@@ -9142,6 +9459,10 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* injecting single-step #DBs.
*/
if (emulation_type & EMULTYPE_SKIP) {
+ if (emulation_type & EMULTYPE_SKIP_SOFT_INT &&
+ !is_soft_int_instruction(ctxt, emulation_type))
+ return 0;
+
if (ctxt->mode != X86EMUL_MODE_PROT64)
ctxt->eip = (u32)ctxt->_eip;
else
@@ -9191,7 +9512,14 @@ restart:
ctxt->exception.address = 0;
}
- r = x86_emulate_insn(ctxt);
+ /*
+ * Check L1's instruction intercepts when emulating instructions for
+ * L2, unless KVM is re-emulating a previously decoded instruction,
+ * e.g. to complete userspace I/O, in which case KVM has already
+ * checked the intercepts.
+ */
+ r = x86_emulate_insn(ctxt, is_guest_mode(vcpu) &&
+ !(emulation_type & EMULTYPE_NO_DECODE));
if (r == EMULATION_INTERCEPTED)
return 1;
@@ -9246,9 +9574,9 @@ writeback:
*/
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
- kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
+ kvm_pmu_instruction_retired(vcpu);
if (ctxt->is_branch)
- kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
+ kvm_pmu_branch_retired(vcpu);
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
@@ -9274,14 +9602,14 @@ int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
{
return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_instruction);
int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
void *insn, int insn_len)
{
return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_instruction_from_buffer);
static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
{
@@ -9293,7 +9621,7 @@ static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
{
vcpu->arch.pio.count = 0;
- if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip)))
return 1;
return kvm_skip_emulated_instruction(vcpu);
@@ -9318,7 +9646,7 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
complete_fast_pio_out_port_0x7e;
kvm_skip_emulated_instruction(vcpu);
} else {
- vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_out;
}
return 0;
@@ -9331,7 +9659,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
/* We should only ever be called with arch.pio.count equal to 1 */
BUG_ON(vcpu->arch.pio.count != 1);
- if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
+ if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.cui_linear_rip))) {
vcpu->arch.pio.count = 0;
return 1;
}
@@ -9360,7 +9688,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
return ret;
}
- vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
+ vcpu->arch.cui_linear_rip = kvm_get_linear_rip(vcpu);
vcpu->arch.complete_userspace_io = complete_fast_pio_in;
return 0;
@@ -9376,7 +9704,7 @@ int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
ret = kvm_fast_pio_out(vcpu, size, port);
return ret && kvm_skip_emulated_instruction(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_fast_pio);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fast_pio);
static int kvmclock_cpu_down_prep(unsigned int cpu)
{
@@ -9693,12 +10021,24 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
* with an exception. PAT[0] is set to WB on RESET and also by the
* kernel, i.e. failure indicates a kernel bug or broken firmware.
*/
- if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
+ if (rdmsrq_safe(MSR_IA32_CR_PAT, &host_pat) ||
(host_pat & GENMASK(2, 0)) != 6) {
pr_err("host PAT[0] is not WB\n");
return -EIO;
}
+ if (boot_cpu_has(X86_FEATURE_SHSTK) || boot_cpu_has(X86_FEATURE_IBT)) {
+ rdmsrq(MSR_IA32_S_CET, kvm_host.s_cet);
+ /*
+ * Linux doesn't yet support supervisor shadow stacks (SSS), so
+ * KVM doesn't save/restore the associated MSRs, i.e. KVM may
+ * clobber the host values. Yell and refuse to load if SSS is
+ * unexpectedly enabled, e.g. to avoid crashing the host.
+ */
+ if (WARN_ON_ONCE(kvm_host.s_cet & CET_SHSTK_EN))
+ return -EIO;
+ }
+
memset(&kvm_caps, 0, sizeof(kvm_caps));
x86_emulator_cache = kvm_alloc_emulator_cache();
@@ -9707,17 +10047,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
return -ENOMEM;
}
- user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
- if (!user_return_msrs) {
- pr_err("failed to allocate percpu kvm_user_return_msrs\n");
- r = -ENOMEM;
- goto out_free_x86_emulator_cache;
- }
- kvm_nr_uret_msrs = 0;
-
r = kvm_mmu_vendor_module_init();
if (r)
- goto out_free_percpu;
+ goto out_free_x86_emulator_cache;
kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM);
kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
@@ -9727,20 +10059,30 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0;
}
- rdmsrl_safe(MSR_EFER, &kvm_host.efer);
+ if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+ rdmsrq(MSR_IA32_XSS, kvm_host.xss);
+ kvm_caps.supported_xss = kvm_host.xss & KVM_SUPPORTED_XSS;
+ }
+
+ kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS;
+ kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS;
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, kvm_host.xss);
+ rdmsrq_safe(MSR_EFER, &kvm_host.efer);
kvm_init_pmu_capability(ops->pmu_ops);
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities);
+ rdmsrq(MSR_IA32_ARCH_CAPABILITIES, kvm_host.arch_capabilities);
+
+ WARN_ON_ONCE(kvm_nr_uret_msrs);
r = ops->hardware_setup();
if (r != 0)
goto out_mmu_exit;
+ enable_device_posted_irqs &= enable_apicv &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
+
kvm_ops_update(ops);
for_each_online_cpu(cpu) {
@@ -9770,12 +10112,22 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled)
kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM);
+ /* KVM always ignores guest PAT for shadow paging. */
+ if (!tdp_enabled)
+ kvm_caps.supported_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
kvm_caps.supported_xss = 0;
-#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
- cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
-#undef __kvm_cpu_cap_has
+ if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) &&
+ !kvm_cpu_cap_has(X86_FEATURE_IBT))
+ kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL;
+
+ if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) {
+ kvm_cpu_cap_clear(X86_FEATURE_SHSTK);
+ kvm_cpu_cap_clear(X86_FEATURE_IBT);
+ kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL;
+ }
if (kvm_caps.has_tsc_control) {
/*
@@ -9796,14 +10148,13 @@ out_unwind_ops:
kvm_x86_ops.enable_virtualization_cpu = NULL;
kvm_x86_call(hardware_unsetup)();
out_mmu_exit:
+ kvm_destroy_user_return_msrs();
kvm_mmu_vendor_module_exit();
-out_free_percpu:
- free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
return r;
}
-EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_vendor_init);
void kvm_x86_vendor_exit(void)
{
@@ -9826,8 +10177,8 @@ void kvm_x86_vendor_exit(void)
cancel_work_sync(&pvclock_gtod_work);
#endif
kvm_x86_call(hardware_unsetup)();
+ kvm_destroy_user_return_msrs();
kvm_mmu_vendor_module_exit();
- free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
#ifdef CONFIG_KVM_XEN
static_key_deferred_flush(&kvm_xen_enabled);
@@ -9837,7 +10188,7 @@ void kvm_x86_vendor_exit(void)
kvm_x86_ops.enable_virtualization_cpu = NULL;
mutex_unlock(&vendor_module_lock);
}
-EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_x86_vendor_exit);
#ifdef CONFIG_X86_64
static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
@@ -9901,7 +10252,7 @@ bool kvm_apicv_activated(struct kvm *kvm)
{
return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
}
-EXPORT_SYMBOL_GPL(kvm_apicv_activated);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_apicv_activated);
bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
{
@@ -9911,7 +10262,7 @@ bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
return (vm_reasons | vcpu_reasons) == 0;
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_apicv_activated);
static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
enum kvm_apicv_inhibit reason, bool set)
@@ -9951,8 +10302,11 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
rcu_read_lock();
map = rcu_dereference(vcpu->kvm->arch.apic_map);
- if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
- target = map->phys_map[dest_id]->vcpu;
+ if (likely(map) && dest_id <= map->max_apic_id) {
+ dest_id = array_index_nospec(dest_id, map->max_apic_id + 1);
+ if (map->phys_map[dest_id])
+ target = map->phys_map[dest_id]->vcpu;
+ }
rcu_read_unlock();
@@ -9976,19 +10330,24 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
{
u64 ret = vcpu->run->hypercall.ret;
- if (!is_64_bit_mode(vcpu))
+ if (!is_64_bit_hypercall(vcpu))
ret = (u32)ret;
kvm_rax_write(vcpu, ret);
- ++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
}
-unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
- unsigned long a0, unsigned long a1,
- unsigned long a2, unsigned long a3,
- int op_64_bit, int cpl)
+int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl,
+ int (*complete_hypercall)(struct kvm_vcpu *))
{
unsigned long ret;
+ unsigned long nr = kvm_rax_read(vcpu);
+ unsigned long a0 = kvm_rbx_read(vcpu);
+ unsigned long a1 = kvm_rcx_read(vcpu);
+ unsigned long a2 = kvm_rdx_read(vcpu);
+ unsigned long a3 = kvm_rsi_read(vcpu);
+ int op_64_bit = is_64_bit_hypercall(vcpu);
+
+ ++vcpu->stat.hypercalls;
trace_kvm_hypercall(nr, a0, a1, a2, a3);
@@ -10041,7 +10400,7 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
u64 gpa = a0, npages = a1, attrs = a2;
ret = -KVM_ENOSYS;
- if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
+ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE))
break;
if (!PAGE_ALIGNED(gpa) || !npages ||
@@ -10052,6 +10411,13 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ /*
+ * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+ * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+ * it was always zero on KVM_EXIT_HYPERCALL. Since KVM is now overwriting
+ * vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
+ */
+ vcpu->run->hypercall.ret = 0;
vcpu->run->hypercall.args[0] = gpa;
vcpu->run->hypercall.args[1] = npages;
vcpu->run->hypercall.args[2] = attrs;
@@ -10060,8 +10426,7 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
- vcpu->arch.complete_userspace_io = complete_hypercall_exit;
- /* stat is incremented on completion. */
+ vcpu->arch.complete_userspace_io = complete_hypercall;
return 0;
}
default:
@@ -10070,43 +10435,23 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
}
out:
- ++vcpu->stat.hypercalls;
- return ret;
+ vcpu->run->hypercall.ret = ret;
+ return 1;
}
-EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(____kvm_emulate_hypercall);
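
For the KVM_HC_MAP_GPA_RANGE path above, a hedged userspace sketch of completing the exit: whatever the VMM stores in run->hypercall.ret before the next KVM_RUN is what the guest sees (hence the zero-initialization noted in the comment). handle_gpa_range() is a hypothetical VMM helper, not part of KVM.

#include <linux/kvm.h>
#include <linux/kvm_para.h>

/* Hypothetical VMM helper that converts the range between shared/private. */
extern long handle_gpa_range(__u64 gpa, __u64 npages, __u64 attrs);

/* Sketch: complete a KVM_EXIT_HYPERCALL before re-entering the guest. */
static void handle_hypercall_exit(struct kvm_run *run)
{
	if (run->hypercall.nr != KVM_HC_MAP_GPA_RANGE) {
		run->hypercall.ret = -KVM_ENOSYS;
		return;
	}

	run->hypercall.ret = handle_gpa_range(run->hypercall.args[0],  /* gpa */
					      run->hypercall.args[1],  /* npages */
					      run->hypercall.args[2]); /* attrs */
}
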
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
- unsigned long nr, a0, a1, a2, a3, ret;
- int op_64_bit;
- int cpl;
-
if (kvm_xen_hypercall_enabled(vcpu->kvm))
return kvm_xen_hypercall(vcpu);
if (kvm_hv_hypercall_enabled(vcpu))
return kvm_hv_hypercall(vcpu);
- nr = kvm_rax_read(vcpu);
- a0 = kvm_rbx_read(vcpu);
- a1 = kvm_rcx_read(vcpu);
- a2 = kvm_rdx_read(vcpu);
- a3 = kvm_rsi_read(vcpu);
- op_64_bit = is_64_bit_hypercall(vcpu);
- cpl = kvm_x86_call(get_cpl)(vcpu);
-
- ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
- if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
- /* MAP_GPA tosses the request to the user space. */
- return 0;
-
- if (!op_64_bit)
- ret = (u32)ret;
- kvm_rax_write(vcpu, ret);
-
- return kvm_skip_emulated_instruction(vcpu);
+ return __kvm_emulate_hypercall(vcpu, kvm_x86_call(get_cpl)(vcpu),
+ complete_hypercall_exit);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_hypercall);
static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
{
@@ -10549,7 +10894,7 @@ out:
preempt_enable();
up_read(&vcpu->kvm->arch.apicv_update_lock);
}
-EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_update_apicv);
static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
{
@@ -10625,7 +10970,7 @@ void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
__kvm_set_or_clear_apicv_inhibit(kvm, reason, set);
up_write(&kvm->arch.apicv_update_lock);
}
-EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_or_clear_apicv_inhibit);
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
@@ -10633,13 +10978,16 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
return;
bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
+ vcpu->arch.highest_stale_pending_ioapic_eoi = -1;
kvm_x86_call(sync_pir_to_irr)(vcpu);
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
+#ifdef CONFIG_KVM_IOAPIC
else if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
+#endif
if (is_guest_mode(vcpu))
vcpu->arch.load_eoi_exitmap_pending = true;
@@ -10693,6 +11041,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
fastpath_t exit_fastpath;
+ u64 run_flags, debug_ctl;
bool req_immediate_exit = false;
@@ -10840,8 +11189,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_vcpu_update_apicv(vcpu);
if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
kvm_check_async_pf_completion(vcpu);
- if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
- kvm_x86_call(msr_filter_changed)(vcpu);
+
+ if (kvm_check_request(KVM_REQ_RECALC_INTERCEPTS, vcpu))
+ kvm_x86_call(recalc_intercepts)(vcpu);
if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
kvm_x86_call(update_cpu_dirty_logging)(vcpu);
@@ -10937,28 +11287,55 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto cancel_injection;
}
- if (req_immediate_exit)
+ run_flags = 0;
+ if (req_immediate_exit) {
+ run_flags |= KVM_RUN_FORCE_IMMEDIATE_EXIT;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
switch_fpu_return();
if (vcpu->arch.guest_fpu.xfd_err)
- wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+ wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
- if (unlikely(vcpu->arch.switch_db_regs)) {
- set_debugreg(0, 7);
+ kvm_load_xfeatures(vcpu, true);
+
+ if (unlikely(vcpu->arch.switch_db_regs &&
+ !(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
+ set_debugreg(DR7_FIXED_1, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ run_flags |= KVM_RUN_LOAD_GUEST_DR6;
} else if (unlikely(hw_breakpoint_active())) {
- set_debugreg(0, 7);
+ set_debugreg(DR7_FIXED_1, 7);
}
+ /*
+ * Refresh the host DEBUGCTL snapshot after disabling IRQs, as DEBUGCTL
+ * can be modified in IRQ context, e.g. via SMP function calls. Inform
+ * vendor code if any host-owned bits were changed, e.g. so that the
+ * value loaded into hardware while running the guest can be updated.
+ */
+ debug_ctl = get_debugctlmsr();
+ if ((debug_ctl ^ vcpu->arch.host_debugctl) & kvm_x86_ops.HOST_OWNED_DEBUGCTL &&
+ !vcpu->arch.guest_state_protected)
+ run_flags |= KVM_RUN_LOAD_DEBUGCTL;
+ vcpu->arch.host_debugctl = debug_ctl;
+
guest_timing_enter_irqoff();
+ /*
+ * Swap PKRU with hardware breakpoints disabled to minimize the number
+ * of flows where non-KVM code can run with guest state loaded.
+ */
+ kvm_load_guest_pkru(vcpu);
+
for (;;) {
/*
* Assert that vCPU vs. VM APICv state is consistent. An APICv
@@ -10969,8 +11346,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
(kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
- exit_fastpath = kvm_x86_call(vcpu_run)(vcpu,
- req_immediate_exit);
+ exit_fastpath = kvm_x86_call(vcpu_run)(vcpu, run_flags);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
@@ -10982,10 +11358,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
break;
}
+ run_flags = 0;
+
/* Note, VM-Exits that go down the "slow" path are accounted below. */
++vcpu->stat.exits;
}
+ kvm_load_host_pkru(vcpu);
+
/*
* Do this here before restoring debug registers on the host. And
* since we do this before handling the vmexit, a DR access vmexit
@@ -10994,6 +11374,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
+ WARN_ON(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH);
kvm_x86_call(sync_dirty_debug_regs)(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
@@ -11015,6 +11396,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
+ kvm_load_xfeatures(vcpu, false);
+
/*
* Sync xfd before calling handle_exit_irqoff() which may
* rely on the fact that guest_fpu::xfd is up-to-date (e.g.
@@ -11026,7 +11409,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_call(handle_exit_irqoff)(vcpu);
if (vcpu->arch.guest_fpu.xfd_err)
- wrmsrl(MSR_IA32_XFD_ERR, 0);
+ wrmsrq(MSR_IA32_XFD_ERR, 0);
+
+ /*
+ * Mark this CPU as needing a branch predictor flush before running
+ * userspace. Must be done before enabling preemption to ensure it gets
+ * set for the CPU that actually ran the guest, and not the CPU that it
+ * may migrate to.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER))
+ this_cpu_write(x86_ibpb_exit_to_user, true);
/*
* Consume any pending interrupts, including the possible source of
@@ -11064,7 +11456,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/*
* Profile KVM exit RIPs:
*/
- if (unlikely(prof_on == KVM_PROFILING)) {
+ if (unlikely(prof_on == KVM_PROFILING &&
+ !vcpu->arch.guest_state_protected)) {
unsigned long rip = kvm_rip_read(vcpu);
profile_hit(KVM_PROFILING, (void *)rip);
}
@@ -11097,7 +11490,7 @@ static bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
!vcpu->arch.apf.halted);
}
-static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
+bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
{
if (!list_empty_careful(&vcpu->async_pf.done))
return true;
@@ -11106,9 +11499,6 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
kvm_apic_init_sipi_allowed(vcpu))
return true;
- if (vcpu->arch.pv.pv_unhalted)
- return true;
-
if (kvm_is_exception_pending(vcpu))
return true;
@@ -11146,10 +11536,12 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
return false;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_has_events);
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+ return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted ||
+ kvm_vcpu_has_events(vcpu);
}
/* Called within kvm->srcu read side. */
@@ -11207,9 +11599,7 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
case KVM_MP_STATE_AP_RESET_HOLD:
- vcpu->arch.pv.pv_unhalted = false;
- vcpu->arch.mp_state =
- KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
fallthrough;
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
@@ -11264,7 +11654,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (__xfer_to_guest_mode_work_pending()) {
kvm_vcpu_srcu_read_unlock(vcpu);
- r = xfer_to_guest_mode_handle_work(vcpu);
+ r = kvm_xfer_to_guest_mode_handle_work(vcpu);
kvm_vcpu_srcu_read_lock(vcpu);
if (r)
return r;
@@ -11285,10 +11675,9 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
*/
++vcpu->stat.halt_exits;
if (lapic_in_kernel(vcpu)) {
- if (kvm_vcpu_has_events(vcpu))
- vcpu->arch.pv.pv_unhalted = false;
- else
- vcpu->arch.mp_state = state;
+ if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted)
+ state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, state);
return 1;
} else {
vcpu->run->exit_reason = reason;
@@ -11300,7 +11689,7 @@ int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
{
return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt_noskip);
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
@@ -11311,17 +11700,11 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
*/
return kvm_emulate_halt_noskip(vcpu) && ret;
}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt);
fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu)
{
- int ret;
-
- kvm_vcpu_srcu_read_lock(vcpu);
- ret = kvm_emulate_halt(vcpu);
- kvm_vcpu_srcu_read_unlock(vcpu);
-
- if (!ret)
+ if (!kvm_emulate_halt(vcpu))
return EXIT_FASTPATH_EXIT_USERSPACE;
if (kvm_vcpu_running(vcpu))
@@ -11329,7 +11712,7 @@ fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu)
return EXIT_FASTPATH_EXIT_HANDLED;
}
-EXPORT_SYMBOL_GPL(handle_fastpath_hlt);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(handle_fastpath_hlt);
int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
{
@@ -11338,7 +11721,7 @@ int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
KVM_EXIT_AP_RESET_HOLD) && ret;
}
-EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_ap_reset_hold);
bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
@@ -11444,6 +11827,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
+ if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+ return;
+
/* Exclude PKRU, it's restored separately immediately after VM-Exit. */
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
trace_kvm_fpu(1);
@@ -11452,17 +11838,47 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
/* When vcpu_run ends, restore user space FPU context. */
static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
+ if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+ return;
+
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
++vcpu->stat.fpu_reload;
trace_kvm_fpu(0);
}
+static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ /*
+ * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and
+ * tracks the pending SIPI separately. SIPI_RECEIVED is still accepted
+ * by KVM_SET_MP_STATE for backwards compatibility, but should be
+ * converted to INIT_RECEIVED.
+ */
+ if (WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED))
+ return -EINVAL;
+
+ /*
+ * Disallow running the vCPU if userspace forced it into an impossible
+ * MP_STATE, e.g. if the vCPU is in WFS but SIPI is blocked.
+ */
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED &&
+ !kvm_apic_init_sipi_allowed(vcpu))
+ return -EINVAL;
+
+ return kvm_x86_call(vcpu_pre_run)(vcpu);
+}
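From userspace, the net effect of the new helper (together with the later kvm_arch_vcpu_ioctl_set_mpstate() change that drops the equivalent check at set time) is that impossible MP_STATE combinations are rejected at KVM_RUN rather than at KVM_SET_MP_STATE. A hypothetical snippet, assuming vcpu_fd is an already-created vCPU file descriptor:

#include <errno.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/*
 * Hypothetical illustration (not from the patch): force the vCPU into
 * wait-for-SIPI; if KVM considers SIPI blocked for that vCPU (e.g. an SMI
 * is pending), KVM_RUN now fails with EINVAL instead of running the vCPU
 * in an impossible state.
 */
static int run_vcpu_in_wfs(int vcpu_fd)
{
	struct kvm_mp_state mp = { .mp_state = KVM_MP_STATE_INIT_RECEIVED };

	if (ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp))
		return -errno;

	if (ioctl(vcpu_fd, KVM_RUN, 0)) {
		if (errno == EINVAL)
			fprintf(stderr, "KVM rejected an impossible MP_STATE\n");
		return -errno;
	}
	return 0;
}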
+
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
struct kvm_run *kvm_run = vcpu->run;
+ u64 sync_valid_fields;
int r;
+ r = kvm_mmu_post_init_vm(vcpu->kvm);
+ if (r)
+ return r;
+
vcpu_load(vcpu);
kvm_sigset_activate(vcpu);
kvm_run->flags = 0;
@@ -11502,8 +11918,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
- if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
- (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
+ sync_valid_fields = kvm_sync_valid_fields(vcpu->kvm);
+ if ((kvm_run->kvm_valid_regs & ~sync_valid_fields) ||
+ (kvm_run->kvm_dirty_regs & ~sync_valid_fields)) {
r = -EINVAL;
goto out;
}
@@ -11553,7 +11970,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
- r = kvm_x86_call(vcpu_pre_run)(vcpu);
+ r = kvm_x86_vcpu_pre_run(vcpu);
if (r <= 0)
goto out;
@@ -11561,7 +11978,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
out:
kvm_put_guest_fpu(vcpu);
- if (kvm_run->kvm_valid_regs)
+ if (kvm_run->kvm_valid_regs && likely(!vcpu->arch.guest_state_protected))
store_regs(vcpu);
post_kvm_run_save(vcpu);
kvm_vcpu_srcu_read_unlock(vcpu);
@@ -11746,8 +12163,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
int r;
vcpu_load(vcpu);
- if (kvm_mpx_supported())
- kvm_load_guest_fpu(vcpu);
+ kvm_vcpu_srcu_read_lock(vcpu);
r = kvm_apic_accept_events(vcpu);
if (r < 0)
@@ -11762,8 +12178,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
mp_state->mp_state = vcpu->arch.mp_state;
out:
- if (kvm_mpx_supported())
- kvm_put_guest_fpu(vcpu);
+ kvm_vcpu_srcu_read_unlock(vcpu);
vcpu_put(vcpu);
return r;
}
@@ -11793,21 +12208,16 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
}
/*
- * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow
- * forcing the guest into INIT/SIPI if those events are supposed to be
- * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state
- * if an SMI is pending as well.
+ * SIPI_RECEIVED is obsolete and no longer used internally; KVM instead
+ * leaves the vCPU in INIT_RECEIVED (Wait-For-SIPI) and pends the SIPI.
+ * Translate SIPI_RECEIVED as appropriate for backwards compatibility.
*/
- if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
- (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
- mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
- goto out;
-
if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
- vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+ mp_state->mp_state = KVM_MP_STATE_INIT_RECEIVED;
set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
- } else
- vcpu->arch.mp_state = mp_state->mp_state;
+ }
+
+ kvm_set_mp_state(vcpu, mp_state->mp_state);
kvm_make_request(KVM_REQ_EVENT, vcpu);
ret = 0;
@@ -11822,6 +12232,25 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int ret;
+ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) {
+ u64 u_cet, s_cet;
+
+ /*
+ * Check both User and Supervisor on task switches as inter-
+ * privilege level task switches are impacted by CET at both
+ * the current privilege level and the new privilege level, and
+ * that information is not known at this time. The expectation
+ * is that the guest won't require emulation of task switches
+ * while using IBT or Shadow Stacks.
+ */
+ if (__kvm_emulate_msr_read(vcpu, MSR_IA32_U_CET, &u_cet) ||
+ __kvm_emulate_msr_read(vcpu, MSR_IA32_S_CET, &s_cet))
+ goto unhandled_task_switch;
+
+ if ((u_cet | s_cet) & (CET_ENDBR_EN | CET_SHSTK_EN))
+ goto unhandled_task_switch;
+ }
+
init_emulate_ctxt(vcpu);
ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
@@ -11831,19 +12260,21 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
* Report an error to userspace if MMIO is needed, as KVM doesn't support
* MMIO during a task switch (or any other complex operation).
*/
- if (ret || vcpu->mmio_needed) {
- vcpu->mmio_needed = false;
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- return 0;
- }
+ if (ret || vcpu->mmio_needed)
+ goto unhandled_task_switch;
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);
return 1;
+
+unhandled_task_switch:
+ vcpu->mmio_needed = false;
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
}
-EXPORT_SYMBOL_GPL(kvm_task_switch);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_task_switch);
static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
@@ -11934,7 +12365,7 @@ static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
!is_protmode(vcpu))
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
return 0;
}
@@ -12237,9 +12668,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
else
- vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_UNINITIALIZED);
r = kvm_mmu_create(vcpu);
if (r < 0)
@@ -12276,9 +12707,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
goto free_emulate_ctxt;
}
- vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
-
kvm_async_pf_hash_reset(vcpu);
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_STUFF_FEATURE_MSRS)) {
@@ -12301,6 +12729,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
kvm_xen_init_vcpu(vcpu);
vcpu_load(vcpu);
+ kvm_vcpu_after_set_cpuid(vcpu);
kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
kvm_vcpu_reset(vcpu, false);
kvm_init_mmu(vcpu);
@@ -12326,8 +12755,6 @@ fail_mmu_destroy:
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vcpu->kvm;
-
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
@@ -12338,18 +12765,20 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
vcpu->arch.msr_kvm_poll_control = 1;
mutex_unlock(&vcpu->mutex);
-
- if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
- schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
- KVMCLOCK_SYNC_PERIOD);
}
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- int idx;
+ int idx, cpu;
+
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_mmu_unload(vcpu);
kvmclock_reset(vcpu);
+ for_each_possible_cpu(cpu)
+ cmpxchg(per_cpu_ptr(&last_vcpu, cpu), vcpu, NULL);
+
kvm_x86_call(vcpu_free)(vcpu);
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
@@ -12369,6 +12798,53 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvfree(vcpu->arch.cpuid_entries);
}
+static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
+ u64 xfeatures_mask;
+ bool fpu_in_use;
+ int i;
+
+ /*
+ * Guest FPU state is zero allocated and so doesn't need to be manually
+ * cleared on RESET, i.e. during vCPU creation.
+ */
+ if (!init_event || !fpstate)
+ return;
+
+ /*
+ * On INIT, only select XSTATE components are zeroed; most components
+ * are unchanged. Currently, the only components that are zeroed and
+ * supported by KVM are MPX- and CET-related.
+ */
+ xfeatures_mask = (kvm_caps.supported_xcr0 | kvm_caps.supported_xss) &
+ (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR |
+ XFEATURE_MASK_CET_ALL);
+ if (!xfeatures_mask)
+ return;
+
+ BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX);
+
+ /*
+ * Unload guest FPU state (if necessary) before zeroing XSTATE fields
+ * as the kernel can only modify the state when it's resident in memory,
+ * i.e. when it's not loaded into hardware.
+ *
+ * WARN if the vCPU's desire to run, i.e. whether or not it's in KVM_RUN,
+ * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the
+ * only path that can trigger INIT emulation _and_ loads FPU state, and
+ * KVM_RUN should _always_ load FPU state.
+ */
+ WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use);
+ fpu_in_use = fpstate->in_use;
+ if (fpu_in_use)
+ kvm_put_guest_fpu(vcpu);
+ for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX)
+ fpstate_clear_xstate_component(fpstate, i);
+ if (fpu_in_use)
+ kvm_load_guest_fpu(vcpu);
+}
+
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct kvm_cpuid_entry2 *cpuid_0x1;
@@ -12426,22 +12902,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
- if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) {
- struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
-
- /*
- * All paths that lead to INIT are required to load the guest's
- * FPU state (because most paths are buried in KVM_RUN).
- */
- if (init_event)
- kvm_put_guest_fpu(vcpu);
-
- fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS);
- fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR);
-
- if (init_event)
- kvm_load_guest_fpu(vcpu);
- }
+ kvm_xstate_reset(vcpu, init_event);
if (!init_event) {
vcpu->arch.smbase = 0x30000;
@@ -12453,7 +12914,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
__kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
- __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
+ kvm_msr_write(vcpu, MSR_IA32_XSS, 0);
}
/* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
@@ -12519,7 +12980,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (init_event)
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_reset);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_reset);
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
@@ -12531,7 +12992,7 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
kvm_rip_write(vcpu, 0);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_deliver_sipi_vector);
void kvm_arch_enable_virtualization(void)
{
@@ -12642,13 +13103,28 @@ int kvm_arch_enable_virtualization_cpu(void)
void kvm_arch_disable_virtualization_cpu(void)
{
kvm_x86_call(disable_virtualization_cpu)();
- drop_user_return_notifiers();
+
+ /*
+ * Leave the user-return notifiers as-is when disabling virtualization
+ * for reboot, i.e. when disabling via IPI function call, and instead
+ * pin kvm.ko (if it's a module) to defend against use-after-free (in
+ * the *very* unlikely scenario module unload is racing with reboot).
+ * On a forced reboot, tasks aren't frozen before shutdown, and so KVM
+ * could be actively modifying user-return MSR state when the IPI to
+ * disable virtualization arrives. Handle the extreme edge case here
+ * instead of trying to account for it in the normal flows.
+ */
+ if (in_task() || WARN_ON_ONCE(!kvm_rebooting))
+ drop_user_return_notifiers();
+ else
+ __module_get(THIS_MODULE);
}
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_reset_bsp);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
{
@@ -12678,26 +13154,22 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
/* Decided by the vendor code for other VM types. */
kvm->arch.pre_fault_allowed =
type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
+ kvm->arch.disabled_quirks = kvm_caps.inapplicable_quirks & kvm_caps.supported_quirks;
ret = kvm_page_track_init(kvm);
if (ret)
goto out;
- kvm_mmu_init_vm(kvm);
+ ret = kvm_mmu_init_vm(kvm);
+ if (ret)
+ goto out_cleanup_page_track;
ret = kvm_x86_call(vm_init)(kvm);
if (ret)
goto out_uninit_mmu;
- INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
- /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
- set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
- /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
- set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
- &kvm->arch.irq_sources_bitmap);
-
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
@@ -12717,52 +13189,28 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.hv_root_tdp = INVALID_PAGE;
#endif
- INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
- INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
-
kvm_apicv_init(kvm);
kvm_hv_init_vm(kvm);
kvm_xen_init_vm(kvm);
+ if (ignore_msrs && !report_ignored_msrs) {
+ pr_warn_once("Running KVM with ignore_msrs=1 and report_ignored_msrs=0 is not a\n"
+ "a supported configuration. Lying to the guest about the existence of MSRs\n"
+ "may cause the guest operating system to hang or produce errors. If a guest\n"
+ "does not run without ignore_msrs=1, please report it to kvm@vger.kernel.org.\n");
+ }
+
+ once_init(&kvm->arch.nx_once);
return 0;
out_uninit_mmu:
kvm_mmu_uninit_vm(kvm);
+out_cleanup_page_track:
kvm_page_track_cleanup(kvm);
out:
return ret;
}
-int kvm_arch_post_init_vm(struct kvm *kvm)
-{
- return kvm_mmu_post_init_vm(kvm);
-}
-
-static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
-{
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
-}
-
-static void kvm_unload_vcpu_mmus(struct kvm *kvm)
-{
- unsigned long i;
- struct kvm_vcpu *vcpu;
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- kvm_clear_async_pf_completion_queue(vcpu);
- kvm_unload_vcpu_mmu(vcpu);
- }
-}
-
-void kvm_arch_sync_events(struct kvm *kvm)
-{
- cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
- cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
- kvm_free_pit(kvm);
-}
-
/**
* __x86_set_memory_region: Setup KVM internal memory slot
*
@@ -12793,7 +13241,8 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
struct kvm_memslots *slots = kvm_memslots(kvm);
struct kvm_memory_slot *slot;
- /* Called with kvm->slots_lock held. */
+ lockdep_assert_held(&kvm->slots_lock);
+
if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
return ERR_PTR_USR(-EINVAL);
@@ -12826,7 +13275,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
m.guest_phys_addr = gpa;
m.userspace_addr = hva;
m.memory_size = size;
- r = __kvm_set_memory_region(kvm, &m);
+ r = kvm_set_internal_memslot(kvm, &m);
if (r < 0)
return ERR_PTR_USR(r);
}
@@ -12836,11 +13285,22 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
return (void __user *)hva;
}
-EXPORT_SYMBOL_GPL(__x86_set_memory_region);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(__x86_set_memory_region);
void kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
+ /*
+ * Stop all background workers and kthreads before destroying vCPUs, as
+ * iterating over vCPUs in a different task while vCPUs are being freed
+ * is unsafe, i.e. will lead to use-after-free. The PIT also needs to
+ * be stopped before IRQ routing is freed.
+ */
+#ifdef CONFIG_KVM_IOAPIC
+ kvm_free_pit(kvm);
+#endif
+
kvm_mmu_pre_destroy_vm(kvm);
+ static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
@@ -12859,18 +13319,19 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
mutex_unlock(&kvm->slots_lock);
}
- kvm_unload_vcpu_mmus(kvm);
- kvm_x86_call(vm_destroy)(kvm);
+ kvm_destroy_vcpus(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
+#ifdef CONFIG_KVM_IOAPIC
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
- kvm_destroy_vcpus(kvm);
+#endif
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
kvm_page_track_cleanup(kvm);
kvm_xen_destroy_vm(kvm);
kvm_hv_destroy_vm(kvm);
+ kvm_x86_call(vm_destroy)(kvm);
}
static void memslot_rmap_free(struct kvm_memory_slot *slot)
@@ -12927,7 +13388,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
/*
* Clear out the previous array pointers for the KVM_MR_MOVE case. The
- * old arrays will be freed by __kvm_set_memory_region() if installing
+ * old arrays will be freed by kvm_set_memory_region() if installing
* the new memslot is successful.
*/
memset(&slot->arch, 0, sizeof(slot->arch));
@@ -13020,6 +13481,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
return -EINVAL;
+ if (kvm_is_gfn_alias(kvm, new->base_gfn + new->npages - 1))
+ return -EINVAL;
+
return kvm_alloc_memslot_metadata(kvm, new);
}
@@ -13036,7 +13500,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
{
int nr_slots;
- if (!kvm_x86_ops.cpu_dirty_log_size)
+ if (!kvm->arch.cpu_dirty_log_size)
return;
nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
@@ -13108,7 +13572,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
if (READ_ONCE(eager_page_split))
kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
- if (kvm_x86_ops.cpu_dirty_log_size) {
+ if (kvm->arch.cpu_dirty_log_size) {
kvm_mmu_slot_leaf_clear_dirty(kvm, new);
kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
} else {
@@ -13226,13 +13690,13 @@ unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
kvm_rip_read(vcpu));
}
-EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_linear_rip);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
{
return kvm_get_linear_rip(vcpu) == linear_rip;
}
-EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_linear_rip);
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
@@ -13243,7 +13707,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
rflags &= ~X86_EFLAGS_TF;
return rflags;
}
-EXPORT_SYMBOL_GPL(kvm_get_rflags);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_rflags);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
@@ -13258,7 +13722,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
__kvm_set_rflags(vcpu, rflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_set_rflags);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_rflags);
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
{
@@ -13361,8 +13825,8 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
if (!kvm_pv_async_pf_enabled(vcpu))
return false;
- if (vcpu->arch.apf.send_user_only &&
- kvm_x86_call(get_cpl)(vcpu) == 0)
+ if (!vcpu->arch.apf.send_always &&
+ (vcpu->arch.guest_state_protected || !kvm_x86_call(get_cpl)(vcpu)))
return false;
if (is_guest_mode(vcpu)) {
@@ -13447,18 +13911,22 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
if ((work->wakeup_all || work->notpresent_injected) &&
kvm_pv_async_pf_enabled(vcpu) &&
!apf_put_user_ready(vcpu, work->arch.token)) {
- vcpu->arch.apf.pageready_pending = true;
+ WRITE_ONCE(vcpu->arch.apf.pageready_pending, true);
kvm_apic_set_irq(vcpu, &irq, NULL);
}
vcpu->arch.apf.halted = false;
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
}
void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
{
kvm_make_request(KVM_REQ_APF_READY, vcpu);
- if (!vcpu->arch.apf.pageready_pending)
+
+ /* Pairs with smp_store_mb() in kvm_set_msr_common(). */
+ smp_mb__after_atomic();
+
+ if (!READ_ONCE(vcpu->arch.apf.pageready_pending))
kvm_vcpu_kick(vcpu);
}
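The writer side named in the "Pairs with smp_store_mb()" comment is not in this hunk; the sketch below approximates the assumed MSR_KVM_ASYNC_PF_ACK handling in kvm_set_msr_common(). The helper name is made up for illustration.

/*
 * Sketch of the assumed ACK path: clear pageready_pending with a full
 * barrier so the clear is ordered before re-checking for completed async
 * #PFs, pairing with the smp_mb__after_atomic() + READ_ONCE() on the
 * reader side above.
 */
static void kvm_async_pf_ack(struct kvm_vcpu *vcpu)
{
	smp_store_mb(vcpu->arch.apf.pageready_pending, false);
	kvm_check_async_pf_completion(vcpu);
}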
@@ -13470,25 +13938,6 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
}
-void kvm_arch_start_assignment(struct kvm *kvm)
-{
- if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
- kvm_x86_call(pi_start_assignment)(kvm);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
-
-void kvm_arch_end_assignment(struct kvm *kvm)
-{
- atomic_dec(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
-
-bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
-{
- return raw_atomic_read(&kvm->arch.assigned_device_count);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
-
static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
{
/*
@@ -13497,8 +13946,10 @@ static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
* due to toggling the "ignore PAT" bit. Zap all SPTEs when the first
* (or last) non-coherent device is (un)registered so that new SPTEs
* with the correct "ignore guest PAT" setting are created.
+ *
+ * If KVM always honors guest PAT, however, there is nothing to do.
*/
- if (kvm_mmu_may_ignore_guest_pat())
+ if (kvm_check_has_quirk(kvm, KVM_X86_QUIRK_IGNORE_GUEST_PAT))
kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
}
@@ -13507,93 +13958,34 @@ void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
kvm_noncoherent_dma_assignment_start_or_stop(kvm);
}
-EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
{
if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
kvm_noncoherent_dma_assignment_start_or_stop(kvm);
}
-EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
{
return atomic_read(&kvm->arch.noncoherent_dma_count);
}
-EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
-
-bool kvm_arch_has_irq_bypass(void)
-{
- return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
-}
-
-int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
- struct irq_bypass_producer *prod)
-{
- struct kvm_kernel_irqfd *irqfd =
- container_of(cons, struct kvm_kernel_irqfd, consumer);
- int ret;
-
- irqfd->producer = prod;
- kvm_arch_start_assignment(irqfd->kvm);
- ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
- if (ret)
- kvm_arch_end_assignment(irqfd->kvm);
-
- return ret;
-}
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_arch_has_noncoherent_dma);
-void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
- struct irq_bypass_producer *prod)
-{
- int ret;
- struct kvm_kernel_irqfd *irqfd =
- container_of(cons, struct kvm_kernel_irqfd, consumer);
-
- WARN_ON(irqfd->producer != prod);
- irqfd->producer = NULL;
-
- /*
- * When producer of consumer is unregistered, we change back to
- * remapped mode, so we can re-use the current implementation
- * when the irq is masked/disabled or the consumer side (KVM
- * int this case doesn't want to receive the interrupts.
- */
- ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
- prod->irq, irqfd->gsi, 0);
- if (ret)
- printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
- " fails: %d\n", irqfd->consumer.token, ret);
-
- kvm_arch_end_assignment(irqfd->kvm);
-}
-
-int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- return kvm_x86_call(pi_update_irte)(kvm, host_irq, guest_irq, set);
-}
-
-bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
- struct kvm_kernel_irq_routing_entry *new)
-{
- if (new->type != KVM_IRQ_ROUTING_MSI)
- return true;
-
- return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
-}
-
-bool kvm_vector_hashing_enabled(void)
+bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
{
- return vector_hashing;
+ return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
}
-bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
+#ifdef CONFIG_KVM_GUEST_MEMFD
+/*
+ * KVM doesn't yet support initializing guest_memfd memory as shared for VMs
+ * with private memory (the private vs. shared tracking needs to be moved into
+ * guest_memfd).
+ */
+bool kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
{
- return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
+ return !kvm_arch_has_private_mem(kvm);
}
-EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order)
@@ -13608,6 +14000,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
kvm_x86_call(gmem_invalidate)(start, end);
}
#endif
+#endif
int kvm_spec_ctrl_test_value(u64 value)
{
@@ -13622,18 +14015,18 @@ int kvm_spec_ctrl_test_value(u64 value)
local_irq_save(flags);
- if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
+ if (rdmsrq_safe(MSR_IA32_SPEC_CTRL, &saved_value))
ret = 1;
- else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
+ else if (wrmsrq_safe(MSR_IA32_SPEC_CTRL, value))
ret = 1;
else
- wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
+ wrmsrq(MSR_IA32_SPEC_CTRL, saved_value);
local_irq_restore(flags);
return ret;
}
-EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_spec_ctrl_test_value);
void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
{
@@ -13658,7 +14051,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c
}
vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
}
-EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fixup_and_inject_pf_error);
/*
* Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
@@ -13687,7 +14080,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_memory_failure);
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
{
@@ -13751,7 +14144,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
return 1;
}
}
-EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_handle_invpcid);
static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
{
@@ -13836,7 +14229,7 @@ int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_mmio_write);
int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
void *data)
@@ -13874,7 +14267,7 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_mmio_read);
static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
{
@@ -13962,10 +14355,11 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
return in ? kvm_sev_es_ins(vcpu, size, port)
: kvm_sev_es_outs(vcpu, size, port);
}
-EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_sev_es_string_io);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
@@ -13982,7 +14376,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
@@ -13997,6 +14390,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_rmp_fault);
static int __init kvm_x86_init(void)
{
+ kvm_init_xstate_sizes();
+
kvm_mmu_x86_module_init();
mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
return 0;