summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig3
-rw-r--r--arch/x86/kvm/cpuid.c14
-rw-r--r--arch/x86/kvm/emulate.c8
-rw-r--r--arch/x86/kvm/hyperv.c29
-rw-r--r--arch/x86/kvm/hyperv.h2
-rw-r--r--arch/x86/kvm/ioapic.c12
-rw-r--r--arch/x86/kvm/ioapic.h4
-rw-r--r--arch/x86/kvm/lapic.c20
-rw-r--r--arch/x86/kvm/mmu.h114
-rw-r--r--arch/x86/kvm/mmu/mmu.c725
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h21
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h18
-rw-r--r--arch/x86/kvm/mmu/page_track.c53
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h206
-rw-r--r--arch/x86/kvm/mmu/spte.c34
-rw-r--r--arch/x86/kvm/mmu/spte.h21
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c119
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h6
-rw-r--r--arch/x86/kvm/svm/nested.c62
-rw-r--r--arch/x86/kvm/svm/sev.c127
-rw-r--r--arch/x86/kvm/svm/svm.c315
-rw-r--r--arch/x86/kvm/svm/svm.h22
-rw-r--r--arch/x86/kvm/svm/svm_ops.h4
-rw-r--r--arch/x86/kvm/trace.h9
-rw-r--r--arch/x86/kvm/vmx/evmcs.c12
-rw-r--r--arch/x86/kvm/vmx/evmcs.h4
-rw-r--r--arch/x86/kvm/vmx/nested.c87
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c6
-rw-r--r--arch/x86/kvm/vmx/sgx.c16
-rw-r--r--arch/x86/kvm/vmx/vmx.c209
-rw-r--r--arch/x86/kvm/vmx/vmx.h7
-rw-r--r--arch/x86/kvm/x86.c1247
-rw-r--r--arch/x86/kvm/x86.h2
-rw-r--r--arch/x86/kvm/xen.c27
34 files changed, 1976 insertions, 1589 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ac69894eab88..619186138176 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -129,4 +129,7 @@ config KVM_MMU_AUDIT
This option adds a R/W kVM module parameter 'mmu_audit', which allows
auditing of KVM MMU events at runtime.
+config KVM_EXTERNAL_WRITE_TRACKING
+ bool
+
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index fe03bd978761..2d70edb0f323 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -53,9 +53,16 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
return ret;
}
+/*
+ * This one is tied to SSB in the user API, and not
+ * visible in /proc/cpuinfo.
+ */
+#define KVM_X86_FEATURE_PSFD (13*32+28) /* Predictive Store Forwarding Disable */
+
#define F feature_bit
#define SF(name) (boot_cpu_has(X86_FEATURE_##name) ? F(name) : 0)
+
static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
{
@@ -65,8 +72,8 @@ static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
for (i = 0; i < nent; i++) {
e = &entries[i];
- if (e->function == function && (e->index == index ||
- !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
+ if (e->function == function &&
+ (!(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) || e->index == index))
return e;
}
@@ -500,7 +507,8 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
- F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON)
+ F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) |
+ __feature_bit(KVM_X86_FEATURE_PSFD)
);
/*
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2837110e66ed..28b1a4e57827 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -435,7 +435,6 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
__FOP_RET(#op)
asm(".pushsection .fixup, \"ax\"\n"
- ".global kvm_fastop_exception \n"
"kvm_fastop_exception: xor %esi, %esi; ret\n"
".popsection");
@@ -4206,7 +4205,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
- return emulate_ud(ctxt);
+ return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
}
@@ -4223,6 +4222,11 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
if (enable_vmware_backdoor && is_vmware_backdoor_pmc(rcx))
return X86EMUL_CONTINUE;
+ /*
+ * If CR4.PCE is set, the SDM requires CPL=0 or CR0.PE=0. The CR0.PE
+ * check however is unnecessary because CPL is always 0 outside
+ * protected mode.
+ */
if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
ctxt->ops->check_pmc(ctxt, rcx))
return emulate_gp(ctxt, 0);
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 232a86a6faaf..4f15c0165c05 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -112,7 +112,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
if (!!auto_eoi_old == !!auto_eoi_new)
return;
- mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+ down_write(&vcpu->kvm->arch.apicv_update_lock);
if (auto_eoi_new)
hv->synic_auto_eoi_used++;
@@ -123,7 +123,7 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
!hv->synic_auto_eoi_used,
APICV_INHIBIT_REASON_HYPERV);
- mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
+ up_write(&vcpu->kvm->arch.apicv_update_lock);
}
static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
@@ -939,7 +939,7 @@ static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
stimer_init(&hv_vcpu->stimer[i], i);
- hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu);
+ hv_vcpu->vp_index = vcpu->vcpu_idx;
return 0;
}
@@ -1444,7 +1444,6 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
switch (msr) {
case HV_X64_MSR_VP_INDEX: {
struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
- int vcpu_idx = kvm_vcpu_get_idx(vcpu);
u32 new_vp_index = (u32)data;
if (!host || new_vp_index >= KVM_MAX_VCPUS)
@@ -1459,9 +1458,9 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
* VP index is changing, adjust num_mismatched_vp_indexes if
* it now matches or no longer matches vcpu_idx.
*/
- if (hv_vcpu->vp_index == vcpu_idx)
+ if (hv_vcpu->vp_index == vcpu->vcpu_idx)
atomic_inc(&hv->num_mismatched_vp_indexes);
- else if (new_vp_index == vcpu_idx)
+ else if (new_vp_index == vcpu->vcpu_idx)
atomic_dec(&hv->num_mismatched_vp_indexes);
hv_vcpu->vp_index = new_vp_index;
@@ -1755,7 +1754,6 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
int i;
gpa_t gpa;
struct kvm *kvm = vcpu->kvm;
- struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct hv_tlb_flush_ex flush_ex;
struct hv_tlb_flush flush;
u64 vp_bitmap[KVM_HV_MAX_SPARSE_VCPU_SET_BITS];
@@ -1837,18 +1835,19 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
}
}
- cpumask_clear(&hv_vcpu->tlb_flush);
-
- vcpu_mask = all_cpus ? NULL :
- sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
- vp_bitmap, vcpu_bitmap);
-
/*
* vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't
* analyze it here, flush TLB regardless of the specified address space.
*/
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST,
- NULL, vcpu_mask, &hv_vcpu->tlb_flush);
+ if (all_cpus) {
+ kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH_GUEST);
+ } else {
+ vcpu_mask = sparse_set_to_vcpu_mask(kvm, sparse_banks, valid_bank_mask,
+ vp_bitmap, vcpu_bitmap);
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_TLB_FLUSH_GUEST,
+ vcpu_mask);
+ }
ret_success:
/* We always do full TLB flush, set 'Reps completed' = 'Rep Count' */
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 730da8537d05..ed1c4e546d04 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -83,7 +83,7 @@ static inline u32 kvm_hv_get_vpindex(struct kvm_vcpu *vcpu)
{
struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
- return hv_vcpu ? hv_vcpu->vp_index : kvm_vcpu_get_idx(vcpu);
+ return hv_vcpu ? hv_vcpu->vp_index : vcpu->vcpu_idx;
}
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index ff005fe738a4..816a82515dcd 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -96,7 +96,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
{
ioapic->rtc_status.pending_eoi = 0;
- bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1);
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_IDS);
}
static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
@@ -319,8 +319,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
unsigned index;
bool mask_before, mask_after;
union kvm_ioapic_redirect_entry *e;
- unsigned long vcpu_bitmap;
int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode;
+ DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS);
switch (ioapic->ioregsel) {
case IOAPIC_REG_VERSION:
@@ -384,9 +384,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
irq.shorthand = APIC_DEST_NOSHORT;
irq.dest_id = e->fields.dest_id;
irq.msi_redir_hint = false;
- bitmap_zero(&vcpu_bitmap, 16);
+ bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS);
kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
- &vcpu_bitmap);
+ vcpu_bitmap);
if (old_dest_mode != e->fields.dest_mode ||
old_dest_id != e->fields.dest_id) {
/*
@@ -399,10 +399,10 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
kvm_lapic_irq_dest_mode(
!!e->fields.dest_mode);
kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
- &vcpu_bitmap);
+ vcpu_bitmap);
}
kvm_make_scan_ioapic_request_mask(ioapic->kvm,
- &vcpu_bitmap);
+ vcpu_bitmap);
} else {
kvm_make_scan_ioapic_request(ioapic->kvm);
}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index bbd4a5d18b5d..e66e620c3bed 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -39,13 +39,13 @@ struct kvm_vcpu;
struct dest_map {
/* vcpu bitmap where IRQ has been sent */
- DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1);
+ DECLARE_BITMAP(map, KVM_MAX_VCPU_IDS);
/*
* Vector sent to a given vcpu, only valid when
* the vcpu's bit in map is set
*/
- u8 vectors[KVM_MAX_VCPU_ID + 1];
+ u8 vectors[KVM_MAX_VCPU_IDS];
};
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 76fb00921203..d6ac32f3f650 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2321,13 +2321,14 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_apicv);
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 msr_val;
int i;
if (!init_event) {
- vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
+ msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_reset_bsp(vcpu))
- vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ msr_val |= MSR_IA32_APICBASE_BSP;
+ kvm_lapic_set_base(vcpu, msr_val);
}
if (!apic)
@@ -2336,11 +2337,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
- if (!init_event) {
- apic->base_address = APIC_DEFAULT_PHYS_BASE;
-
+ /* The xAPIC ID is set at RESET even if the APIC was already enabled. */
+ if (!init_event)
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
- }
kvm_apic_set_version(apic->vcpu);
for (i = 0; i < KVM_APIC_LVT_NUM; i++)
@@ -2481,6 +2480,11 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
lapic_timer_advance_dynamic = false;
}
+ /*
+ * Stuff the APIC ENABLE bit in lieu of temporarily incrementing
+ * apic_hw_disabled; the full RESET value is set by kvm_lapic_reset().
+ */
+ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
@@ -2942,5 +2946,7 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
void kvm_lapic_exit(void)
{
static_key_deferred_flush(&apic_hw_disabled);
+ WARN_ON(static_branch_unlikely(&apic_hw_disabled.key));
static_key_deferred_flush(&apic_sw_disabled);
+ WARN_ON(static_branch_unlikely(&apic_sw_disabled.key));
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e9688a9f7b57..9ae6168d381e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -44,9 +44,8 @@
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3
-#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | \
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE | \
- X86_CR4_LA57)
+#define KVM_MMU_CR4_ROLE_BITS (X86_CR4_PSE | X86_CR4_PAE | X86_CR4_LA57 | \
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE)
#define KVM_MMU_CR0_ROLE_BITS (X86_CR0_PG | X86_CR0_WP)
@@ -80,6 +79,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
{
@@ -114,17 +114,91 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
vcpu->arch.mmu->shadow_root_level);
}
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- bool prefault);
+struct kvm_page_fault {
+ /* arguments to kvm_mmu_do_page_fault. */
+ const gpa_t addr;
+ const u32 error_code;
+ const bool prefetch;
+
+ /* Derived from error_code. */
+ const bool exec;
+ const bool write;
+ const bool present;
+ const bool rsvd;
+ const bool user;
+
+ /* Derived from mmu and global state. */
+ const bool is_tdp;
+ const bool nx_huge_page_workaround_enabled;
+
+ /*
+ * Whether a >4KB mapping can be created or is forbidden due to NX
+ * hugepages.
+ */
+ bool huge_page_disallowed;
+
+ /*
+ * Maximum page size that can be created for this fault; input to
+ * FNAME(fetch), __direct_map and kvm_tdp_mmu_map.
+ */
+ u8 max_level;
+
+ /*
+ * Page size that can be created based on the max_level and the
+ * page size used by the host mapping.
+ */
+ u8 req_level;
+
+ /*
+ * Page size that will be created based on the req_level and
+ * huge_page_disallowed.
+ */
+ u8 goal_level;
+
+ /* Shifted addr, or result of guest page table walk if addr is a gva. */
+ gfn_t gfn;
+
+ /* The memslot containing gfn. May be NULL. */
+ struct kvm_memory_slot *slot;
+
+ /* Outputs of kvm_faultin_pfn. */
+ kvm_pfn_t pfn;
+ hva_t hva;
+ bool map_writable;
+};
+
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+
+extern int nx_huge_pages;
+static inline bool is_nx_huge_page_enabled(void)
+{
+ return READ_ONCE(nx_huge_pages);
+}
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 err, bool prefault)
+ u32 err, bool prefetch)
{
+ struct kvm_page_fault fault = {
+ .addr = cr2_or_gpa,
+ .error_code = err,
+ .exec = err & PFERR_FETCH_MASK,
+ .write = err & PFERR_WRITE_MASK,
+ .present = err & PFERR_PRESENT_MASK,
+ .rsvd = err & PFERR_RSVD_MASK,
+ .user = err & PFERR_USER_MASK,
+ .prefetch = prefetch,
+ .is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
+ .nx_huge_page_workaround_enabled = is_nx_huge_page_enabled(),
+
+ .max_level = KVM_MAX_HUGEPAGE_LEVEL,
+ .req_level = PG_LEVEL_4K,
+ .goal_level = PG_LEVEL_4K,
+ };
#ifdef CONFIG_RETPOLINE
- if (likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault))
- return kvm_tdp_page_fault(vcpu, cr2_or_gpa, err, prefault);
+ if (fault.is_tdp)
+ return kvm_tdp_page_fault(vcpu, &fault);
#endif
- return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault);
+ return vcpu->arch.mmu->page_fault(vcpu, &fault);
}
/*
@@ -230,14 +304,26 @@ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
-static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
+static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
{
/*
- * Read memslot_have_rmaps before rmap pointers. Hence, threads reading
- * memslots_have_rmaps in any lock context are guaranteed to see the
- * pointers. Pairs with smp_store_release in alloc_all_memslots_rmaps.
+ * Read shadow_root_allocated before related pointers. Hence, threads
+ * reading shadow_root_allocated in any lock context are guaranteed to
+ * see the pointers. Pairs with smp_store_release in
+ * mmu_first_shadow_root_alloc.
*/
- return smp_load_acquire(&kvm->arch.memslots_have_rmaps);
+ return smp_load_acquire(&kvm->arch.shadow_root_allocated);
+}
+
+#ifdef CONFIG_X86_64
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
+#else
+static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
+#endif
+
+static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
+{
+ return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm);
}
static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2d7e61122af8..323b5057d08f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -58,6 +58,7 @@
extern bool itlb_multihit_kvm_mitigation;
int __read_mostly nx_huge_pages = -1;
+static uint __read_mostly nx_huge_pages_recovery_period_ms;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
@@ -66,23 +67,26 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif
static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
-static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
+static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops nx_huge_pages_ops = {
.set = set_nx_huge_pages,
.get = param_get_bool,
};
-static const struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
- .set = set_nx_huge_pages_recovery_ratio,
+static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
+ .set = set_nx_huge_pages_recovery_param,
.get = param_get_uint,
};
module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
-module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
+module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
&nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
+module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
+ &nx_huge_pages_recovery_period_ms, 0644);
+__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
static bool __read_mostly force_flush_and_sync_on_reuse;
module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
@@ -1071,20 +1075,6 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu)
return kvm_mmu_memory_cache_nr_free_objects(mc);
}
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
-{
- struct kvm_memory_slot *slot;
- struct kvm_mmu_page *sp;
- struct kvm_rmap_head *rmap_head;
-
- sp = sptep_to_sp(spte);
- kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
- return pte_list_add(vcpu, spte, rmap_head);
-}
-
-
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
struct kvm_memslots *slots;
@@ -1097,9 +1087,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
/*
- * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the
- * context of a vCPU so have to determine which memslots to use based
- * on context information in sp->role.
+ * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
+ * so we have to determine which memslots to use based on context
+ * information in sp->role.
*/
slots = kvm_memslots_for_spte_role(kvm, sp->role);
@@ -1639,19 +1629,23 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
#define RMAP_RECYCLE_THRESHOLD 1000
-static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+ u64 *spte, gfn_t gfn)
{
- struct kvm_memory_slot *slot;
- struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *sp;
+ struct kvm_rmap_head *rmap_head;
+ int rmap_count;
sp = sptep_to_sp(spte);
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+ rmap_count = pte_list_add(vcpu, spte, rmap_head);
- kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
- KVM_PAGES_PER_HPAGE(sp->role.level));
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
+ kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
+ kvm_flush_remote_tlbs_with_address(
+ vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
+ }
}
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -1795,7 +1789,7 @@ static void mark_unsync(u64 *spte)
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
- return 0;
+ return -1;
}
#define KVM_PAGE_ARRAY_NR 16
@@ -1909,12 +1903,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
- if (vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
+ int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
+
+ if (ret < 0) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return false;
}
- return true;
+ return !!ret;
}
static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
@@ -1931,17 +1927,6 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
return true;
}
-static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
- struct list_head *invalid_list,
- bool remote_flush, bool local_flush)
-{
- if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
- return;
-
- if (local_flush)
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
-}
-
#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
@@ -2027,8 +2012,8 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
} while (!sp->unsync_children);
}
-static void mmu_sync_children(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *parent)
+static int mmu_sync_children(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *parent, bool can_yield)
{
int i;
struct kvm_mmu_page *sp;
@@ -2044,7 +2029,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
protected |= rmap_write_protect(vcpu, sp->gfn);
if (protected) {
- kvm_flush_remote_tlbs(vcpu->kvm);
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true);
flush = false;
}
@@ -2054,13 +2039,19 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
mmu_pages_clear_parents(&parents);
}
if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
- kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ if (!can_yield) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ return -EINTR;
+ }
+
cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
flush = false;
}
}
- kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
+ return 0;
}
static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
@@ -2143,12 +2134,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
break;
WARN_ON(!list_empty(&invalid_list));
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ kvm_flush_remote_tlbs(vcpu->kvm);
}
- if (sp->unsync_children)
- kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
-
__clear_sp_write_flooding_count(sp);
trace_get_page:
@@ -2226,7 +2214,7 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
u64 spte)
{
- if (is_last_spte(spte, iterator->level)) {
+ if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
iterator->level = 0;
return;
}
@@ -2588,7 +2576,8 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
* were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
* be write-protected.
*/
-int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
+int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+ gfn_t gfn, bool can_unsync, bool prefetch)
{
struct kvm_mmu_page *sp;
bool locked = false;
@@ -2598,7 +2587,7 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
* track machinery is used to write-protect upper-level shadow pages,
* i.e. this guards the role.level == 4K assertion below!
*/
- if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ if (kvm_slot_page_track_is_active(vcpu, slot, gfn, KVM_PAGE_TRACK_WRITE))
return -EPERM;
/*
@@ -2614,6 +2603,9 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
if (sp->unsync)
continue;
+ if (prefetch)
+ return -EEXIST;
+
/*
* TDP MMU page faults require an additional spinlock as they
* run with mmu_lock held for read, not write, and the unsync
@@ -2677,48 +2669,30 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync)
* (sp->unsync = true)
*
* The write barrier below ensures that 1.1 happens before 1.2 and thus
- * the situation in 2.4 does not arise. The implicit barrier in 2.2
- * pairs with this write barrier.
+ * the situation in 2.4 does not arise. It pairs with the read barrier
+ * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
*/
smp_wmb();
return 0;
}
-static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned int pte_access, int level,
- gfn_t gfn, kvm_pfn_t pfn, bool speculative,
- bool can_unsync, bool host_writable)
-{
- u64 spte;
- struct kvm_mmu_page *sp;
- int ret;
-
- sp = sptep_to_sp(sptep);
-
- ret = make_spte(vcpu, pte_access, level, gfn, pfn, *sptep, speculative,
- can_unsync, host_writable, sp_ad_disabled(sp), &spte);
-
- if (spte & PT_WRITABLE_MASK)
- kvm_vcpu_mark_page_dirty(vcpu, gfn);
-
- if (*sptep == spte)
- ret |= SET_SPTE_SPURIOUS;
- else if (mmu_spte_update(sptep, spte))
- ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
- return ret;
-}
-
-static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned int pte_access, bool write_fault, int level,
- gfn_t gfn, kvm_pfn_t pfn, bool speculative,
- bool host_writable)
+static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+ u64 *sptep, unsigned int pte_access, gfn_t gfn,
+ kvm_pfn_t pfn, struct kvm_page_fault *fault)
{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ int level = sp->role.level;
int was_rmapped = 0;
- int rmap_count;
- int set_spte_ret;
int ret = RET_PF_FIXED;
bool flush = false;
+ bool wrprot;
+ u64 spte;
+
+ /* Prefetching always gets a writable pfn. */
+ bool host_writable = !fault || fault->map_writable;
+ bool prefetch = !fault || fault->prefetch;
+ bool write_fault = fault && fault->write;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
@@ -2749,52 +2723,36 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
was_rmapped = 1;
}
- set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
- speculative, true, host_writable);
- if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
+ wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch,
+ true, host_writable, &spte);
+
+ if (*sptep == spte) {
+ ret = RET_PF_SPURIOUS;
+ } else {
+ trace_kvm_mmu_set_spte(level, gfn, sptep);
+ flush |= mmu_spte_update(sptep, spte);
+ }
+
+ if (wrprot) {
if (write_fault)
ret = RET_PF_EMULATE;
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
- if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
+ if (flush)
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
KVM_PAGES_PER_HPAGE(level));
- /*
- * The fault is fully spurious if and only if the new SPTE and old SPTE
- * are identical, and emulation is not required.
- */
- if ((set_spte_ret & SET_SPTE_SPURIOUS) && ret == RET_PF_FIXED) {
- WARN_ON_ONCE(!was_rmapped);
- return RET_PF_SPURIOUS;
- }
-
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
- trace_kvm_mmu_set_spte(level, gfn, sptep);
if (!was_rmapped) {
+ WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
kvm_update_page_stats(vcpu->kvm, level, 1);
- rmap_count = rmap_add(vcpu, sptep, gfn);
- if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, sptep, gfn);
+ rmap_add(vcpu, slot, sptep, gfn);
}
return ret;
}
-static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
- bool no_dirty_log)
-{
- struct kvm_memory_slot *slot;
-
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
- if (!slot)
- return KVM_PFN_ERR_FAULT;
-
- return gfn_to_pfn_memslot_atomic(slot, gfn);
-}
-
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -2815,8 +2773,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;
for (i = 0; i < ret; i++, gfn++, start++) {
- mmu_set_spte(vcpu, start, access, false, sp->role.level, gfn,
- page_to_pfn(pages[i]), true, true);
+ mmu_set_spte(vcpu, slot, start, access, gfn,
+ page_to_pfn(pages[i]), NULL);
put_page(pages[i]);
}
@@ -2839,11 +2797,13 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
if (!start)
continue;
if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
- break;
+ return;
start = NULL;
} else if (!start)
start = spte;
}
+ if (start)
+ direct_pte_prefetch_many(vcpu, sp, start, spte);
}
static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
@@ -2921,52 +2881,46 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
return min(host_level, max_level);
}
-int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
- int max_level, kvm_pfn_t *pfnp,
- bool huge_page_disallowed, int *req_level)
+void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- struct kvm_memory_slot *slot;
- kvm_pfn_t pfn = *pfnp;
+ struct kvm_memory_slot *slot = fault->slot;
kvm_pfn_t mask;
- int level;
- *req_level = PG_LEVEL_4K;
+ fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
- if (unlikely(max_level == PG_LEVEL_4K))
- return PG_LEVEL_4K;
+ if (unlikely(fault->max_level == PG_LEVEL_4K))
+ return;
- if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
- return PG_LEVEL_4K;
+ if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn))
+ return;
- slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true);
- if (!slot)
- return PG_LEVEL_4K;
+ if (kvm_slot_dirty_track_enabled(slot))
+ return;
/*
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
- *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
- if (level == PG_LEVEL_4K || huge_page_disallowed)
- return PG_LEVEL_4K;
+ fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
+ fault->gfn, fault->pfn,
+ fault->max_level);
+ if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
+ return;
/*
* mmu_notifier_retry() was successful and mmu_lock is held, so
* the pmd can't be split from under us.
*/
- mask = KVM_PAGES_PER_HPAGE(level) - 1;
- VM_BUG_ON((gfn & mask) != (pfn & mask));
- *pfnp = pfn & ~mask;
-
- return level;
+ fault->goal_level = fault->req_level;
+ mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1;
+ VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
+ fault->pfn &= ~mask;
}
-void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
- kvm_pfn_t *pfnp, int *goal_levelp)
+void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level)
{
- int level = *goal_levelp;
-
- if (cur_level == level && level > PG_LEVEL_4K &&
+ if (cur_level > PG_LEVEL_4K &&
+ cur_level == fault->goal_level &&
is_shadow_present_pte(spte) &&
!is_large_pte(spte)) {
/*
@@ -2976,42 +2930,33 @@ void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
* patching back for them into pfn the next 9 bits of
* the address.
*/
- u64 page_mask = KVM_PAGES_PER_HPAGE(level) -
- KVM_PAGES_PER_HPAGE(level - 1);
- *pfnp |= gfn & page_mask;
- (*goal_levelp)--;
+ u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
+ KVM_PAGES_PER_HPAGE(cur_level - 1);
+ fault->pfn |= fault->gfn & page_mask;
+ fault->goal_level--;
}
}
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault, bool is_tdp)
+static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
- bool write = error_code & PFERR_WRITE_MASK;
- bool exec = error_code & PFERR_FETCH_MASK;
- bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int level, req_level, ret;
- gfn_t gfn = gpa >> PAGE_SHIFT;
- gfn_t base_gfn = gfn;
+ int ret;
+ gfn_t base_gfn = fault->gfn;
- level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
- huge_page_disallowed, &req_level);
+ kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(gpa, level, pfn);
- for_each_shadow_entry(vcpu, gpa, it) {
+ trace_kvm_mmu_spte_requested(fault);
+ for_each_shadow_entry(vcpu, fault->addr, it) {
/*
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
- if (nx_huge_page_workaround_enabled)
- disallowed_hugepage_adjust(*it.sptep, gfn, it.level,
- &pfn, &level);
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, *it.sptep, it.level);
- base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
- if (it.level == level)
+ base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == fault->goal_level)
break;
drop_large_spte(vcpu, it.sptep);
@@ -3022,14 +2967,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
it.level - 1, true, ACC_ALL);
link_shadow_page(vcpu, it.sptep, sp);
- if (is_tdp && huge_page_disallowed &&
- req_level >= it.level)
+ if (fault->is_tdp && fault->huge_page_disallowed &&
+ fault->req_level >= it.level)
account_huge_nx_page(vcpu->kvm, sp);
}
- ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
- write, level, base_gfn, pfn, prefault,
- map_writable);
+ if (WARN_ON_ONCE(it.level != fault->goal_level))
+ return -EFAULT;
+
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
+ base_gfn, fault->pfn, fault);
if (ret == RET_PF_SPURIOUS)
return ret;
@@ -3061,18 +3008,19 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
return -EFAULT;
}
-static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
- kvm_pfn_t pfn, unsigned int access,
- int *ret_val)
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ unsigned int access, int *ret_val)
{
/* The pfn is invalid, report the error! */
- if (unlikely(is_error_pfn(pfn))) {
- *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+ if (unlikely(is_error_pfn(fault->pfn))) {
+ *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn);
return true;
}
- if (unlikely(is_noslot_pfn(pfn))) {
- vcpu_cache_mmio_info(vcpu, gva, gfn,
+ if (unlikely(!fault->slot)) {
+ gva_t gva = fault->is_tdp ? 0 : fault->addr;
+
+ vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
access & shadow_mmio_access_mask);
/*
* If MMIO caching is disabled, emulate immediately without
@@ -3088,18 +3036,17 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
return false;
}
-static bool page_fault_can_be_fast(u32 error_code)
+static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
{
/*
* Do not fix the mmio spte with invalid generation number which
* need to be updated by slow page fault path.
*/
- if (unlikely(error_code & PFERR_RSVD_MASK))
+ if (fault->rsvd)
return false;
/* See if the page fault is due to an NX violation */
- if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
- == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
+ if (unlikely(fault->exec && fault->present))
return false;
/*
@@ -3116,9 +3063,7 @@ static bool page_fault_can_be_fast(u32 error_code)
* accesses to a present page.
*/
- return shadow_acc_track_mask != 0 ||
- ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
- == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
+ return shadow_acc_track_mask != 0 || (fault->write && fault->present);
}
/*
@@ -3126,13 +3071,9 @@ static bool page_fault_can_be_fast(u32 error_code)
* someone else modified the SPTE from its original value.
*/
static bool
-fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
u64 *sptep, u64 old_spte, u64 new_spte)
{
- gfn_t gfn;
-
- WARN_ON(!sp->role.direct);
-
/*
* Theoretically we could also set dirty bit (and flush TLB) here in
* order to eliminate unnecessary PML logging. See comments in
@@ -3148,24 +3089,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
return false;
- if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
- /*
- * The gfn of direct spte is stable since it is
- * calculated by sp->gfn.
- */
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- kvm_vcpu_mark_page_dirty(vcpu, gfn);
- }
+ if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
+ mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);
return true;
}
-static bool is_access_allowed(u32 fault_err_code, u64 spte)
+static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
{
- if (fault_err_code & PFERR_FETCH_MASK)
+ if (fault->exec)
return is_executable_pte(spte);
- if (fault_err_code & PFERR_WRITE_MASK)
+ if (fault->write)
return is_writable_pte(spte);
/* Fault was on Read access */
@@ -3190,9 +3125,6 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
sptep = iterator.sptep;
*spte = old_spte;
-
- if (!is_shadow_present_pte(old_spte))
- break;
}
return sptep;
@@ -3201,7 +3133,7 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
/*
* Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
*/
-static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
+static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_mmu_page *sp;
int ret = RET_PF_INVALID;
@@ -3209,7 +3141,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
u64 *sptep = NULL;
uint retry_count = 0;
- if (!page_fault_can_be_fast(error_code))
+ if (!page_fault_can_be_fast(fault))
return ret;
walk_shadow_page_lockless_begin(vcpu);
@@ -3218,9 +3150,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
u64 new_spte;
if (is_tdp_mmu(vcpu->arch.mmu))
- sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte);
+ sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
else
- sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte);
+ sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
if (!is_shadow_present_pte(spte))
break;
@@ -3239,7 +3171,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
* Need not check the access of upper level table entries since
* they are always ACC_ALL.
*/
- if (is_access_allowed(error_code, spte)) {
+ if (is_access_allowed(fault, spte)) {
ret = RET_PF_SPURIOUS;
break;
}
@@ -3254,7 +3186,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
* be removed in the fast path only if the SPTE was
* write-protected for dirty-logging or access tracking.
*/
- if ((error_code & PFERR_WRITE_MASK) &&
+ if (fault->write &&
spte_can_locklessly_be_made_writable(spte)) {
new_spte |= PT_WRITABLE_MASK;
@@ -3275,7 +3207,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
/* Verify that the fault can be handled in the fast path */
if (new_spte == spte ||
- !is_access_allowed(error_code, new_spte))
+ !is_access_allowed(fault, new_spte))
break;
/*
@@ -3283,7 +3215,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
* since the gfn is not stable for indirect shadow page. See
* Documentation/virt/kvm/locking.rst to get more detail.
*/
- if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) {
+ if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) {
ret = RET_PF_FIXED;
break;
}
@@ -3296,7 +3228,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
} while (true);
- trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret);
+ trace_fast_page_fault(vcpu, fault, sptep, spte, ret);
walk_shadow_page_lockless_end(vcpu);
return ret;
@@ -3469,6 +3401,67 @@ out_unlock:
return r;
}
+static int mmu_first_shadow_root_alloc(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r = 0, i;
+
+ /*
+ * Check if this is the first shadow root being allocated before
+ * taking the lock.
+ */
+ if (kvm_shadow_root_allocated(kvm))
+ return 0;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /* Recheck, under the lock, whether this is the first shadow root. */
+ if (kvm_shadow_root_allocated(kvm))
+ goto out_unlock;
+
+ /*
+ * Check if anything actually needs to be allocated, e.g. all metadata
+ * will be allocated upfront if TDP is disabled.
+ */
+ if (kvm_memslots_have_rmaps(kvm) &&
+ kvm_page_track_write_tracking_enabled(kvm))
+ goto out_success;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, slots) {
+ /*
+ * Both of these functions are no-ops if the target is
+ * already allocated, so unconditionally calling both
+ * is safe. Intentionally do NOT free allocations on
+ * failure to avoid having to track which allocations
+ * were made now versus when the memslot was created.
+ * The metadata is guaranteed to be freed when the slot
+ * is freed, and will be kept/used if userspace retries
+ * KVM_RUN instead of killing the VM.
+ */
+ r = memslot_rmap_alloc(slot, slot->npages);
+ if (r)
+ goto out_unlock;
+ r = kvm_page_track_write_tracking_alloc(slot);
+ if (r)
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * Ensure that shadow_root_allocated becomes true strictly after
+ * all the related pointers are set.
+ */
+out_success:
+ smp_store_release(&kvm->arch.shadow_root_allocated, true);
+
+out_unlock:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+}
+
static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -3499,7 +3492,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
}
- r = alloc_all_memslots_rmaps(vcpu->kvm);
+ r = mmu_first_shadow_root_alloc(vcpu->kvm);
if (r)
return r;
@@ -3650,6 +3643,33 @@ err_pml4:
#endif
}
+static bool is_unsync_root(hpa_t root)
+{
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(root))
+ return false;
+
+ /*
+ * The read barrier orders the CPU's read of SPTE.W during the page table
+ * walk before the reads of sp->unsync/sp->unsync_children here.
+ *
+ * Even if another CPU was marking the SP as unsync-ed simultaneously,
+ * any guest page table changes are not guaranteed to be visible anyway
+ * until this VCPU issues a TLB flush strictly after those changes are
+ * made. We only need to ensure that the other CPU sets these flags
+ * before any actual changes to the page tables are made. The comments
+ * in mmu_try_to_unsync_pages() describe what could go wrong if this
+ * requirement isn't satisfied.
+ */
+ smp_rmb();
+ sp = to_shadow_page(root);
+ if (sp->unsync || sp->unsync_children)
+ return true;
+
+ return false;
+}
+
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
int i;
@@ -3667,24 +3687,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu->root_hpa;
sp = to_shadow_page(root);
- /*
- * Even if another CPU was marking the SP as unsync-ed
- * simultaneously, any guest page table changes are not
- * guaranteed to be visible anyway until this VCPU issues a TLB
- * flush strictly after those changes are made. We only need to
- * ensure that the other CPU sets these flags before any actual
- * changes to the page tables are made. The comments in
- * mmu_try_to_unsync_pages() describe what could go wrong if
- * this requirement isn't satisfied.
- */
- if (!smp_load_acquire(&sp->unsync) &&
- !smp_load_acquire(&sp->unsync_children))
+ if (!is_unsync_root(root))
return;
write_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
- mmu_sync_children(vcpu, sp);
+ mmu_sync_children(vcpu, sp, true);
kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
write_unlock(&vcpu->kvm->mmu_lock);
@@ -3700,7 +3709,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
if (IS_VALID_PAE_ROOT(root)) {
root &= PT64_BASE_ADDR_MASK;
sp = to_shadow_page(root);
- mmu_sync_children(vcpu, sp);
+ mmu_sync_children(vcpu, sp, true);
}
}
@@ -3708,6 +3717,19 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
write_unlock(&vcpu->kvm->mmu_lock);
}
+void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
+{
+ unsigned long roots_to_free = 0;
+ int i;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ /* sync prev_roots by simply freeing them */
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
+}
+
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
u32 access, struct x86_exception *exception)
{
@@ -3760,9 +3782,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level
spte = mmu_spte_get_lockless(iterator.sptep);
sptes[leaf] = spte;
-
- if (!is_shadow_present_pte(spte))
- break;
}
return leaf;
@@ -3853,20 +3872,19 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
}
static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
- u32 error_code, gfn_t gfn)
+ struct kvm_page_fault *fault)
{
- if (unlikely(error_code & PFERR_RSVD_MASK))
+ if (unlikely(fault->rsvd))
return false;
- if (!(error_code & PFERR_PRESENT_MASK) ||
- !(error_code & PFERR_WRITE_MASK))
+ if (!fault->present || !fault->write)
return false;
/*
* guest is writing the page which is write tracked which can
* not be fixed by page fault handler.
*/
- if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ if (kvm_slot_page_track_is_active(vcpu, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE))
return true;
return false;
@@ -3878,11 +3896,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
u64 spte;
walk_shadow_page_lockless_begin(vcpu);
- for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
clear_sp_write_flooding_count(iterator.sptep);
- if (!is_shadow_present_pte(spte))
- break;
- }
walk_shadow_page_lockless_end(vcpu);
}
@@ -3900,11 +3915,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
-static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
- bool write, bool *writable, int *r)
+static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r)
{
- struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ struct kvm_memory_slot *slot = fault->slot;
bool async;
/*
@@ -3918,8 +3931,9 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
if (!kvm_is_visible_memslot(slot)) {
/* Don't expose private memslots to L2. */
if (is_guest_mode(vcpu)) {
- *pfn = KVM_PFN_NOSLOT;
- *writable = false;
+ fault->slot = NULL;
+ fault->pfn = KVM_PFN_NOSLOT;
+ fault->map_writable = false;
return false;
}
/*
@@ -3936,46 +3950,46 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
}
async = false;
- *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
- write, writable, hva);
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
+ fault->write, &fault->map_writable,
+ &fault->hva);
if (!async)
return false; /* *pfn has correct page already */
- if (!prefault && kvm_can_do_async_pf(vcpu)) {
- trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
- if (kvm_find_async_pf_gfn(vcpu, gfn)) {
- trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
+ if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
+ trace_kvm_try_async_get_page(fault->addr, fault->gfn);
+ if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) {
+ trace_kvm_async_pf_doublefault(fault->addr, fault->gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
goto out_retry;
- } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
+ } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn))
goto out_retry;
}
- *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
- write, writable, hva);
+ fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL,
+ fault->write, &fault->map_writable,
+ &fault->hva);
+ return false;
out_retry:
*r = RET_PF_RETRY;
return true;
}
-static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- bool prefault, int max_level, bool is_tdp)
+static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
- bool write = error_code & PFERR_WRITE_MASK;
- bool map_writable;
- gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
- kvm_pfn_t pfn;
- hva_t hva;
int r;
- if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ fault->gfn = fault->addr >> PAGE_SHIFT;
+ fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
+
+ if (page_fault_handle_page_track(vcpu, fault))
return RET_PF_EMULATE;
- r = fast_page_fault(vcpu, gpa, error_code);
+ r = fast_page_fault(vcpu, fault);
if (r != RET_PF_INVALID)
return r;
@@ -3986,11 +4000,10 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva,
- write, &map_writable, &r))
+ if (kvm_faultin_pfn(vcpu, fault, &r))
return r;
- if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
+ if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r))
return r;
r = RET_PF_RETRY;
@@ -4000,36 +4013,34 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
else
write_lock(&vcpu->kvm->mmu_lock);
- if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
+ if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva))
goto out_unlock;
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
if (is_tdp_mmu_fault)
- r = kvm_tdp_mmu_map(vcpu, gpa, error_code, map_writable, max_level,
- pfn, prefault);
+ r = kvm_tdp_mmu_map(vcpu, fault);
else
- r = __direct_map(vcpu, gpa, error_code, map_writable, max_level, pfn,
- prefault, is_tdp);
+ r = __direct_map(vcpu, fault);
out_unlock:
if (is_tdp_mmu_fault)
read_unlock(&vcpu->kvm->mmu_lock);
else
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
+ kvm_release_pfn_clean(fault->pfn);
return r;
}
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
- u32 error_code, bool prefault)
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
{
- pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
+ pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code);
/* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
- return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
- PG_LEVEL_2M, false);
+ fault->max_level = PG_LEVEL_2M;
+ return direct_page_fault(vcpu, fault);
}
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
@@ -4065,23 +4076,19 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
-int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- bool prefault)
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- int max_level;
-
- for (max_level = KVM_MAX_HUGEPAGE_LEVEL;
- max_level > PG_LEVEL_4K;
- max_level--) {
- int page_num = KVM_PAGES_PER_HPAGE(max_level);
- gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
+ while (fault->max_level > PG_LEVEL_4K) {
+ int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
+ gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
break;
+
+ --fault->max_level;
}
- return direct_page_fault(vcpu, gpa, error_code, prefault,
- max_level, true);
+ return direct_page_fault(vcpu, fault);
}
static void nonpaging_init_context(struct kvm_mmu *context)
@@ -4202,7 +4209,7 @@ static unsigned long get_cr3(struct kvm_vcpu *vcpu)
}
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- unsigned int access, int *nr_present)
+ unsigned int access)
{
if (unlikely(is_mmio_spte(*sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -4210,7 +4217,6 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
return true;
}
- (*nr_present)++;
mark_mmio_spte(vcpu, sptep, gfn, access);
return true;
}
@@ -4593,10 +4599,10 @@ static void update_pkru_bitmask(struct kvm_mmu *mmu)
unsigned bit;
bool wp;
- if (!is_cr4_pke(mmu)) {
- mmu->pkru_mask = 0;
+ mmu->pkru_mask = 0;
+
+ if (!is_cr4_pke(mmu))
return;
- }
wp = is_cr0_wp(mmu);
@@ -5209,7 +5215,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
LIST_HEAD(invalid_list);
u64 entry, gentry, *spte;
int npte;
- bool remote_flush, local_flush;
+ bool flush = false;
/*
* If we don't have indirect shadow pages, it means no page is
@@ -5218,8 +5224,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
return;
- remote_flush = local_flush = false;
-
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
/*
@@ -5248,18 +5252,17 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (!spte)
continue;
- local_flush = true;
while (npte--) {
entry = *spte;
mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL);
if (gentry && sp->role.level != PG_LEVEL_4K)
++vcpu->kvm->stat.mmu_pde_zapped;
if (need_remote_flush(entry, *spte))
- remote_flush = true;
+ flush = true;
++spte;
}
}
- kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
+ kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush);
kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
write_unlock(&vcpu->kvm->mmu_lock);
}
@@ -5470,8 +5473,8 @@ slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
}
static __always_inline bool
-slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool flush_on_yield)
+slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool flush_on_yield)
{
return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
PG_LEVEL_4K, flush_on_yield);
@@ -5691,13 +5694,7 @@ void kvm_mmu_init_vm(struct kvm *kvm)
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
- if (!kvm_mmu_init_tdp_mmu(kvm))
- /*
- * No smp_load/store wrappers needed here as we are in
- * VM init and there cannot be any memslots / other threads
- * accessing this struct kvm yet.
- */
- kvm->arch.memslots_have_rmaps = true;
+ kvm_mmu_init_tdp_mmu(kvm);
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
@@ -5713,55 +5710,58 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
kvm_mmu_uninit_tdp_mmu(kvm);
}
+static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ const struct kvm_memory_slot *memslot;
+ struct kvm_memslots *slots;
+ bool flush = false;
+ gfn_t start, end;
+ int i;
+
+ if (!kvm_memslots_have_rmaps(kvm))
+ return flush;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(memslot, slots) {
+ start = max(gfn_start, memslot->base_gfn);
+ end = min(gfn_end, memslot->base_gfn + memslot->npages);
+ if (start >= end)
+ continue;
+
+ flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true, flush);
+ }
+ }
+
+ return flush;
+}
+
/*
* Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
* (not including it)
*/
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *memslot;
+ bool flush;
int i;
- bool flush = false;
write_lock(&kvm->mmu_lock);
kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
- if (kvm_memslots_have_rmaps(kvm)) {
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(memslot, slots) {
- gfn_t start, end;
-
- start = max(gfn_start, memslot->base_gfn);
- end = min(gfn_end, memslot->base_gfn + memslot->npages);
- if (start >= end)
- continue;
-
- flush = slot_handle_level_range(kvm,
- (const struct kvm_memory_slot *) memslot,
- kvm_zap_rmapp, PG_LEVEL_4K,
- KVM_MAX_HUGEPAGE_LEVEL, start,
- end - 1, true, flush);
- }
- }
- if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
- gfn_end - gfn_start);
- }
+ flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end);
if (is_tdp_mmu_enabled(kvm)) {
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
gfn_end, flush);
- if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
- gfn_end - gfn_start);
}
if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+ gfn_end - gfn_start);
kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
@@ -5857,7 +5857,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
- flush = slot_handle_leaf(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
+ /*
+ * Zap only 4k SPTEs since the legacy MMU only supports dirty
+ * logging at a 4k granularity and never creates collapsible
+ * 2m SPTEs during dirty logging.
+ */
+ flush = slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
write_unlock(&kvm->mmu_lock);
@@ -5894,8 +5899,11 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
- flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty,
- false);
+ /*
+ * Clear dirty bits only on 4k SPTEs since the legacy MMU only
+ * support dirty logging at a 4k granularity.
+ */
+ flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false);
write_unlock(&kvm->mmu_lock);
}
@@ -6173,18 +6181,24 @@ void kvm_mmu_module_exit(void)
mmu_audit_disable();
}
-static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
+static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
{
- unsigned int old_val;
+ bool was_recovery_enabled, is_recovery_enabled;
+ uint old_period, new_period;
int err;
- old_val = nx_huge_pages_recovery_ratio;
+ was_recovery_enabled = nx_huge_pages_recovery_ratio;
+ old_period = nx_huge_pages_recovery_period_ms;
+
err = param_set_uint(val, kp);
if (err)
return err;
- if (READ_ONCE(nx_huge_pages) &&
- !old_val && nx_huge_pages_recovery_ratio) {
+ is_recovery_enabled = nx_huge_pages_recovery_ratio;
+ new_period = nx_huge_pages_recovery_period_ms;
+
+ if (READ_ONCE(nx_huge_pages) && is_recovery_enabled &&
+ (!was_recovery_enabled || old_period > new_period)) {
struct kvm *kvm;
mutex_lock(&kvm_lock);
@@ -6247,8 +6261,17 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
static long get_nx_lpage_recovery_timeout(u64 start_time)
{
- return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
- ? start_time + 60 * HZ - get_jiffies_64()
+ uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
+ uint period = READ_ONCE(nx_huge_pages_recovery_period_ms);
+
+ if (!period && ratio) {
+ /* Make sure the period is not less than one second. */
+ ratio = min(ratio, 3600u);
+ period = 60 * 60 * 1000 / ratio;
+ }
+
+ return READ_ONCE(nx_huge_pages) && ratio
+ ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
: MAX_SCHEDULE_TIMEOUT;
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index bf2bdbf333c2..52c6527b1a06 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -118,13 +118,8 @@ static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
kvm_x86_ops.cpu_dirty_log_size;
}
-extern int nx_huge_pages;
-static inline bool is_nx_huge_page_enabled(void)
-{
- return READ_ONCE(nx_huge_pages);
-}
-
-int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync);
+int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
+ gfn_t gfn, bool can_unsync, bool prefetch);
void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
@@ -155,19 +150,11 @@ enum {
RET_PF_SPURIOUS,
};
-/* Bits which may be returned by set_spte() */
-#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
-#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
-#define SET_SPTE_SPURIOUS BIT(2)
-
int kvm_mmu_max_mapping_level(struct kvm *kvm,
const struct kvm_memory_slot *slot, gfn_t gfn,
kvm_pfn_t pfn, int max_level);
-int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
- int max_level, kvm_pfn_t *pfnp,
- bool huge_page_disallowed, int *req_level);
-void disallowed_hugepage_adjust(u64 spte, gfn_t gfn, int cur_level,
- kvm_pfn_t *pfnp, int *goal_levelp);
+void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
+void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 2924a4081a19..b8151bbca36a 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -252,9 +252,9 @@ TRACE_EVENT(
TRACE_EVENT(
fast_page_fault,
- TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
+ TP_PROTO(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
u64 *sptep, u64 old_spte, int ret),
- TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, ret),
+ TP_ARGS(vcpu, fault, sptep, old_spte, ret),
TP_STRUCT__entry(
__field(int, vcpu_id)
@@ -268,8 +268,8 @@ TRACE_EVENT(
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
- __entry->cr2_or_gpa = cr2_or_gpa;
- __entry->error_code = error_code;
+ __entry->cr2_or_gpa = fault->addr;
+ __entry->error_code = fault->error_code;
__entry->sptep = sptep;
__entry->old_spte = old_spte;
__entry->new_spte = *sptep;
@@ -367,8 +367,8 @@ TRACE_EVENT(
TRACE_EVENT(
kvm_mmu_spte_requested,
- TP_PROTO(gpa_t addr, int level, kvm_pfn_t pfn),
- TP_ARGS(addr, level, pfn),
+ TP_PROTO(struct kvm_page_fault *fault),
+ TP_ARGS(fault),
TP_STRUCT__entry(
__field(u64, gfn)
@@ -377,9 +377,9 @@ TRACE_EVENT(
),
TP_fast_assign(
- __entry->gfn = addr >> PAGE_SHIFT;
- __entry->pfn = pfn | (__entry->gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
- __entry->level = level;
+ __entry->gfn = fault->gfn;
+ __entry->pfn = fault->pfn | (fault->gfn & (KVM_PAGES_PER_HPAGE(fault->goal_level) - 1));
+ __entry->level = fault->goal_level;
),
TP_printk("gfn %llx pfn %llx level %d",
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 269f11f92fd0..cc4eb5b7fb76 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -19,6 +19,12 @@
#include "mmu.h"
#include "mmu_internal.h"
+bool kvm_page_track_write_tracking_enabled(struct kvm *kvm)
+{
+ return IS_ENABLED(CONFIG_KVM_EXTERNAL_WRITE_TRACKING) ||
+ !tdp_enabled || kvm_shadow_root_allocated(kvm);
+}
+
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
{
int i;
@@ -29,12 +35,17 @@ void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
}
}
-int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+int kvm_page_track_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
unsigned long npages)
{
- int i;
+ int i;
for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+ if (i == KVM_PAGE_TRACK_WRITE &&
+ !kvm_page_track_write_tracking_enabled(kvm))
+ continue;
+
slot->arch.gfn_track[i] =
kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
GFP_KERNEL_ACCOUNT);
@@ -57,6 +68,21 @@ static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode)
return true;
}
+int kvm_page_track_write_tracking_alloc(struct kvm_memory_slot *slot)
+{
+ unsigned short *gfn_track;
+
+ if (slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE])
+ return 0;
+
+ gfn_track = kvcalloc(slot->npages, sizeof(*gfn_track), GFP_KERNEL_ACCOUNT);
+ if (gfn_track == NULL)
+ return -ENOMEM;
+
+ slot->arch.gfn_track[KVM_PAGE_TRACK_WRITE] = gfn_track;
+ return 0;
+}
+
static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
enum kvm_page_track_mode mode, short count)
{
@@ -92,6 +118,10 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
if (WARN_ON(!page_track_mode_is_valid(mode)))
return;
+ if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE &&
+ !kvm_page_track_write_tracking_enabled(kvm)))
+ return;
+
update_gfn_track(slot, gfn, mode, 1);
/*
@@ -126,6 +156,10 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm,
if (WARN_ON(!page_track_mode_is_valid(mode)))
return;
+ if (WARN_ON(mode == KVM_PAGE_TRACK_WRITE &&
+ !kvm_page_track_write_tracking_enabled(kvm)))
+ return;
+
update_gfn_track(slot, gfn, mode, -1);
/*
@@ -139,19 +173,22 @@ EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page);
/*
* check if the corresponding access on the specified guest page is tracked.
*/
-bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
- enum kvm_page_track_mode mode)
+bool kvm_slot_page_track_is_active(struct kvm_vcpu *vcpu,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode)
{
- struct kvm_memory_slot *slot;
int index;
if (WARN_ON(!page_track_mode_is_valid(mode)))
return false;
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (!slot)
return false;
+ if (mode == KVM_PAGE_TRACK_WRITE &&
+ !kvm_page_track_write_tracking_enabled(vcpu->kvm))
+ return false;
+
index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
}
@@ -164,13 +201,13 @@ void kvm_page_track_cleanup(struct kvm *kvm)
cleanup_srcu_struct(&head->track_srcu);
}
-void kvm_page_track_init(struct kvm *kvm)
+int kvm_page_track_init(struct kvm *kvm)
{
struct kvm_page_track_notifier_head *head;
head = &kvm->arch.track_notifier_head;
- init_srcu_struct(&head->track_srcu);
INIT_HLIST_HEAD(&head->track_notifier_list);
+ return init_srcu_struct(&head->track_srcu);
}
/*
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 7d03e9b7ccfa..f87d36898c44 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -561,6 +561,7 @@ static bool
FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, pt_element_t gpte, bool no_dirty_log)
{
+ struct kvm_memory_slot *slot;
unsigned pte_access;
gfn_t gfn;
kvm_pfn_t pfn;
@@ -573,30 +574,21 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn,
no_dirty_log && (pte_access & ACC_WRITE_MASK));
- if (is_error_pfn(pfn))
+ if (!slot)
return false;
- /*
- * we call mmu_set_spte() with host_writable = true because
- * pte_prefetch_gfn_to_pfn always gets a writable pfn.
- */
- mmu_set_spte(vcpu, spte, pte_access, false, PG_LEVEL_4K, gfn, pfn,
- true, true);
+ pfn = gfn_to_pfn_memslot_atomic(slot, gfn);
+ if (is_error_pfn(pfn))
+ return false;
+ mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL);
kvm_release_pfn_clean(pfn);
return true;
}
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte)
-{
- pt_element_t gpte = *(const pt_element_t *)pte;
-
- FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
-}
-
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
struct guest_walker *gw, int level)
{
@@ -663,21 +655,16 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
* If the guest tries to write a write-protected page, we need to
* emulate this operation, return 1 to indicate this case.
*/
-static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
- struct guest_walker *gw, u32 error_code,
- int max_level, kvm_pfn_t pfn, bool map_writable,
- bool prefault)
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ struct guest_walker *gw)
{
- bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
- bool write_fault = error_code & PFERR_WRITE_MASK;
- bool exec = error_code & PFERR_FETCH_MASK;
- bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned int direct_access, access;
- int top_level, level, req_level, ret;
- gfn_t base_gfn = gw->gfn;
+ int top_level, ret;
+ gfn_t base_gfn = fault->gfn;
+ WARN_ON_ONCE(gw->gfn != base_gfn);
direct_access = gw->pte_access;
top_level = vcpu->arch.mmu->root_level;
@@ -695,7 +682,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
goto out_gpte_changed;
- for (shadow_walk_init(&it, vcpu, addr);
+ for (shadow_walk_init(&it, vcpu, fault->addr);
shadow_walk_okay(&it) && it.level > gw->level;
shadow_walk_next(&it)) {
gfn_t table_gfn;
@@ -707,8 +694,27 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
if (!is_shadow_present_pte(*it.sptep)) {
table_gfn = gw->table_gfn[it.level - 2];
access = gw->pt_access[it.level - 2];
- sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
- false, access);
+ sp = kvm_mmu_get_page(vcpu, table_gfn, fault->addr,
+ it.level-1, false, access);
+ /*
+ * We must synchronize the pagetable before linking it
+ * because the guest doesn't need to flush tlb when
+ * the gpte is changed from non-present to present.
+ * Otherwise, the guest may use the wrong mapping.
+ *
+ * For PG_LEVEL_4K, kvm_mmu_get_page() has already
+ * synchronized it transiently via kvm_sync_page().
+ *
+ * For higher level pagetable, we synchronize it via
+ * the slower mmu_sync_children(). If it needs to
+ * break, some progress has been made; return
+ * RET_PF_RETRY and retry on the next #PF.
+ * KVM_REQ_MMU_SYNC is not necessary but it
+ * expedites the process.
+ */
+ if (sp->unsync_children &&
+ mmu_sync_children(vcpu, sp, false))
+ return RET_PF_RETRY;
}
/*
@@ -722,10 +728,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
link_shadow_page(vcpu, it.sptep, sp);
}
- level = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn,
- huge_page_disallowed, &req_level);
+ kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
+ trace_kvm_mmu_spte_requested(fault);
for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
clear_sp_write_flooding_count(it.sptep);
@@ -734,12 +739,11 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
- if (nx_huge_page_workaround_enabled)
- disallowed_hugepage_adjust(*it.sptep, gw->gfn, it.level,
- &pfn, &level);
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, *it.sptep, it.level);
- base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
- if (it.level == level)
+ base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ if (it.level == fault->goal_level)
break;
validate_direct_spte(vcpu, it.sptep, direct_access);
@@ -747,16 +751,20 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
drop_large_spte(vcpu, it.sptep);
if (!is_shadow_present_pte(*it.sptep)) {
- sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
+ sp = kvm_mmu_get_page(vcpu, base_gfn, fault->addr,
it.level - 1, true, direct_access);
link_shadow_page(vcpu, it.sptep, sp);
- if (huge_page_disallowed && req_level >= it.level)
+ if (fault->huge_page_disallowed &&
+ fault->req_level >= it.level)
account_huge_nx_page(vcpu->kvm, sp);
}
}
- ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
- it.level, base_gfn, pfn, prefault, map_writable);
+ if (WARN_ON_ONCE(it.level != fault->goal_level))
+ return -EFAULT;
+
+ ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access,
+ base_gfn, fault->pfn, fault);
if (ret == RET_PF_SPURIOUS)
return ret;
@@ -822,45 +830,40 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
* a negative value on error.
*/
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
- bool prefault)
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- bool write_fault = error_code & PFERR_WRITE_MASK;
- bool user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
int r;
- kvm_pfn_t pfn;
- hva_t hva;
unsigned long mmu_seq;
- bool map_writable, is_self_change_mapping;
- int max_level;
+ bool is_self_change_mapping;
- pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
+ pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
+ WARN_ON_ONCE(fault->is_tdp);
/*
+ * Look up the guest pte for the faulting address.
* If PFEC.RSVD is set, this is a shadow page fault.
* The bit needs to be cleared before walking guest page tables.
*/
- error_code &= ~PFERR_RSVD_MASK;
-
- /*
- * Look up the guest pte for the faulting address.
- */
- r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
+ r = FNAME(walk_addr)(&walker, vcpu, fault->addr,
+ fault->error_code & ~PFERR_RSVD_MASK);
/*
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
- if (!prefault)
+ if (!fault->prefetch)
kvm_inject_emulated_page_fault(vcpu, &walker.fault);
return RET_PF_RETRY;
}
- if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
- shadow_page_table_clear_flood(vcpu, addr);
+ fault->gfn = walker.gfn;
+ fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
+
+ if (page_fault_handle_page_track(vcpu, fault)) {
+ shadow_page_table_clear_flood(vcpu, fault->addr);
return RET_PF_EMULATE;
}
@@ -871,29 +874,28 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
vcpu->arch.write_fault_to_shadow_pgtable = false;
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
- &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
+ &walker, fault->user, &vcpu->arch.write_fault_to_shadow_pgtable);
if (is_self_change_mapping)
- max_level = PG_LEVEL_4K;
+ fault->max_level = PG_LEVEL_4K;
else
- max_level = walker.level;
+ fault->max_level = walker.level;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
- write_fault, &map_writable, &r))
+ if (kvm_faultin_pfn(vcpu, fault, &r))
return r;
- if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
+ if (handle_abnormal_pfn(vcpu, fault, walker.pte_access, &r))
return r;
/*
* Do not change pte_access if the pfn is a mmio page, otherwise
* we will cache the incorrect access into mmio spte.
*/
- if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
- !is_cr0_wp(vcpu->arch.mmu) && !user_fault && !is_noslot_pfn(pfn)) {
+ if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
+ !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) {
walker.pte_access |= ACC_WRITE_MASK;
walker.pte_access &= ~ACC_USER_MASK;
@@ -909,20 +911,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
- if (!is_noslot_pfn(pfn) && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, hva))
+ if (fault->slot && mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva))
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
r = make_mmu_pages_available(vcpu);
if (r)
goto out_unlock;
- r = FNAME(fetch)(vcpu, addr, &walker, error_code, max_level, pfn,
- map_writable, prefault);
+ r = FNAME(fetch)(vcpu, fault, &walker);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
write_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
+ kvm_release_pfn_clean(fault->pfn);
return r;
}
@@ -988,10 +989,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
sizeof(pt_element_t)))
break;
- FNAME(update_pte)(vcpu, sp, sptep, &gpte);
+ FNAME(prefetch_gpte)(vcpu, sp, sptep, gpte, false);
}
- if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
+ if (!sp->unsync_children)
break;
}
write_unlock(&vcpu->kvm->mmu_lock);
@@ -1048,21 +1049,18 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
*
- * Note:
- * We should flush all tlbs if spte is dropped even though guest is
- * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
- * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
- * used by guest then tlbs are not flushed, so guest is allowed to access the
- * freed pages.
- * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
+ * Returns
+ * < 0: the sp should be zapped
+ * 0: the sp is synced and no tlb flushing is required
+ * > 0: the sp is synced and tlb flushing is required
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base;
- int i, nr_present = 0;
+ int i;
bool host_writable;
gpa_t first_pte_gpa;
- int set_spte_ret = 0;
+ bool flush = false;
/*
* Ignore various flags when verifying that it's safe to sync a shadow
@@ -1087,11 +1085,13 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
*/
if (WARN_ON_ONCE(sp->role.direct ||
(sp->role.word ^ mmu_role.word) & ~sync_role_ign.word))
- return 0;
+ return -1;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
+ u64 *sptep, spte;
+ struct kvm_memory_slot *slot;
unsigned pte_access;
pt_element_t gpte;
gpa_t pte_gpa;
@@ -1104,16 +1104,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
sizeof(pt_element_t)))
- return 0;
+ return -1;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- /*
- * Update spte before increasing tlbs_dirty to make
- * sure no tlb flush is lost after spte is zapped; see
- * the comments in kvm_flush_remote_tlbs().
- */
- smp_wmb();
- vcpu->kvm->tlbs_dirty++;
+ flush = true;
continue;
}
@@ -1122,35 +1116,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pte_access &= FNAME(gpte_access)(gpte);
FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
- if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
- &nr_present))
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
continue;
if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i]);
- /*
- * The same as above where we are doing
- * prefetch_invalid_gpte().
- */
- smp_wmb();
- vcpu->kvm->tlbs_dirty++;
+ flush = true;
continue;
}
- nr_present++;
-
- host_writable = sp->spt[i] & shadow_host_writable_mask;
+ sptep = &sp->spt[i];
+ spte = *sptep;
+ host_writable = spte & shadow_host_writable_mask;
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ make_spte(vcpu, sp, slot, pte_access, gfn,
+ spte_to_pfn(spte), spte, true, false,
+ host_writable, &spte);
- set_spte_ret |= set_spte(vcpu, &sp->spt[i],
- pte_access, PG_LEVEL_4K,
- gfn, spte_to_pfn(sp->spt[i]),
- true, false, host_writable);
+ flush |= mmu_spte_update(sptep, spte);
}
- if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH)
- kvm_flush_remote_tlbs(vcpu->kvm);
-
- return nr_present;
+ return flush;
}
#undef pt_element_t
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 3e97cdb13eb7..0c76c45fdb68 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -89,15 +89,17 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
E820_TYPE_RAM);
}
-int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
- gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative,
- bool can_unsync, bool host_writable, bool ad_disabled,
- u64 *new_spte)
+bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct kvm_memory_slot *slot,
+ unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
+ u64 old_spte, bool prefetch, bool can_unsync,
+ bool host_writable, u64 *new_spte)
{
+ int level = sp->role.level;
u64 spte = SPTE_MMU_PRESENT_MASK;
- int ret = 0;
+ bool wrprot = false;
- if (ad_disabled)
+ if (sp->role.ad_disabled)
spte |= SPTE_TDP_AD_DISABLED_MASK;
else if (kvm_vcpu_ad_need_write_protect(vcpu))
spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
@@ -109,7 +111,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
* read access. See FNAME(gpte_access) in paging_tmpl.h.
*/
spte |= shadow_present_mask;
- if (!speculative)
+ if (!prefetch)
spte |= spte_shadow_accessed_mask(spte);
if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
@@ -150,7 +152,7 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
* is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots.
* Same reasoning can be applied to dirty page accounting.
*/
- if (!can_unsync && is_writable_pte(old_spte))
+ if (is_writable_pte(old_spte))
goto out;
/*
@@ -159,10 +161,10 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
* e.g. it's write-tracked (upper-level SPs) or has one or more
* shadow pages and unsync'ing pages is not allowed.
*/
- if (mmu_try_to_unsync_pages(vcpu, gfn, can_unsync)) {
+ if (mmu_try_to_unsync_pages(vcpu, slot, gfn, can_unsync, prefetch)) {
pgprintk("%s: found shadow page for %llx, marking ro\n",
__func__, gfn);
- ret |= SET_SPTE_WRITE_PROTECTED_PT;
+ wrprot = true;
pte_access &= ~ACC_WRITE_MASK;
spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
}
@@ -171,16 +173,22 @@ int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
if (pte_access & ACC_WRITE_MASK)
spte |= spte_shadow_dirty_mask(spte);
- if (speculative)
+out:
+ if (prefetch)
spte = mark_spte_for_access_track(spte);
-out:
WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
"spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));
+ if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) {
+ /* Enforced by kvm_mmu_hugepage_adjust. */
+ WARN_ON(level > PG_LEVEL_4K);
+ mark_page_dirty_in_slot(vcpu->kvm, slot, gfn);
+ }
+
*new_spte = spte;
- return ret;
+ return wrprot;
}
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index eb7b227fc6cf..cc432f9a966b 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -310,12 +310,7 @@ static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
u64 spte, int level)
{
- /*
- * Use a bitwise-OR instead of a logical-OR to aggregate the reserved
- * bits and EPT's invalid memtype/XWR checks to avoid an extra Jcc
- * (this is extremely unlikely to be short-circuited as true).
- */
- return __is_bad_mt_xwr(rsvd_check, spte) |
+ return __is_bad_mt_xwr(rsvd_check, spte) ||
__is_rsvd_bits_set(rsvd_check, spte, level);
}
@@ -334,15 +329,11 @@ static inline u64 get_mmio_spte_generation(u64 spte)
return gen;
}
-/* Bits which may be returned by set_spte() */
-#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
-#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
-#define SET_SPTE_SPURIOUS BIT(2)
-
-int make_spte(struct kvm_vcpu *vcpu, unsigned int pte_access, int level,
- gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool speculative,
- bool can_unsync, bool host_writable, bool ad_disabled,
- u64 *new_spte);
+bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct kvm_memory_slot *slot,
+ unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
+ u64 old_spte, bool prefetch, bool can_unsync,
+ bool host_writable, u64 *new_spte);
u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled);
u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access);
u64 mark_spte_for_access_track(u64 spte);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 64ccfc1fa553..7c5dd83e52de 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -167,6 +167,7 @@ static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
role.direct = true;
role.gpte_is_8_bytes = true;
role.access = ACC_ALL;
+ role.ad_disabled = !shadow_accessed_mask;
return role;
}
@@ -489,8 +490,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
}
/*
- * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
- * and handle the associated bookkeeping, but do not mark the page dirty
+ * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
+ * and handle the associated bookkeeping. Do not mark the page dirty
* in KVM's dirty bitmaps.
*
* @kvm: kvm instance
@@ -499,9 +500,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* Returns: true if the SPTE was set, false if it was not. If false is returned,
* this function will have no side-effects.
*/
-static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
lockdep_assert_held_read(&kvm->mmu_lock);
@@ -527,43 +528,6 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
return true;
}
-/*
- * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a
- * TDP page fault.
- *
- * @vcpu: The vcpu instance that took the TDP page fault.
- * @iter: a tdp_iter instance currently on the SPTE that should be set
- * @new_spte: The value the SPTE should be set to
- *
- * Returns: true if the SPTE was set, false if it was not. If false is returned,
- * this function will have no side-effects.
- */
-static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu,
- struct tdp_iter *iter,
- u64 new_spte)
-{
- struct kvm *kvm = vcpu->kvm;
-
- if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
- return false;
-
- /*
- * Use kvm_vcpu_gfn_to_memslot() instead of going through
- * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot.
- */
- if (is_writable_pte(new_spte)) {
- struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn);
-
- if (slot && kvm_slot_dirty_track_enabled(slot)) {
- /* Enforced by kvm_mmu_hugepage_adjust. */
- WARN_ON_ONCE(iter->level > PG_LEVEL_4K);
- mark_page_dirty_in_slot(kvm, slot, iter->gfn);
- }
- }
-
- return true;
-}
-
static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
struct tdp_iter *iter)
{
@@ -573,7 +537,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* immediately installing a present entry in its place
* before the TLBs are flushed.
*/
- if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE))
+ if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
return false;
kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
@@ -929,26 +893,26 @@ void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
* Installs a last-level SPTE to handle a TDP page fault.
* (NPT/EPT violation/misconfiguration)
*/
-static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
- int map_writable,
- struct tdp_iter *iter,
- kvm_pfn_t pfn, bool prefault)
+static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ struct tdp_iter *iter)
{
+ struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep);
u64 new_spte;
int ret = RET_PF_FIXED;
- int make_spte_ret = 0;
+ bool wrprot = false;
- if (unlikely(is_noslot_pfn(pfn)))
+ WARN_ON(sp->role.level != fault->goal_level);
+ if (unlikely(!fault->slot))
new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
else
- make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
- pfn, iter->old_spte, prefault, true,
- map_writable, !shadow_accessed_mask,
- &new_spte);
+ wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
+ fault->pfn, iter->old_spte, fault->prefetch, true,
+ fault->map_writable, &new_spte);
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte))
+ else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
return RET_PF_RETRY;
/*
@@ -956,10 +920,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
* protected, emulation is needed. If the emulation was skipped,
* the vCPU would have the same fault again.
*/
- if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
- if (write)
+ if (wrprot) {
+ if (fault->write)
ret = RET_PF_EMULATE;
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
@@ -986,37 +949,26 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
* Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
* page tables and SPTEs to translate the faulting guest physical address.
*/
-int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault)
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
- bool write = error_code & PFERR_WRITE_MASK;
- bool exec = error_code & PFERR_FETCH_MASK;
- bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu *mmu = vcpu->arch.mmu;
struct tdp_iter iter;
struct kvm_mmu_page *sp;
u64 *child_pt;
u64 new_spte;
int ret;
- gfn_t gfn = gpa >> PAGE_SHIFT;
- int level;
- int req_level;
- level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
- huge_page_disallowed, &req_level);
+ kvm_mmu_hugepage_adjust(vcpu, fault);
- trace_kvm_mmu_spte_requested(gpa, level, pfn);
+ trace_kvm_mmu_spte_requested(fault);
rcu_read_lock();
- tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
- if (nx_huge_page_workaround_enabled)
- disallowed_hugepage_adjust(iter.old_spte, gfn,
- iter.level, &pfn, &level);
+ tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
+ if (fault->nx_huge_page_workaround_enabled)
+ disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
- if (iter.level == level)
+ if (iter.level == fault->goal_level)
break;
/*
@@ -1052,10 +1004,10 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
new_spte = make_nonleaf_spte(child_pt,
!shadow_accessed_mask);
- if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) {
+ if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
tdp_mmu_link_page(vcpu->kvm, sp,
- huge_page_disallowed &&
- req_level >= iter.level);
+ fault->huge_page_disallowed &&
+ fault->req_level >= iter.level);
trace_kvm_mmu_get_page(sp, true);
} else {
@@ -1065,13 +1017,12 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
}
}
- if (iter.level != level) {
+ if (iter.level != fault->goal_level) {
rcu_read_unlock();
return RET_PF_RETRY;
}
- ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
- pfn, prefault);
+ ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
rcu_read_unlock();
return ret;
@@ -1241,8 +1192,7 @@ retry:
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
- if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
- new_spte)) {
+ if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
/*
* The iter must explicitly re-read the SPTE because
* the atomic cmpxchg failed.
@@ -1310,8 +1260,7 @@ retry:
continue;
}
- if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
- new_spte)) {
+ if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
/*
* The iter must explicitly re-read the SPTE because
* the atomic cmpxchg failed.
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 358f447d4012..476b133544dd 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -48,9 +48,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm);
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm);
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm);
-int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
- int map_writable, int max_level, kvm_pfn_t pfn,
- bool prefault);
+int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush);
@@ -92,7 +90,6 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
#ifdef CONFIG_X86_64
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm);
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
@@ -114,7 +111,6 @@ static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
#else
static inline bool kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return false; }
static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; }
#endif
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 2545d0c61985..f8b7bc04b3e7 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -238,6 +238,18 @@ static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
}
+static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl)
+{
+ /* Nested FLUSHBYASID is not supported yet. */
+ switch(tlb_ctl) {
+ case TLB_CONTROL_DO_NOTHING:
+ case TLB_CONTROL_FLUSH_ALL_ASID:
+ return true;
+ default:
+ return false;
+ }
+}
+
static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
struct vmcb_control_area *control)
{
@@ -257,6 +269,9 @@ static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
IOPM_SIZE)))
return false;
+ if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl)))
+ return false;
+
return true;
}
@@ -538,22 +553,27 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
if (nested_npt_enabled(svm))
nested_svm_init_mmu_context(vcpu);
- svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset =
- vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset;
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ vcpu->arch.l1_tsc_offset,
+ svm->nested.ctl.tsc_offset,
+ svm->tsc_ratio_msr);
+
+ svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
+
+ if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
+ WARN_ON(!svm->tsc_scaling_enabled);
+ nested_svm_update_tsc_ratio_msr(vcpu);
+ }
svm->vmcb->control.int_ctl =
(svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
(svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits);
- svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext;
svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
svm->vmcb->control.int_state = svm->nested.ctl.int_state;
svm->vmcb->control.event_inj = svm->nested.ctl.event_inj;
svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err;
- svm->vmcb->control.pause_filter_count = svm->nested.ctl.pause_filter_count;
- svm->vmcb->control.pause_filter_thresh = svm->nested.ctl.pause_filter_thresh;
-
nested_svm_transition_tlb_flush(vcpu);
/* Enter Guest-Mode */
@@ -579,7 +599,7 @@ static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to
}
int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
- struct vmcb *vmcb12)
+ struct vmcb *vmcb12, bool from_vmrun)
{
struct vcpu_svm *svm = to_svm(vcpu);
int ret;
@@ -609,13 +629,16 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
nested_vmcb02_prepare_save(svm, vmcb12);
ret = nested_svm_load_cr3(&svm->vcpu, vmcb12->save.cr3,
- nested_npt_enabled(svm), true);
+ nested_npt_enabled(svm), from_vmrun);
if (ret)
return ret;
if (!npt_enabled)
vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
+ if (!from_vmrun)
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
svm_set_gif(svm, true);
return 0;
@@ -681,7 +704,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
svm->nested.nested_run_pending = 1;
- if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12))
+ if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
goto out_exit_err;
if (nested_svm_vmrun_msrpm(svm))
@@ -808,11 +831,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
- vmcb12->control.pause_filter_count =
- svm->vmcb->control.pause_filter_count;
- vmcb12->control.pause_filter_thresh =
- svm->vmcb->control.pause_filter_thresh;
-
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
svm_switch_vmcb(svm, &svm->vmcb01);
@@ -830,6 +848,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
+ if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
+ WARN_ON(!svm->tsc_scaling_enabled);
+ vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
+ }
+
svm->nested.ctl.nested_cr3 = 0;
/*
@@ -1217,6 +1241,16 @@ int nested_svm_exit_special(struct vcpu_svm *svm)
return NESTED_EXIT_CONTINUE;
}
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ vcpu->arch.tsc_scaling_ratio =
+ kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
+ svm->tsc_ratio_msr);
+ svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
+}
+
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
u32 user_data_size)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 75e0b21ad07c..1964b9a174be 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -17,10 +17,10 @@
#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
-#include <asm/fpu/internal.h>
#include <asm/pkru.h>
#include <asm/trapnr.h>
+#include <asm/fpu/xcr.h>
#include "x86.h"
#include "svm.h"
@@ -595,43 +595,55 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
return 0;
}
-static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
+ int *error)
{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_update_vmsa vmsa;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ /* Perform some pre-encryption checks against the VMSA */
+ ret = sev_es_sync_vmsa(svm);
+ if (ret)
+ return ret;
+
+ /*
+ * The LAUNCH_UPDATE_VMSA command will perform in-place encryption of
+ * the VMSA memory content (i.e it will write the same memory region
+ * with the guest's key), so invalidate it first.
+ */
+ clflush_cache_range(svm->vmsa, PAGE_SIZE);
+
+ vmsa.reserved = 0;
+ vmsa.handle = to_kvm_svm(kvm)->sev_info.handle;
+ vmsa.address = __sme_pa(svm->vmsa);
+ vmsa.len = PAGE_SIZE;
+ ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
+ if (ret)
+ return ret;
+
+ vcpu->arch.guest_state_protected = true;
+ return 0;
+}
+
+static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
struct kvm_vcpu *vcpu;
int i, ret;
if (!sev_es_guest(kvm))
return -ENOTTY;
- vmsa.reserved = 0;
-
kvm_for_each_vcpu(i, vcpu, kvm) {
- struct vcpu_svm *svm = to_svm(vcpu);
-
- /* Perform some pre-encryption checks against the VMSA */
- ret = sev_es_sync_vmsa(svm);
+ ret = mutex_lock_killable(&vcpu->mutex);
if (ret)
return ret;
- /*
- * The LAUNCH_UPDATE_VMSA command will perform in-place
- * encryption of the VMSA memory content (i.e it will write
- * the same memory region with the guest's key), so invalidate
- * it first.
- */
- clflush_cache_range(svm->vmsa, PAGE_SIZE);
+ ret = __sev_launch_update_vmsa(kvm, vcpu, &argp->error);
- vmsa.handle = sev->handle;
- vmsa.address = __sme_pa(svm->vmsa);
- vmsa.len = PAGE_SIZE;
- ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa,
- &argp->error);
+ mutex_unlock(&vcpu->mutex);
if (ret)
return ret;
-
- svm->vcpu.arch.guest_state_protected = true;
}
return 0;
@@ -1397,8 +1409,10 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* Bind ASID to this guest */
ret = sev_bind_asid(kvm, start.handle, error);
- if (ret)
+ if (ret) {
+ sev_decommission(start.handle);
goto e_free_session;
+ }
params.handle = start.handle;
if (copy_to_user((void __user *)(uintptr_t)argp->data,
@@ -1464,12 +1478,19 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* Pin guest memory */
guest_page = sev_pin_memory(kvm, params.guest_uaddr & PAGE_MASK,
- PAGE_SIZE, &n, 0);
+ PAGE_SIZE, &n, 1);
if (IS_ERR(guest_page)) {
ret = PTR_ERR(guest_page);
goto e_free_trans;
}
+ /*
+ * Flush (on non-coherent CPUs) before RECEIVE_UPDATE_DATA, the PSP
+ * encrypts the written data with the guest's key, and the cache may
+ * contain dirty, unencrypted data.
+ */
+ sev_clflush_pages(guest_page, n);
+
/* The RECEIVE_UPDATE_DATA command requires C-bit to be always set. */
data.guest_address = (page_to_pfn(guest_page[0]) << PAGE_SHIFT) + offset;
data.guest_address |= sev_me_mask;
@@ -1501,6 +1522,20 @@ static int sev_receive_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
return sev_issue_cmd(kvm, SEV_CMD_RECEIVE_FINISH, &data, &argp->error);
}
+static bool cmd_allowed_from_miror(u32 cmd_id)
+{
+ /*
+ * Allow mirrors VM to call KVM_SEV_LAUNCH_UPDATE_VMSA to enable SEV-ES
+ * active mirror VMs. Also allow the debugging and status commands.
+ */
+ if (cmd_id == KVM_SEV_LAUNCH_UPDATE_VMSA ||
+ cmd_id == KVM_SEV_GUEST_STATUS || cmd_id == KVM_SEV_DBG_DECRYPT ||
+ cmd_id == KVM_SEV_DBG_ENCRYPT)
+ return true;
+
+ return false;
+}
+
int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
@@ -1517,8 +1552,9 @@ int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
mutex_lock(&kvm->lock);
- /* enc_context_owner handles all memory enc operations */
- if (is_mirroring_enc_context(kvm)) {
+ /* Only the enc_context_owner handles some memory enc operations. */
+ if (is_mirroring_enc_context(kvm) &&
+ !cmd_allowed_from_miror(sev_cmd.id)) {
r = -EINVAL;
goto out;
}
@@ -1715,8 +1751,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
{
struct file *source_kvm_file;
struct kvm *source_kvm;
- struct kvm_sev_info *mirror_sev;
- unsigned int asid;
+ struct kvm_sev_info source_sev, *mirror_sev;
int ret;
source_kvm_file = fget(source_fd);
@@ -1739,7 +1774,8 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
goto e_source_unlock;
}
- asid = to_kvm_svm(source_kvm)->sev_info.asid;
+ memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info,
+ sizeof(source_sev));
/*
* The mirror kvm holds an enc_context_owner ref so its asid can't
@@ -1759,8 +1795,16 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
/* Set enc_context_owner and copy its encryption context over */
mirror_sev = &to_kvm_svm(kvm)->sev_info;
mirror_sev->enc_context_owner = source_kvm;
- mirror_sev->asid = asid;
mirror_sev->active = true;
+ mirror_sev->asid = source_sev.asid;
+ mirror_sev->fd = source_sev.fd;
+ mirror_sev->es_active = source_sev.es_active;
+ mirror_sev->handle = source_sev.handle;
+ /*
+ * Do not copy ap_jump_table. Since the mirror does not share the same
+ * KVM contexts as the original, and they may have different
+ * memory-views.
+ */
mutex_unlock(&kvm->lock);
return 0;
@@ -2547,11 +2591,20 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in)
{
- if (!setup_vmgexit_scratch(svm, in, svm->vmcb->control.exit_info_2))
+ int count;
+ int bytes;
+
+ if (svm->vmcb->control.exit_info_2 > INT_MAX)
+ return -EINVAL;
+
+ count = svm->vmcb->control.exit_info_2;
+ if (unlikely(check_mul_overflow(count, size, &bytes)))
+ return -EINVAL;
+
+ if (!setup_vmgexit_scratch(svm, in, bytes))
return -EINVAL;
- return kvm_sev_es_string_io(&svm->vcpu, size, port,
- svm->ghcb_sa, svm->ghcb_sa_len, in);
+ return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->ghcb_sa, count, in);
}
void sev_es_init_vmcb(struct vcpu_svm *svm)
@@ -2599,11 +2652,11 @@ void sev_es_init_vmcb(struct vcpu_svm *svm)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}
-void sev_es_create_vcpu(struct vcpu_svm *svm)
+void sev_es_vcpu_reset(struct vcpu_svm *svm)
{
/*
- * Set the GHCB MSR value as per the GHCB specification when creating
- * a vCPU for an SEV-ES guest.
+ * Set the GHCB MSR value as per the GHCB specification when emulating
+ * vCPU RESET for an SEV-ES guest.
*/
set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
GHCB_VERSION_MIN,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 05e8d4d27969..b36ca4e476c2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -25,6 +25,7 @@
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/rwsem.h>
+#include <linux/cc_platform.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -36,6 +37,7 @@
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
+#include <asm/fpu/api.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -186,6 +188,13 @@ module_param(vls, int, 0444);
static int vgif = true;
module_param(vgif, int, 0444);
+/* enable/disable LBR virtualization */
+static int lbrv = true;
+module_param(lbrv, int, 0444);
+
+static int tsc_scaling = true;
+module_param(tsc_scaling, int, 0444);
+
/*
* enable / disable AVIC. Because the defaults differ for APICv
* support between VMX and SVM we cannot use module_param_named.
@@ -455,7 +464,7 @@ static int has_svm(void)
return 0;
}
- if (sev_active()) {
+ if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
pr_info("KVM is unsupported when running as an SEV guest\n");
return 0;
}
@@ -466,7 +475,7 @@ static int has_svm(void)
static void svm_hardware_disable(void)
{
/* Make sure we clean up behind us */
- if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
+ if (tsc_scaling)
wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
cpu_svm_disable();
@@ -509,6 +518,10 @@ static int svm_hardware_enable(void)
wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ /*
+ * Set the default value, even if we don't use TSC scaling
+ * to avoid having stale value in the msr
+ */
wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
__this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
}
@@ -929,6 +942,9 @@ static __init void svm_set_cpu_caps(void)
if (npt_enabled)
kvm_cpu_cap_set(X86_FEATURE_NPT);
+ if (tsc_scaling)
+ kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
+
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
@@ -976,10 +992,15 @@ static __init int svm_hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
- if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- kvm_has_tsc_control = true;
- kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 32;
+ if (tsc_scaling) {
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ tsc_scaling = false;
+ } else {
+ pr_info("TSC scaling supported\n");
+ kvm_has_tsc_control = true;
+ kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 32;
+ }
}
tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
@@ -1059,6 +1080,13 @@ static __init int svm_hardware_setup(void)
pr_info("Virtual GIF supported\n");
}
+ if (lbrv) {
+ if (!boot_cpu_has(X86_FEATURE_LBRV))
+ lbrv = false;
+ else
+ pr_info("LBR virtualization supported\n");
+ }
+
svm_set_cpu_caps();
/*
@@ -1109,7 +1137,9 @@ static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
- return kvm_default_tsc_scaling_ratio;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->tsc_ratio_msr;
}
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -1121,7 +1151,7 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
{
wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
}
@@ -1150,6 +1180,38 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
}
}
+static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (guest_cpuid_is_intel(vcpu)) {
+ /*
+ * We must intercept SYSENTER_EIP and SYSENTER_ESP
+ * accesses because the processor only stores 32 bits.
+ * For the same reason we cannot use virtual VMLOAD/VMSAVE.
+ */
+ svm_set_intercept(svm, INTERCEPT_VMLOAD);
+ svm_set_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+ } else {
+ /*
+ * If hardware supports Virtual VMLOAD VMSAVE then enable it
+ * in VMCB and clear intercepts to avoid #VMEXIT.
+ */
+ if (vls) {
+ svm_clr_intercept(svm, INTERCEPT_VMLOAD);
+ svm_clr_intercept(svm, INTERCEPT_VMSAVE);
+ svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ }
+ /* No need to intercept these MSRs */
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+ set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+ }
+}
+
static void init_vmcb(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1296,11 +1358,25 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
}
svm_hv_init_vmcb(svm->vmcb);
+ init_vmcb_after_set_cpuid(vcpu);
vmcb_mark_all_dirty(svm->vmcb);
enable_gif(svm);
+}
+
+static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ svm_vcpu_init_msrpm(vcpu, svm->msrpm);
+
+ svm_init_osvw(vcpu);
+ vcpu->arch.microcode_version = 0x01000065;
+ svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio;
+
+ if (sev_es_guest(vcpu->kvm))
+ sev_es_vcpu_reset(svm);
}
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -1311,6 +1387,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
svm->virt_spec_ctrl = 0;
init_vmcb(vcpu);
+
+ if (!init_event)
+ __svm_vcpu_reset(vcpu);
}
void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
@@ -1346,10 +1425,10 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
/*
* SEV-ES guests maintain an encrypted version of their FPU
* state which is restored and saved on VMRUN and VMEXIT.
- * Free the fpu structure to prevent KVM from attempting to
- * access the FPU state.
+ * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
+ * do xsave/xrstor on it.
*/
- kvm_free_guest_fpu(vcpu);
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
}
err = avic_init_vcpu(svm);
@@ -1370,24 +1449,13 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->vmcb01.ptr = page_address(vmcb01_page);
svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
+ svm_switch_vmcb(svm, &svm->vmcb01);
if (vmsa_page)
svm->vmsa = page_address(vmsa_page);
svm->guest_state_loaded = false;
- svm_switch_vmcb(svm, &svm->vmcb01);
- init_vmcb(vcpu);
-
- svm_vcpu_init_msrpm(vcpu, svm->msrpm);
-
- svm_init_osvw(vcpu);
- vcpu->arch.microcode_version = 0x01000065;
-
- if (sev_es_guest(vcpu->kvm))
- /* Perform SEV-ES specific VMCB creation updates */
- sev_es_create_vcpu(svm);
-
return 0;
error_free_vmsa_page:
@@ -1447,7 +1515,7 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
vmsave(__sme_page_pa(sd->save_area));
}
- if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ if (tsc_scaling) {
u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
__this_cpu_write(current_tsc_ratio, tsc_ratio);
@@ -1566,6 +1634,8 @@ static void svm_clear_vintr(struct vcpu_svm *svm)
svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
V_IRQ_INJECTION_BITS_MASK;
+
+ svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
}
vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
@@ -2222,6 +2292,10 @@ static int gp_interception(struct kvm_vcpu *vcpu)
if (error_code)
goto reinject;
+ /* All SVM instructions expect page aligned RAX */
+ if (svm->vmcb->save.rax & ~PAGE_MASK)
+ goto reinject;
+
/* Decode the instruction for usage later */
if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
goto reinject;
@@ -2651,6 +2725,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
struct vcpu_svm *svm = to_svm(vcpu);
switch (msr_info->index) {
+ case MSR_AMD64_TSC_RATIO:
+ if (!msr_info->host_initiated && !svm->tsc_scaling_enabled)
+ return 1;
+ msr_info->data = svm->tsc_ratio_msr;
+ break;
case MSR_STAR:
msr_info->data = svm->vmcb01.ptr->save.star;
break;
@@ -2800,6 +2879,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
u32 ecx = msr->index;
u64 data = msr->data;
switch (ecx) {
+ case MSR_AMD64_TSC_RATIO:
+ if (!msr->host_initiated && !svm->tsc_scaling_enabled)
+ return 1;
+
+ if (data & TSC_RATIO_RSVD)
+ return 1;
+
+ svm->tsc_ratio_msr = data;
+
+ if (svm->tsc_scaling_enabled && is_guest_mode(vcpu))
+ nested_svm_update_tsc_ratio_msr(vcpu);
+
+ break;
case MSR_IA32_CR_PAT:
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
return 1;
@@ -2912,7 +3004,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->tsc_aux = data;
break;
case MSR_IA32_DEBUGCTLMSR:
- if (!boot_cpu_has(X86_FEATURE_LBRV)) {
+ if (!lbrv) {
vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
__func__, data);
break;
@@ -3272,11 +3364,13 @@ int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
return svm_exit_handlers[exit_code](vcpu);
}
-static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
u32 *intr_info, u32 *error_code)
{
struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+ *reason = control->exit_code;
*info1 = control->exit_info_1;
*info2 = control->exit_info_2;
*intr_info = control->exit_int_info;
@@ -3293,7 +3387,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
- trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
+ trace_kvm_exit(vcpu, KVM_ISA_SVM);
/* SEV-ES guests must use the CR write traps to track CR registers. */
if (!sev_es_guest(vcpu->kvm)) {
@@ -3306,7 +3400,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
if (is_guest_mode(vcpu)) {
int vmexit;
- trace_kvm_nested_vmexit(exit_code, vcpu, KVM_ISA_SVM);
+ trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
vmexit = nested_svm_exit_special(svm);
@@ -3774,8 +3868,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
pre_svm_run(vcpu);
- WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
-
sync_lapic_to_cr8(vcpu);
if (unlikely(svm->asid != svm->vmcb->control.asid)) {
@@ -3995,6 +4087,8 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
+ svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
+
svm_recalc_instruction_intercepts(vcpu, svm);
/* For sev guests, the memory encryption bit is not reserved in CR3. */
@@ -4021,33 +4115,7 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_request_apicv_update(vcpu->kvm, false,
APICV_INHIBIT_REASON_NESTED);
}
-
- if (guest_cpuid_is_intel(vcpu)) {
- /*
- * We must intercept SYSENTER_EIP and SYSENTER_ESP
- * accesses because the processor only stores 32 bits.
- * For the same reason we cannot use virtual VMLOAD/VMSAVE.
- */
- svm_set_intercept(svm, INTERCEPT_VMLOAD);
- svm_set_intercept(svm, INTERCEPT_VMSAVE);
- svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
-
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
- } else {
- /*
- * If hardware supports Virtual VMLOAD VMSAVE then enable it
- * in VMCB and clear intercepts to avoid #VMEXIT.
- */
- if (vls) {
- svm_clr_intercept(svm, INTERCEPT_VMLOAD);
- svm_clr_intercept(svm, INTERCEPT_VMSAVE);
- svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
- }
- /* No need to intercept these MSRs */
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
- set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
- }
+ init_vmcb_after_set_cpuid(vcpu);
}
static bool svm_has_wbinvd_exit(void)
@@ -4285,43 +4353,44 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
struct kvm_host_map map_save;
int ret;
- if (is_guest_mode(vcpu)) {
- /* FED8h - SVM Guest */
- put_smstate(u64, smstate, 0x7ed8, 1);
- /* FEE0h - SVM Guest VMCB Physical Address */
- put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
+ if (!is_guest_mode(vcpu))
+ return 0;
- svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
- svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
- svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+ /* FED8h - SVM Guest */
+ put_smstate(u64, smstate, 0x7ed8, 1);
+ /* FEE0h - SVM Guest VMCB Physical Address */
+ put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
- ret = nested_svm_vmexit(svm);
- if (ret)
- return ret;
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
- /*
- * KVM uses VMCB01 to store L1 host state while L2 runs but
- * VMCB01 is going to be used during SMM and thus the state will
- * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save
- * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
- * format of the area is identical to guest save area offsetted
- * by 0x400 (matches the offset of 'struct vmcb_save_area'
- * within 'struct vmcb'). Note: HSAVE area may also be used by
- * L1 hypervisor to save additional host context (e.g. KVM does
- * that, see svm_prepare_guest_switch()) which must be
- * preserved.
- */
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
- &map_save) == -EINVAL)
- return 1;
+ ret = nested_svm_vmexit(svm);
+ if (ret)
+ return ret;
- BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
+ /*
+ * KVM uses VMCB01 to store L1 host state while L2 runs but
+ * VMCB01 is going to be used during SMM and thus the state will
+ * be lost. Temporary save non-VMLOAD/VMSAVE state to the host save
+ * area pointed to by MSR_VM_HSAVE_PA. APM guarantees that the
+ * format of the area is identical to guest save area offsetted
+ * by 0x400 (matches the offset of 'struct vmcb_save_area'
+ * within 'struct vmcb'). Note: HSAVE area may also be used by
+ * L1 hypervisor to save additional host context (e.g. KVM does
+ * that, see svm_prepare_guest_switch()) which must be
+ * preserved.
+ */
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
+ &map_save) == -EINVAL)
+ return 1;
- svm_copy_vmrun_state(map_save.hva + 0x400,
- &svm->vmcb01.ptr->save);
+ BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
- kvm_vcpu_unmap(vcpu, &map_save, true);
- }
+ svm_copy_vmrun_state(map_save.hva + 0x400,
+ &svm->vmcb01.ptr->save);
+
+ kvm_vcpu_unmap(vcpu, &map_save, true);
return 0;
}
@@ -4329,50 +4398,54 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_host_map map, map_save;
- int ret = 0;
+ u64 saved_efer, vmcb12_gpa;
+ struct vmcb *vmcb12;
+ int ret;
- if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) {
- u64 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
- u64 guest = GET_SMSTATE(u64, smstate, 0x7ed8);
- u64 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
- struct vmcb *vmcb12;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return 0;
- if (guest) {
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
- return 1;
+ /* Non-zero if SMI arrived while vCPU was in guest mode. */
+ if (!GET_SMSTATE(u64, smstate, 0x7ed8))
+ return 0;
- if (!(saved_efer & EFER_SVME))
- return 1;
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ return 1;
- if (kvm_vcpu_map(vcpu,
- gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
- return 1;
+ saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
+ if (!(saved_efer & EFER_SVME))
+ return 1;
- if (svm_allocate_nested(svm))
- return 1;
+ vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
+ return 1;
- vmcb12 = map.hva;
+ ret = 1;
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
+ goto unmap_map;
- nested_load_control_from_vmcb12(svm, &vmcb12->control);
+ if (svm_allocate_nested(svm))
+ goto unmap_save;
- ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12);
- kvm_vcpu_unmap(vcpu, &map, true);
+ /*
+ * Restore L1 host state from L1 HSAVE area as VMCB01 was
+ * used during SMM (see svm_enter_smm())
+ */
- /*
- * Restore L1 host state from L1 HSAVE area as VMCB01 was
- * used during SMM (see svm_enter_smm())
- */
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
- &map_save) == -EINVAL)
- return 1;
+ svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
- svm_copy_vmrun_state(&svm->vmcb01.ptr->save,
- map_save.hva + 0x400);
+ /*
+ * Enter the nested guest now
+ */
- kvm_vcpu_unmap(vcpu, &map_save, true);
- }
- }
+ vmcb12 = map.hva;
+ nested_load_control_from_vmcb12(svm, &vmcb12->control);
+ ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
+unmap_save:
+ kvm_vcpu_unmap(vcpu, &map_save, true);
+unmap_map:
+ kvm_vcpu_unmap(vcpu, &map, true);
return ret;
}
@@ -4509,6 +4582,8 @@ static int svm_vm_init(struct kvm *kvm)
}
static struct kvm_x86_ops svm_x86_ops __initdata = {
+ .name = "kvm_amd",
+
.hardware_unsetup = svm_hardware_teardown,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 524d943f3efc..5e9510d4574e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -140,6 +140,8 @@ struct vcpu_svm {
u64 next_rip;
u64 spec_ctrl;
+
+ u64 tsc_ratio_msr;
/*
* Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
* translated into the appropriate L2_CFG bits on the host to
@@ -160,7 +162,8 @@ struct vcpu_svm {
unsigned long int3_rip;
/* cached guest cpuid flags for faster access */
- bool nrips_enabled : 1;
+ bool nrips_enabled : 1;
+ bool tsc_scaling_enabled : 1;
u32 ldr_reg;
u32 dfr_reg;
@@ -191,7 +194,7 @@ struct vcpu_svm {
/* SEV-ES scratch area support */
void *ghcb_sa;
- u64 ghcb_sa_len;
+ u32 ghcb_sa_len;
bool ghcb_sa_sync;
bool ghcb_sa_free;
@@ -218,12 +221,12 @@ DECLARE_PER_CPU(struct svm_cpu_data *, svm_data);
void recalc_intercepts(struct vcpu_svm *svm);
-static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
+static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
{
return container_of(kvm, struct kvm_svm, kvm);
}
-static inline bool sev_guest(struct kvm *kvm)
+static __always_inline bool sev_guest(struct kvm *kvm)
{
#ifdef CONFIG_KVM_AMD_SEV
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -234,7 +237,7 @@ static inline bool sev_guest(struct kvm *kvm)
#endif
}
-static inline bool sev_es_guest(struct kvm *kvm)
+static __always_inline bool sev_es_guest(struct kvm *kvm)
{
#ifdef CONFIG_KVM_AMD_SEV
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
@@ -271,7 +274,7 @@ static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit)
return !test_bit(bit, (unsigned long *)&vmcb->control.clean);
}
-static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
+static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_svm, vcpu);
}
@@ -459,7 +462,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
return vmcb_is_intercept(&svm->nested.ctl, INTERCEPT_NMI);
}
-int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, struct vmcb *vmcb12);
+int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
+ u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
void svm_leave_nested(struct vcpu_svm *svm);
void svm_free_nested(struct vcpu_svm *svm);
int svm_allocate_nested(struct vcpu_svm *svm);
@@ -482,6 +486,8 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu);
int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
int nested_svm_exit_special(struct vcpu_svm *svm);
+void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
+void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier);
void nested_load_control_from_vmcb12(struct vcpu_svm *svm,
struct vmcb_control_area *control);
void nested_sync_control_from_vmcb02(struct vcpu_svm *svm);
@@ -561,7 +567,7 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu);
int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
void sev_es_init_vmcb(struct vcpu_svm *svm);
-void sev_es_create_vcpu(struct vcpu_svm *svm);
+void sev_es_vcpu_reset(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
index 22e2b019de37..9430d6437c9f 100644
--- a/arch/x86/kvm/svm/svm_ops.h
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -56,12 +56,12 @@ static inline void invlpga(unsigned long addr, u32 asid)
* VMSAVE, VMLOAD, etc... is still controlled by the effective address size,
* hence 'unsigned long' instead of 'hpa_t'.
*/
-static inline void vmsave(unsigned long pa)
+static __always_inline void vmsave(unsigned long pa)
{
svm_asm1(vmsave, "a" (pa), "memory");
}
-static inline void vmload(unsigned long pa)
+static __always_inline void vmload(unsigned long pa)
{
svm_asm1(vmload, "a" (pa), "memory");
}
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 03ebe368333e..953b0fcb21ee 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -288,8 +288,8 @@ TRACE_EVENT(kvm_apic,
#define TRACE_EVENT_KVM_EXIT(name) \
TRACE_EVENT(name, \
- TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa), \
- TP_ARGS(exit_reason, vcpu, isa), \
+ TP_PROTO(struct kvm_vcpu *vcpu, u32 isa), \
+ TP_ARGS(vcpu, isa), \
\
TP_STRUCT__entry( \
__field( unsigned int, exit_reason ) \
@@ -303,11 +303,12 @@ TRACE_EVENT(name, \
), \
\
TP_fast_assign( \
- __entry->exit_reason = exit_reason; \
__entry->guest_rip = kvm_rip_read(vcpu); \
__entry->isa = isa; \
__entry->vcpu_id = vcpu->vcpu_id; \
- static_call(kvm_x86_get_exit_info)(vcpu, &__entry->info1, \
+ static_call(kvm_x86_get_exit_info)(vcpu, \
+ &__entry->exit_reason, \
+ &__entry->info1, \
&__entry->info2, \
&__entry->intr_info, \
&__entry->error_code); \
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 0dab1b7b529f..ba6f99f584ac 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -353,14 +353,20 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
switch (msr_index) {
case MSR_IA32_VMX_EXIT_CTLS:
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- ctl_high &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
break;
case MSR_IA32_VMX_ENTRY_CTLS:
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- ctl_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- ctl_high &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
+ break;
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
+ break;
+ case MSR_IA32_VMX_VMFUNC:
+ ctl_low &= ~EVMCS1_UNSUPPORTED_VMFUNC;
break;
}
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 152ab0aa82cf..16731d2cf231 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -93,7 +93,7 @@ static __always_inline int get_evmcs_offset(unsigned long field,
return evmcs_field->offset;
}
-static inline void evmcs_write64(unsigned long field, u64 value)
+static __always_inline void evmcs_write64(unsigned long field, u64 value)
{
u16 clean_field;
int offset = get_evmcs_offset(field, &clean_field);
@@ -183,7 +183,7 @@ static inline void evmcs_load(u64 phys_addr)
__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
-static inline void evmcs_write64(unsigned long field, u64 value) {}
+static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
static inline void evmcs_write32(unsigned long field, u32 value) {}
static inline void evmcs_write16(unsigned long field, u16 value) {}
static inline u64 evmcs_read64(unsigned long field) { return 0; }
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ccb03d69546c..b4ee5e9f9e20 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -191,7 +191,7 @@ static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
* failValid writes the error number to the current VMCS, which
* can't be done if there isn't a current VMCS.
*/
- if (vmx->nested.current_vmptr == -1ull &&
+ if (vmx->nested.current_vmptr == INVALID_GPA &&
!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
return nested_vmx_failInvalid(vcpu);
@@ -218,7 +218,7 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
vmx->nested.need_vmcs12_to_shadow_sync = false;
}
@@ -290,9 +290,10 @@ static void free_nested(struct kvm_vcpu *vcpu)
vmx->nested.vmxon = false;
vmx->nested.smm.vmxon = false;
+ vmx->nested.vmxon_ptr = INVALID_GPA;
free_vpid(vmx->nested.vpid02);
vmx->nested.posted_intr_nv = -1;
- vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmptr = INVALID_GPA;
if (enable_shadow_vmcs) {
vmx_disable_shadow_vmcs(vmx);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
@@ -709,7 +710,7 @@ static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
struct vmcs12 *shadow;
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
- vmcs12->vmcs_link_pointer == -1ull)
+ vmcs12->vmcs_link_pointer == INVALID_GPA)
return;
shadow = get_shadow_vmcs12(vcpu);
@@ -727,7 +728,7 @@ static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
- vmcs12->vmcs_link_pointer == -1ull)
+ vmcs12->vmcs_link_pointer == INVALID_GPA)
return;
kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
@@ -1994,7 +1995,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
}
if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
- vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmptr = INVALID_GPA;
nested_release_evmcs(vcpu);
@@ -2178,7 +2179,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
}
if (cpu_has_vmx_encls_vmexit())
- vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
+ vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);
/*
* Set the MSR load/store lists to match L0's settings. Only the
@@ -2197,7 +2198,7 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
{
prepare_vmcs02_constant_state(vmx);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
if (enable_vpid) {
if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
@@ -2583,8 +2584,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* Guest state is invalid and unrestricted guest is disabled,
* which means L1 attempted VMEntry to L2 with invalid state.
* Fail the VMEntry.
+ *
+ * However when force loading the guest state (SMM exit or
+ * loading nested state after migration, it is possible to
+ * have invalid guest state now, which will be later fixed by
+ * restoring L2 register state
*/
- if (CC(!vmx_guest_state_valid(vcpu))) {
+ if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
}
@@ -2944,7 +2950,7 @@ static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
struct vmcs12 *shadow;
struct kvm_host_map map;
- if (vmcs12->vmcs_link_pointer == -1ull)
+ if (vmcs12->vmcs_link_pointer == INVALID_GPA)
return 0;
if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
@@ -3211,7 +3217,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
* Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
* force VM-Entry to fail.
*/
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
}
}
@@ -3522,7 +3528,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
}
if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
- vmx->nested.current_vmptr == -1ull))
+ vmx->nested.current_vmptr == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
vmcs12 = get_vmcs12(vcpu);
@@ -4351,6 +4357,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
+
+ to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
}
static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
@@ -4899,14 +4907,7 @@ out_vmcs02:
return -ENOMEM;
}
-/*
- * Emulate the VMXON instruction.
- * Currently, we just remember that VMX is active, and do not save or even
- * inspect the argument to VMXON (the so-called "VMXON pointer") because we
- * do not currently need to store anything in that guest-allocated memory
- * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
- * argument is different from the VMXON pointer (which the spec says they do).
- */
+/* Emulate the VMXON instruction. */
static int handle_vmon(struct kvm_vcpu *vcpu)
{
int ret;
@@ -4975,7 +4976,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->nested.current_vmptr == -1ull)
+ if (vmx->nested.current_vmptr == INVALID_GPA)
return;
copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
@@ -4995,7 +4996,7 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
- vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmptr = INVALID_GPA;
}
/* Emulate the VMXOFF instruction */
@@ -5090,12 +5091,12 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
return 1;
/*
- * In VMX non-root operation, when the VMCS-link pointer is -1ull,
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
* any VMREAD sets the ALU flags for VMfailInvalid.
*/
- if (vmx->nested.current_vmptr == -1ull ||
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
(is_guest_mode(vcpu) &&
- get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
/* Decode instruction info and find the field to read */
@@ -5182,12 +5183,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return 1;
/*
- * In VMX non-root operation, when the VMCS-link pointer is -1ull,
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
* any VMWRITE sets the ALU flags for VMfailInvalid.
*/
- if (vmx->nested.current_vmptr == -1ull ||
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
(is_guest_mode(vcpu) &&
- get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
if (instr_info & BIT(10))
@@ -5630,7 +5631,7 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
gpa_t bitmap, last_bitmap;
u8 b;
- last_bitmap = (gpa_t)-1;
+ last_bitmap = INVALID_GPA;
b = -1;
while (size > 0) {
@@ -5903,6 +5904,12 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
case EXIT_REASON_VMFUNC:
/* VM functions are emulated through L2->L0 vmexits. */
return true;
+ case EXIT_REASON_BUS_LOCK:
+ /*
+ * At present, bus lock VM exit is never exposed to L1.
+ * Handle L2's bus locks in L0 directly.
+ */
+ return true;
default:
break;
}
@@ -6059,7 +6066,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
goto reflect_vmexit;
}
- trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);
+ trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);
/* If L0 (KVM) wants the exit, it trumps L1's desires. */
if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
@@ -6100,8 +6107,8 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
.format = KVM_STATE_NESTED_FORMAT_VMX,
.size = sizeof(kvm_state),
.hdr.vmx.flags = 0,
- .hdr.vmx.vmxon_pa = -1ull,
- .hdr.vmx.vmcs12_pa = -1ull,
+ .hdr.vmx.vmxon_pa = INVALID_GPA,
+ .hdr.vmx.vmcs12_pa = INVALID_GPA,
.hdr.vmx.preemption_timer_deadline = 0,
};
struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
@@ -6127,7 +6134,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
if (is_guest_mode(vcpu) &&
nested_cpu_has_shadow_vmcs(vmcs12) &&
- vmcs12->vmcs_link_pointer != -1ull)
+ vmcs12->vmcs_link_pointer != INVALID_GPA)
kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
}
@@ -6203,7 +6210,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
return -EFAULT;
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
- vmcs12->vmcs_link_pointer != -1ull) {
+ vmcs12->vmcs_link_pointer != INVALID_GPA) {
if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
get_shadow_vmcs12(vcpu), VMCS12_SIZE))
return -EFAULT;
@@ -6238,11 +6245,11 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
return -EINVAL;
- if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
+ if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
if (kvm_state->hdr.vmx.smm.flags)
return -EINVAL;
- if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
+ if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
return -EINVAL;
/*
@@ -6296,7 +6303,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
vmx_leave_nested(vcpu);
- if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
+ if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
return 0;
vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
@@ -6309,13 +6316,13 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
/* See vmx_has_valid_vmcs12. */
if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
(kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
- (kvm_state->hdr.vmx.vmcs12_pa != -1ull))
+ (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
return -EINVAL;
else
return 0;
}
- if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
+ if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
!page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
return -EINVAL;
@@ -6360,7 +6367,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
ret = -EINVAL;
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
- vmcs12->vmcs_link_pointer != -1ull) {
+ vmcs12->vmcs_link_pointer != INVALID_GPA) {
struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
if (kvm_state->size <
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 10cc4f65c4ef..b8e0d21b7c8a 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -365,7 +365,7 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = pmu->global_ctrl;
return 0;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- msr_info->data = pmu->global_ovf_ctrl;
+ msr_info->data = 0;
return 0;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
@@ -423,7 +423,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & pmu->global_ovf_ctrl_mask)) {
if (!msr_info->host_initiated)
pmu->global_status &= ~data;
- pmu->global_ovf_ctrl = data;
return 0;
}
break;
@@ -588,8 +587,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu)
pmc->counter = 0;
}
- pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
- pmu->global_ovf_ctrl = 0;
+ pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;
intel_pmu_release_guest_lbr_event(vcpu);
}
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 6693ebdc0770..35e7ec91ae86 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -53,11 +53,9 @@ static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
static void sgx_handle_emulation_failure(struct kvm_vcpu *vcpu, u64 addr,
unsigned int size)
{
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = addr;
- vcpu->run->internal.data[1] = size;
+ uint64_t data[2] = { addr, size };
+
+ __kvm_prepare_emulation_failure_exit(vcpu, data, ARRAY_SIZE(data));
}
static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
@@ -112,9 +110,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
* but the error code isn't (yet) plumbed through the ENCLS helpers.
*/
if (trapnr == PF_VECTOR && !boot_cpu_has(X86_FEATURE_SGX2)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -155,9 +151,7 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
sgx_12_0 = kvm_find_cpuid_entry(vcpu, 0x12, 0);
sgx_12_1 = kvm_find_cpuid_entry(vcpu, 0x12, 1);
if (!sgx_12_0 || !sgx_12_1) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0c2c0d5ae873..76861b66bbcf 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -35,7 +35,7 @@
#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
-#include <asm/fpu/internal.h>
+#include <asm/fpu/api.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
@@ -1059,8 +1059,8 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
wrmsrl(MSR_IA32_RTIT_CTL, 0);
- pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
- pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
+ pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
}
}
@@ -1070,12 +1070,16 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
return;
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
- pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
- pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
+ pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
+ pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
}
- /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
- wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ /*
+ * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
+ * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
+ */
+ if (vmx->pt_desc.host.ctl)
+ wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
@@ -1323,7 +1327,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
vmx_prepare_switch_to_host(to_vmx(vcpu));
}
-static bool emulation_required(struct kvm_vcpu *vcpu)
+bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}
@@ -1367,7 +1371,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
- vmx->emulation_required = emulation_required(vcpu);
+ vmx->emulation_required = vmx_emulation_required(vcpu);
}
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
@@ -1456,16 +1460,16 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
* cause a #GP fault.
*/
value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
- if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
+ if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
return 1;
return 0;
@@ -1837,10 +1841,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
&msr_info->data))
return 1;
/*
- * Enlightened VMCS v1 doesn't have certain fields, but buggy
- * Hyper-V versions are still trying to use corresponding
- * features when they are exposed. Filter out the essential
- * minimum.
+ * Enlightened VMCS v1 doesn't have certain VMCS fields but
+ * instead of just ignoring the features, different Hyper-V
+ * versions are either trying to use them and fail or do some
+ * sanity checking and refuse to boot. Filter all unsupported
+ * features out.
*/
if (!msr_info->host_initiated &&
vmx->nested.enlightened_vmcs_enabled)
@@ -1885,8 +1890,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
if (!vmx_pt_mode_is_host_guest() ||
- (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges)))
+ (index >= 2 * vmx->pt_desc.num_address_ranges))
return 1;
if (index % 2)
msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
@@ -2201,8 +2205,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!pt_can_write_msr(vmx))
return 1;
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
- if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges))
+ if (index >= 2 * vmx->pt_desc.num_address_ranges)
return 1;
if (is_noncanonical_address(data, vcpu))
return 1;
@@ -3077,7 +3080,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
/* depends on vcpu->arch.cr0 to be set to a new value */
- vmx->emulation_required = emulation_required(vcpu);
+ vmx->emulation_required = vmx_emulation_required(vcpu);
}
static int vmx_get_max_tdp_level(void)
@@ -3330,7 +3333,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int
{
__vmx_set_segment(vcpu, var, seg);
- to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
+ to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3878,7 +3881,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
- for (i = 0; i < vmx->pt_desc.addr_range; i++) {
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) {
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
}
@@ -4327,10 +4330,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
#define VMX_XSS_EXIT_BITMAP 0
-/*
- * Noting that the initialization of Guest-state Area of VMCS is in
- * vmx_vcpu_reset().
- */
static void init_vmcs(struct vcpu_vmx *vmx)
{
if (nested)
@@ -4339,7 +4338,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
- vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
+ vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */
/* Control */
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
@@ -4435,10 +4434,40 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmx_setup_uret_msrs(vmx);
}
+static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ init_vmcs(vmx);
+
+ if (nested)
+ memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
+
+ vcpu_setup_sgx_lepubkeyhash(vcpu);
+
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.vmxon_ptr = INVALID_GPA;
+ vmx->nested.current_vmptr = INVALID_GPA;
+ vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
+
+ vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
+
+ /*
+ * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
+ * or POSTED_INTR_WAKEUP_VECTOR.
+ */
+ vmx->pi_desc.nv = POSTED_INTR_VECTOR;
+ vmx->pi_desc.sn = 1;
+}
+
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (!init_event)
+ __vmx_vcpu_reset(vcpu);
+
vmx->rmode.vm86_active = 0;
vmx->spec_ctrl = 0;
@@ -4448,6 +4477,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_set_cr8(vcpu, 0);
vmx_segment_cache_clear(vmx);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS);
seg_setup(VCPU_SREG_CS);
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
@@ -5378,10 +5408,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (vmx->emulation_required && !vmx->rmode.vm86_active &&
vcpu->arch.exception.pending) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
- KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -5561,9 +5588,13 @@ static int handle_encls(struct kvm_vcpu *vcpu)
static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
{
- vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
- vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK;
- return 0;
+ /*
+ * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
+ * VM-Exits. Unconditionally set the flag here and leave the handling to
+ * vmx_handle_exit().
+ */
+ to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
+ return 1;
}
/*
@@ -5628,11 +5659,13 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2,
u32 *intr_info, u32 *error_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ *reason = vmx->exit_reason.full;
*info1 = vmx_get_exit_qual(vcpu);
if (!(vmx->exit_reason.failed_vmentry)) {
*info2 = vmx->idt_vectoring_info;
@@ -6050,9 +6083,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
int ret = __vmx_handle_exit(vcpu, exit_fastpath);
/*
- * Even when current exit reason is handled by KVM internally, we
- * still need to exit to user space when bus lock detected to inform
- * that there is a bus lock in guest.
+ * Exit to user space when bus lock detected to inform that there is
+ * a bus lock in guest.
*/
if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
if (ret > 0)
@@ -6301,18 +6333,13 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
/*
* If we are running L2 and L1 has a new pending interrupt
- * which can be injected, we should re-evaluate
- * what should be done with this new L1 interrupt.
- * If L1 intercepts external-interrupts, we should
- * exit from L2 to L1. Otherwise, interrupt should be
- * delivered directly to L2.
+ * which can be injected, this may cause a vmexit or it may
+ * be injected into L2. Either way, this interrupt will be
+ * processed via KVM_REQ_EVENT, not RVI, because we do not use
+ * virtual interrupt delivery to inject L1 interrupts into L2.
*/
- if (is_guest_mode(vcpu) && max_irr_updated) {
- if (nested_exit_on_intr(vcpu))
- kvm_vcpu_exiting_guest_mode(vcpu);
- else
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
+ if (is_guest_mode(vcpu) && max_irr_updated)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
}
@@ -6407,6 +6434,7 @@ static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
return nested;
case MSR_AMD64_VIRT_SPEC_CTRL:
+ case MSR_AMD64_TSC_RATIO:
/* This is AMD only. */
return false;
default:
@@ -6621,10 +6649,24 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->soft_vnmi_blocked))
vmx->loaded_vmcs->entry_time = ktime_get();
- /* Don't enter VMX if guest state is invalid, let the exit handler
- start emulation until we arrive back to a valid state */
- if (vmx->emulation_required)
+ /*
+ * Don't enter VMX if guest state is invalid, let the exit handler
+ * start emulation until we arrive back to a valid state. Synthesize a
+ * consistency check VM-Exit due to invalid guest state and bail.
+ */
+ if (unlikely(vmx->emulation_required)) {
+
+ /* We don't emulate invalid state of a nested guest */
+ vmx->fail = is_guest_mode(vcpu);
+
+ vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
+ vmx->exit_reason.failed_vmentry = 1;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+ vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+ vmx->exit_intr_info = 0;
return EXIT_FASTPATH_NONE;
+ }
trace_kvm_entry(vcpu);
@@ -6769,7 +6811,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (likely(!vmx->exit_reason.failed_vmentry))
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
- trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
+ trace_kvm_exit(vcpu, KVM_ISA_VMX);
if (unlikely(vmx->exit_reason.failed_vmentry))
return EXIT_FASTPATH_NONE;
@@ -6800,7 +6842,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
- int i, cpu, err;
+ int i, err;
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu);
@@ -6821,10 +6863,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vpid;
}
- for (i = 0; i < kvm_nr_uret_msrs; ++i) {
- vmx->guest_uret_msrs[i].data = 0;
+ for (i = 0; i < kvm_nr_uret_msrs; ++i)
vmx->guest_uret_msrs[i].mask = -1ull;
- }
if (boot_cpu_has(X86_FEATURE_RTM)) {
/*
* TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
@@ -6833,7 +6873,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
*/
tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (tsx_ctrl)
- vmx->guest_uret_msrs[i].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+ tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
}
err = alloc_loaded_vmcs(&vmx->vmcs01);
@@ -6861,12 +6901,7 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
}
vmx->loaded_vmcs = &vmx->vmcs01;
- cpu = get_cpu();
- vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
- init_vmcs(vmx);
- vmx_vcpu_put(vcpu);
- put_cpu();
+
if (cpu_need_virtualize_apic_accesses(vcpu)) {
err = alloc_apic_access_page(vcpu->kvm);
if (err)
@@ -6879,27 +6914,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
goto free_vmcs;
}
- if (nested)
- memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
- else
- memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
-
- vcpu_setup_sgx_lepubkeyhash(vcpu);
-
- vmx->nested.posted_intr_nv = -1;
- vmx->nested.current_vmptr = -1ull;
- vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
-
- vcpu->arch.microcode_version = 0x100000000ULL;
- vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
-
- /*
- * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
- * or POSTED_INTR_WAKEUP_VECTOR.
- */
- vmx->pi_desc.nv = POSTED_INTR_VECTOR;
- vmx->pi_desc.sn = 1;
-
return 0;
free_vmcs:
@@ -7114,12 +7128,13 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
}
/* Get the number of configurable Address Ranges for filtering */
- vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
+ vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges);
/* Initialize and clear the no dependency bits */
vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
- RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);
+ RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC |
+ RTIT_CTL_BRANCH_EN);
/*
* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
@@ -7137,12 +7152,11 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);
/*
- * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and
- * MTCFreq can be set
+ * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
*/
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
- RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);
+ RTIT_CTL_MTC_RANGE);
/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
@@ -7162,7 +7176,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
/* unmask address range configure area */
- for (i = 0; i < vmx->pt_desc.addr_range; i++)
+ for (i = 0; i < vmx->pt_desc.num_address_ranges; i++)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
@@ -7538,6 +7552,8 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
static void hardware_unsetup(void)
{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
if (nested)
nested_vmx_hardware_unsetup();
@@ -7553,6 +7569,8 @@ static bool vmx_check_apicv_inhibit_reasons(ulong bit)
}
static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .name = "kvm_intel",
+
.hardware_unsetup = hardware_unsetup,
.hardware_enable = hardware_enable,
@@ -7866,8 +7884,6 @@ static __init int hardware_setup(void)
vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
-
kvm_mce_cap_supported |= MCG_LMCE_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
@@ -7891,6 +7907,9 @@ static __init int hardware_setup(void)
r = alloc_kvm_area();
if (r)
nested_vmx_hardware_unsetup();
+
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+
return r;
}
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 4858c5fd95f2..e7db42e3b0ce 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -62,7 +62,7 @@ struct pt_ctx {
struct pt_desc {
u64 ctl_bitmask;
- u32 addr_range;
+ u32 num_address_ranges;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
struct pt_ctx host;
struct pt_ctx guest;
@@ -248,12 +248,8 @@ struct vcpu_vmx {
* only loaded into hardware when necessary, e.g. SYSCALL #UDs outside
* of 64-bit mode or if EFER.SCE=1, thus the SYSCALL MSRs don't need to
* be loaded into hardware if those conditions aren't met.
- * nr_active_uret_msrs tracks the number of MSRs that need to be loaded
- * into hardware when running the guest. guest_uret_msrs[] is resorted
- * whenever the number of "active" uret MSRs is modified.
*/
struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
- int nr_active_uret_msrs;
bool guest_uret_msrs_loaded;
#ifdef CONFIG_X86_64
u64 msr_host_kernel_gs_base;
@@ -359,6 +355,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
unsigned long fs_base, unsigned long gs_base);
int vmx_get_cpl(struct kvm_vcpu *vcpu);
+bool vmx_emulation_required(struct kvm_vcpu *vcpu);
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 28ef14155726..c1c4e2b05a63 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -68,7 +68,9 @@
#include <asm/mce.h>
#include <asm/pkru.h>
#include <linux/kernel_stat.h>
-#include <asm/fpu/internal.h> /* Ugh! */
+#include <asm/fpu/api.h>
+#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>
@@ -293,8 +295,6 @@ u64 __read_mostly host_xcr0;
u64 __read_mostly supported_xcr0;
EXPORT_SYMBOL_GPL(supported_xcr0);
-static struct kmem_cache *x86_fpu_cache;
-
static struct kmem_cache *x86_emulator_cache;
/*
@@ -790,30 +790,6 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
}
EXPORT_SYMBOL_GPL(kvm_require_dr);
-/*
- * This function will be used to read from the physical memory of the currently
- * running guest. The difference to kvm_vcpu_read_guest_page is that this function
- * can read from guest physical or from the guest's guest physical memory.
- */
-int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gfn_t ngfn, void *data, int offset, int len,
- u32 access)
-{
- struct x86_exception exception;
- gfn_t real_gfn;
- gpa_t ngpa;
-
- ngpa = gfn_to_gpa(ngfn);
- real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
- if (real_gfn == UNMAPPED_GVA)
- return -EFAULT;
-
- real_gfn = gpa_to_gfn(real_gfn);
-
- return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
-}
-EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@ -825,34 +801,38 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
- unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
+ gpa_t real_gpa;
int i;
int ret;
u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
- ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
- offset * sizeof(u64), sizeof(pdpte),
- PFERR_USER_MASK|PFERR_WRITE_MASK);
- if (ret < 0) {
- ret = 0;
- goto out;
- }
+ /*
+ * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
+ * to an L1 GPA.
+ */
+ real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(pdpt_gfn),
+ PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
+ if (real_gpa == UNMAPPED_GVA)
+ return 0;
+
+ /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
+ cr3 & GENMASK(11, 5), sizeof(pdpte));
+ if (ret < 0)
+ return 0;
+
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if ((pdpte[i] & PT_PRESENT_MASK) &&
(pdpte[i] & pdptr_rsvd_bits(vcpu))) {
- ret = 0;
- goto out;
+ return 0;
}
}
- ret = 1;
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
vcpu->arch.pdptrs_from_userspace = false;
-out:
-
- return ret;
+ return 1;
}
EXPORT_SYMBOL_GPL(load_pdptrs);
@@ -993,7 +973,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
/*
* Do not allow the guest to set bits that we do not support
* saving. However, xcr0 bit 0 is always set, even if the
- * emulated CPU does not support XSAVE (see fx_init).
+ * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
*/
valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
if (xcr0 & ~valid_bits)
@@ -1042,9 +1022,28 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
{
- if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) ||
- (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
+ /*
+ * If any role bit is changed, the MMU needs to be reset.
+ *
+ * If CR4.PCIDE is changed 1 -> 0, the guest TLB must be flushed.
+ * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
+ * according to the SDM; however, stale prev_roots could be reused
+ * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
+ * free them all. KVM_REQ_MMU_RELOAD is fit for the both cases; it
+ * is slow, but changing CR4.PCIDE is a rare case.
+ *
+ * If CR4.PGE is changed, the guest TLB must be flushed.
+ *
+ * Note: resetting MMU is a superset of KVM_REQ_MMU_RELOAD and
+ * KVM_REQ_MMU_RELOAD is a superset of KVM_REQ_TLB_FLUSH_GUEST, hence
+ * the usage of "else if".
+ */
+ if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
+ else if ((cr4 ^ old_cr4) & X86_CR4_PCIDE)
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ else if ((cr4 ^ old_cr4) & X86_CR4_PGE)
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
@@ -1092,6 +1091,18 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
int i;
/*
+ * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
+ * this is reachable when running EPT=1 and unrestricted_guest=0, and
+ * also via the emulator. KVM's TDP page tables are not in the scope of
+ * the invalidation, but the guest's TLB entries need to be flushed as
+ * the CPU may have cached entries in its TLB for the target PCID.
+ */
+ if (unlikely(tdp_enabled)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
+ }
+
+ /*
* If neither the current CR3 nor any of the prev_roots use the given
* PCID, then nothing needs to be done here because a resync will
* happen anyway before switching to any other CR3.
@@ -1101,6 +1112,14 @@ static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
+ /*
+ * If PCID is disabled, there is no need to free prev_roots even if the
+ * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
+ * with PCIDE=0.
+ */
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ return;
+
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
@@ -1332,6 +1351,13 @@ static const u32 msrs_to_save_all[] = {
MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
+
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
};
static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
@@ -1374,6 +1400,7 @@ static const u32 emulated_msrs_all[] = {
MSR_PLATFORM_INFO,
MSR_MISC_FEATURES_ENABLES,
MSR_AMD64_VIRT_SPEC_CTRL,
+ MSR_AMD64_TSC_RATIO,
MSR_IA32_POWER_CTL,
MSR_IA32_UCODE_REV,
@@ -2447,13 +2474,64 @@ static inline bool kvm_check_tsc_unstable(void)
return check_tsc_unstable();
}
+/*
+ * Infers attempts to synchronize the guest's tsc from host writes. Sets the
+ * offset for the vcpu and tracks the TSC matching generation that the vcpu
+ * participates in.
+ */
+static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
+ u64 ns, bool matched)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ lockdep_assert_held(&kvm->arch.tsc_write_lock);
+
+ /*
+ * We also track th most recent recorded KHZ, write and time to
+ * allow the matching interval to be extended at each write.
+ */
+ kvm->arch.last_tsc_nsec = ns;
+ kvm->arch.last_tsc_write = tsc;
+ kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ kvm->arch.last_tsc_offset = offset;
+
+ vcpu->arch.last_guest_tsc = tsc;
+
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+
+ if (!matched) {
+ /*
+ * We split periods of matched TSC writes into generations.
+ * For each generation, we track the original measured
+ * nanosecond time, offset, and write, so if TSCs are in
+ * sync, we can match exact offset, and if not, we can match
+ * exact software computation in compute_guest_tsc()
+ *
+ * These values are tracked in kvm->arch.cur_xxx variables.
+ */
+ kvm->arch.cur_tsc_generation++;
+ kvm->arch.cur_tsc_nsec = ns;
+ kvm->arch.cur_tsc_write = tsc;
+ kvm->arch.cur_tsc_offset = offset;
+ kvm->arch.nr_vcpus_matched_tsc = 0;
+ } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
+ kvm->arch.nr_vcpus_matched_tsc++;
+ }
+
+ /* Keep track of which generation this VCPU has synchronized to */
+ vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
+ vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
+ vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+
+ kvm_track_tsc_matching(vcpu);
+}
+
static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
- bool matched;
- bool already_matched;
+ bool matched = false;
bool synchronizing = false;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -2499,51 +2577,10 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
offset = kvm_compute_l1_tsc_offset(vcpu, data);
}
matched = true;
- already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
- } else {
- /*
- * We split periods of matched TSC writes into generations.
- * For each generation, we track the original measured
- * nanosecond time, offset, and write, so if TSCs are in
- * sync, we can match exact offset, and if not, we can match
- * exact software computation in compute_guest_tsc()
- *
- * These values are tracked in kvm->arch.cur_xxx variables.
- */
- kvm->arch.cur_tsc_generation++;
- kvm->arch.cur_tsc_nsec = ns;
- kvm->arch.cur_tsc_write = data;
- kvm->arch.cur_tsc_offset = offset;
- matched = false;
}
- /*
- * We also track th most recent recorded KHZ, write and time to
- * allow the matching interval to be extended at each write.
- */
- kvm->arch.last_tsc_nsec = ns;
- kvm->arch.last_tsc_write = data;
- kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
-
- vcpu->arch.last_guest_tsc = data;
-
- /* Keep track of which generation this VCPU has synchronized to */
- vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
- vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
- vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
-
- kvm_vcpu_write_tsc_offset(vcpu, offset);
+ __kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
-
- spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags);
- if (!matched) {
- kvm->arch.nr_vcpus_matched_tsc = 0;
- } else if (!already_matched) {
- kvm->arch.nr_vcpus_matched_tsc++;
- }
-
- kvm_track_tsc_matching(vcpu);
- spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags);
}
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
@@ -2731,6 +2768,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
int vclock_mode;
bool host_tsc_clocksource, vcpus_matched;
+ lockdep_assert_held(&kvm->arch.tsc_write_lock);
vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
atomic_read(&kvm->online_vcpus));
@@ -2755,68 +2793,101 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
#endif
}
-void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+static void kvm_make_mclock_inprogress_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
}
-static void kvm_gen_update_masterclock(struct kvm *kvm)
+static void __kvm_start_pvclock_update(struct kvm *kvm)
{
-#ifdef CONFIG_X86_64
- int i;
- struct kvm_vcpu *vcpu;
- struct kvm_arch *ka = &kvm->arch;
- unsigned long flags;
-
- kvm_hv_invalidate_tsc_page(kvm);
+ raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
+ write_seqcount_begin(&kvm->arch.pvclock_sc);
+}
+static void kvm_start_pvclock_update(struct kvm *kvm)
+{
kvm_make_mclock_inprogress_request(kvm);
/* no guest entries from this point */
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
- pvclock_update_vm_gtod_copy(kvm);
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ __kvm_start_pvclock_update(kvm);
+}
+
+static void kvm_end_pvclock_update(struct kvm *kvm)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_vcpu *vcpu;
+ int i;
+ write_seqcount_end(&ka->pvclock_sc);
+ raw_spin_unlock_irq(&ka->tsc_write_lock);
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
/* guest entries allowed */
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
-#endif
}
-u64 get_kvmclock_ns(struct kvm *kvm)
+static void kvm_update_masterclock(struct kvm *kvm)
+{
+ kvm_hv_invalidate_tsc_page(kvm);
+ kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+ kvm_end_pvclock_update(kvm);
+}
+
+/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */
+static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
{
struct kvm_arch *ka = &kvm->arch;
struct pvclock_vcpu_time_info hv_clock;
- unsigned long flags;
- u64 ret;
-
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
- if (!ka->use_master_clock) {
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
- return get_kvmclock_base_ns() + ka->kvmclock_offset;
- }
-
- hv_clock.tsc_timestamp = ka->master_cycle_now;
- hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
/* both __this_cpu_read() and rdtsc() should be on the same cpu */
get_cpu();
- if (__this_cpu_read(cpu_tsc_khz)) {
+ data->flags = 0;
+ if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) {
+#ifdef CONFIG_X86_64
+ struct timespec64 ts;
+
+ if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
+ data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+ data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
+ } else
+#endif
+ data->host_tsc = rdtsc();
+
+ data->flags |= KVM_CLOCK_TSC_STABLE;
+ hv_clock.tsc_timestamp = ka->master_cycle_now;
+ hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
- ret = __pvclock_read_cycles(&hv_clock, rdtsc());
- } else
- ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
+ data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
+ } else {
+ data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+ }
put_cpu();
+}
- return ret;
+static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+ __get_kvmclock(kvm, data);
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+ struct kvm_clock_data data;
+
+ get_kvmclock(kvm, &data);
+ return data.clock;
}
static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
@@ -2881,6 +2952,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, tgt_tsc_khz;
+ unsigned seq;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
@@ -2895,13 +2967,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
* If the host uses TSC clock, then passthrough TSC as stable
* to the guest.
*/
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
- use_master_clock = ka->use_master_clock;
- if (use_master_clock) {
- host_tsc = ka->master_cycle_now;
- kernel_ns = ka->master_kernel_ns;
- }
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
+ do {
+ seq = read_seqcount_begin(&ka->pvclock_sc);
+ use_master_clock = ka->use_master_clock;
+ if (use_master_clock) {
+ host_tsc = ka->master_cycle_now;
+ kernel_ns = ka->master_kernel_ns;
+ }
+ } while (read_seqcount_retry(&ka->pvclock_sc, seq));
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
@@ -2969,7 +3042,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
offsetof(struct compat_vcpu_info, time));
if (vcpu->xen.vcpu_time_info_set)
kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
- if (v == kvm_get_vcpu(v->kvm, 0))
+ if (!v->vcpu_idx)
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -3172,15 +3245,14 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
++vcpu->stat.tlb_flush;
if (!tdp_enabled) {
- /*
+ /*
* A TLB flush on behalf of the guest is equivalent to
* INVPCID(all), toggling CR4.PGE, etc., which requires
- * a forced sync of the shadow page tables. Unload the
- * entire MMU here and the subsequent load will sync the
- * shadow page tables, and also flush the TLB.
+ * a forced sync of the shadow page tables. Ensure all the
+ * roots are synced and the guest TLB in hardware is clean.
*/
- kvm_mmu_unload(vcpu);
- return;
+ kvm_mmu_sync_roots(vcpu);
+ kvm_mmu_sync_prev_roots(vcpu);
}
static_call(kvm_x86_tlb_flush_guest)(vcpu);
@@ -4021,6 +4093,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
case KVM_CAP_SREGS2:
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+ case KVM_CAP_VCPU_ATTRIBUTES:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -4041,7 +4114,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_SYNC_X86_VALID_FIELDS;
break;
case KVM_CAP_ADJUST_CLOCK:
- r = KVM_CLOCK_TSC_STABLE;
+ r = KVM_CLOCK_VALID_FLAGS;
break;
case KVM_CAP_X86_DISABLE_EXITS:
r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
@@ -4070,7 +4143,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_MAX_VCPUS;
break;
case KVM_CAP_MAX_VCPU_ID:
- r = KVM_MAX_VCPU_ID;
+ r = KVM_MAX_VCPU_IDS;
break;
case KVM_CAP_PV_MMU: /* obsolete */
r = 0;
@@ -4693,144 +4766,27 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
return 0;
}
-#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
-
-static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
-{
- struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
- u64 xstate_bv = xsave->header.xfeatures;
- u64 valid;
-
- /*
- * Copy legacy XSAVE area, to avoid complications with CPUID
- * leaves 0 and 1 in the loop below.
- */
- memcpy(dest, xsave, XSAVE_HDR_OFFSET);
-
- /* Set XSTATE_BV */
- xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE;
- *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
-
- /*
- * Copy each region from the possibly compacted offset to the
- * non-compacted offset.
- */
- valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
- while (valid) {
- u32 size, offset, ecx, edx;
- u64 xfeature_mask = valid & -valid;
- int xfeature_nr = fls64(xfeature_mask) - 1;
- void *src;
-
- cpuid_count(XSTATE_CPUID, xfeature_nr,
- &size, &offset, &ecx, &edx);
-
- if (xfeature_nr == XFEATURE_PKRU) {
- memcpy(dest + offset, &vcpu->arch.pkru,
- sizeof(vcpu->arch.pkru));
- } else {
- src = get_xsave_addr(xsave, xfeature_nr);
- if (src)
- memcpy(dest + offset, src, size);
- }
-
- valid -= xfeature_mask;
- }
-}
-
-static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
-{
- struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
- u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
- u64 valid;
-
- /*
- * Copy legacy XSAVE area, to avoid complications with CPUID
- * leaves 0 and 1 in the loop below.
- */
- memcpy(xsave, src, XSAVE_HDR_OFFSET);
-
- /* Set XSTATE_BV and possibly XCOMP_BV. */
- xsave->header.xfeatures = xstate_bv;
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
-
- /*
- * Copy each region from the non-compacted offset to the
- * possibly compacted offset.
- */
- valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
- while (valid) {
- u32 size, offset, ecx, edx;
- u64 xfeature_mask = valid & -valid;
- int xfeature_nr = fls64(xfeature_mask) - 1;
-
- cpuid_count(XSTATE_CPUID, xfeature_nr,
- &size, &offset, &ecx, &edx);
-
- if (xfeature_nr == XFEATURE_PKRU) {
- memcpy(&vcpu->arch.pkru, src + offset,
- sizeof(vcpu->arch.pkru));
- } else {
- void *dest = get_xsave_addr(xsave, xfeature_nr);
-
- if (dest)
- memcpy(dest, src + offset, size);
- }
-
- valid -= xfeature_mask;
- }
-}
-
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- if (!vcpu->arch.guest_fpu)
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return;
- if (boot_cpu_has(X86_FEATURE_XSAVE)) {
- memset(guest_xsave, 0, sizeof(struct kvm_xsave));
- fill_xsave((u8 *) guest_xsave->region, vcpu);
- } else {
- memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu->state.fxsave,
- sizeof(struct fxregs_state));
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
- XFEATURE_MASK_FPSSE;
- }
+ fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
+ guest_xsave->region,
+ sizeof(guest_xsave->region),
+ vcpu->arch.pkru);
}
-#define XSAVE_MXCSR_OFFSET 24
-
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
- u64 xstate_bv;
- u32 mxcsr;
-
- if (!vcpu->arch.guest_fpu)
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return 0;
- xstate_bv = *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
- mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
-
- if (boot_cpu_has(X86_FEATURE_XSAVE)) {
- /*
- * Here we allow setting states that are not present in
- * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
- * with old userspace.
- */
- if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
- return -EINVAL;
- load_xsave(vcpu, (u8 *)guest_xsave->region);
- } else {
- if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
- mxcsr & ~mxcsr_feature_mask)
- return -EINVAL;
- memcpy(&vcpu->arch.guest_fpu->state.fxsave,
- guest_xsave->region, sizeof(struct fxregs_state));
- }
- return 0;
+ return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
+ guest_xsave->region,
+ supported_xcr0, &vcpu->arch.pkru);
}
static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
@@ -4885,6 +4841,115 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
return 0;
}
+static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ int r;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET:
+ r = 0;
+ break;
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+ int r;
+
+ if ((u64)(unsigned long)uaddr != attr->addr)
+ return -EFAULT;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET:
+ r = -EFAULT;
+ if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
+ break;
+ r = 0;
+ break;
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
+ struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+ struct kvm *kvm = vcpu->kvm;
+ int r;
+
+ if ((u64)(unsigned long)uaddr != attr->addr)
+ return -EFAULT;
+
+ switch (attr->attr) {
+ case KVM_VCPU_TSC_OFFSET: {
+ u64 offset, tsc, ns;
+ unsigned long flags;
+ bool matched;
+
+ r = -EFAULT;
+ if (get_user(offset, uaddr))
+ break;
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
+
+ matched = (vcpu->arch.virtual_tsc_khz &&
+ kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
+ kvm->arch.last_tsc_offset == offset);
+
+ tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
+ ns = get_kvmclock_base_ns();
+
+ __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ r = 0;
+ break;
+ }
+ default:
+ r = -ENXIO;
+ }
+
+ return r;
+}
+
+static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu,
+ unsigned int ioctl,
+ void __user *argp)
+{
+ struct kvm_device_attr attr;
+ int r;
+
+ if (copy_from_user(&attr, argp, sizeof(attr)))
+ return -EFAULT;
+
+ if (attr.group != KVM_VCPU_TSC_CTRL)
+ return -ENXIO;
+
+ switch (ioctl) {
+ case KVM_HAS_DEVICE_ATTR:
+ r = kvm_arch_tsc_has_attr(vcpu, &attr);
+ break;
+ case KVM_GET_DEVICE_ATTR:
+ r = kvm_arch_tsc_get_attr(vcpu, &attr);
+ break;
+ case KVM_SET_DEVICE_ATTR:
+ r = kvm_arch_tsc_set_attr(vcpu, &attr);
+ break;
+ }
+
+ return r;
+}
+
static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
struct kvm_enable_cap *cap)
{
@@ -5339,6 +5404,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = __set_sregs2(vcpu, u.sregs2);
break;
}
+ case KVM_HAS_DEVICE_ATTR:
+ case KVM_GET_DEVICE_ATTR:
+ case KVM_SET_DEVICE_ATTR:
+ r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
+ break;
default:
r = -EINVAL;
}
@@ -5822,6 +5892,63 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_clock_data data = { 0 };
+
+ get_kvmclock(kvm, &data);
+ if (copy_to_user(argp, &data, sizeof(data)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_arch *ka = &kvm->arch;
+ struct kvm_clock_data data;
+ u64 now_raw_ns;
+
+ if (copy_from_user(&data, argp, sizeof(data)))
+ return -EFAULT;
+
+ /*
+ * Only KVM_CLOCK_REALTIME is used, but allow passing the
+ * result of KVM_GET_CLOCK back to KVM_SET_CLOCK.
+ */
+ if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
+ return -EINVAL;
+
+ kvm_hv_invalidate_tsc_page(kvm);
+ kvm_start_pvclock_update(kvm);
+ pvclock_update_vm_gtod_copy(kvm);
+
+ /*
+ * This pairs with kvm_guest_time_update(): when masterclock is
+ * in use, we use master_kernel_ns + kvmclock_offset to set
+ * unsigned 'system_time' so if we use get_kvmclock_ns() (which
+ * is slightly ahead) here we risk going negative on unsigned
+ * 'system_time' when 'data.clock' is very small.
+ */
+ if (data.flags & KVM_CLOCK_REALTIME) {
+ u64 now_real_ns = ktime_get_real_ns();
+
+ /*
+ * Avoid stepping the kvmclock backwards.
+ */
+ if (now_real_ns > data.realtime)
+ data.clock += now_real_ns - data.realtime;
+ }
+
+ if (ka->use_master_clock)
+ now_raw_ns = ka->master_kernel_ns;
+ else
+ now_raw_ns = get_kvmclock_base_ns();
+ ka->kvmclock_offset = data.clock - now_raw_ns;
+ kvm_end_pvclock_update(kvm);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -6065,60 +6192,12 @@ set_pit2_out:
break;
}
#endif
- case KVM_SET_CLOCK: {
- struct kvm_arch *ka = &kvm->arch;
- struct kvm_clock_data user_ns;
- u64 now_ns;
-
- r = -EFAULT;
- if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
- goto out;
-
- r = -EINVAL;
- if (user_ns.flags)
- goto out;
-
- r = 0;
- /*
- * TODO: userspace has to take care of races with VCPU_RUN, so
- * kvm_gen_update_masterclock() can be cut down to locked
- * pvclock_update_vm_gtod_copy().
- */
- kvm_gen_update_masterclock(kvm);
-
- /*
- * This pairs with kvm_guest_time_update(): when masterclock is
- * in use, we use master_kernel_ns + kvmclock_offset to set
- * unsigned 'system_time' so if we use get_kvmclock_ns() (which
- * is slightly ahead) here we risk going negative on unsigned
- * 'system_time' when 'user_ns.clock' is very small.
- */
- spin_lock_irq(&ka->pvclock_gtod_sync_lock);
- if (kvm->arch.use_master_clock)
- now_ns = ka->master_kernel_ns;
- else
- now_ns = get_kvmclock_base_ns();
- ka->kvmclock_offset = user_ns.clock - now_ns;
- spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
-
- kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
+ case KVM_SET_CLOCK:
+ r = kvm_vm_ioctl_set_clock(kvm, argp);
break;
- }
- case KVM_GET_CLOCK: {
- struct kvm_clock_data user_ns;
- u64 now_ns;
-
- now_ns = get_kvmclock_ns(kvm);
- user_ns.clock = now_ns;
- user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
- memset(&user_ns.pad, 0, sizeof(user_ns.pad));
-
- r = -EFAULT;
- if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
- goto out;
- r = 0;
+ case KVM_GET_CLOCK:
+ r = kvm_vm_ioctl_get_clock(kvm, argp);
break;
- }
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
if (kvm_x86_ops.mem_enc_op)
@@ -6899,7 +6978,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
}
static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
- unsigned short port, void *val,
+ unsigned short port,
unsigned int count, bool in)
{
vcpu->arch.pio.port = port;
@@ -6907,10 +6986,8 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
vcpu->arch.pio.count = count;
vcpu->arch.pio.size = size;
- if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
- vcpu->arch.pio.count = 0;
+ if (!kernel_pio(vcpu, vcpu->arch.pio_data))
return 1;
- }
vcpu->run->exit_reason = KVM_EXIT_IO;
vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -6922,26 +6999,39 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
return 0;
}
-static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
- unsigned short port, void *val, unsigned int count)
+static int __emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, unsigned int count)
{
- int ret;
+ WARN_ON(vcpu->arch.pio.count);
+ memset(vcpu->arch.pio_data, 0, size * count);
+ return emulator_pio_in_out(vcpu, size, port, count, true);
+}
- if (vcpu->arch.pio.count)
- goto data_avail;
+static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val)
+{
+ int size = vcpu->arch.pio.size;
+ unsigned count = vcpu->arch.pio.count;
+ memcpy(val, vcpu->arch.pio_data, size * count);
+ trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
+ vcpu->arch.pio.count = 0;
+}
- memset(vcpu->arch.pio_data, 0, size * count);
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *val, unsigned int count)
+{
+ if (vcpu->arch.pio.count) {
+ /* Complete previous iteration. */
+ } else {
+ int r = __emulator_pio_in(vcpu, size, port, count);
+ if (!r)
+ return r;
- ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
- if (ret) {
-data_avail:
- memcpy(val, vcpu->arch.pio_data, size * count);
- trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
- vcpu->arch.pio.count = 0;
- return 1;
+ /* Results already available, fall through. */
}
- return 0;
+ WARN_ON(count != vcpu->arch.pio.count);
+ complete_emulator_pio_in(vcpu, val);
+ return 1;
}
static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
@@ -6956,9 +7046,15 @@ static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port, const void *val,
unsigned int count)
{
+ int ret;
+
memcpy(vcpu->arch.pio_data, val, size * count);
trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
- return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
+ ret = emulator_pio_in_out(vcpu, size, port, count, false);
+ if (ret)
+ vcpu->arch.pio.count = 0;
+
+ return ret;
}
static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
@@ -7468,29 +7564,78 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
-static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
+ u8 ndata, u8 *insn_bytes, u8 insn_size)
{
- struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
- u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
struct kvm_run *run = vcpu->run;
+ u64 info[5];
+ u8 info_start;
+
+ /*
+ * Zero the whole array used to retrieve the exit info, as casting to
+ * u32 for select entries will leave some chunks uninitialized.
+ */
+ memset(&info, 0, sizeof(info));
+
+ static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
+ &info[2], (u32 *)&info[3],
+ (u32 *)&info[4]);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
- run->emulation_failure.ndata = 0;
+
+ /*
+ * There's currently space for 13 entries, but 5 are used for the exit
+ * reason and info. Restrict to 4 to reduce the maintenance burden
+ * when expanding kvm_run.emulation_failure in the future.
+ */
+ if (WARN_ON_ONCE(ndata > 4))
+ ndata = 4;
+
+ /* Always include the flags as a 'data' entry. */
+ info_start = 1;
run->emulation_failure.flags = 0;
if (insn_size) {
- run->emulation_failure.ndata = 3;
+ BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
+ sizeof(run->emulation_failure.insn_bytes) != 16));
+ info_start += 2;
run->emulation_failure.flags |=
KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
run->emulation_failure.insn_size = insn_size;
memset(run->emulation_failure.insn_bytes, 0x90,
sizeof(run->emulation_failure.insn_bytes));
- memcpy(run->emulation_failure.insn_bytes,
- ctxt->fetch.data, insn_size);
+ memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
}
+
+ memcpy(&run->internal.data[info_start], info, sizeof(info));
+ memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
+ ndata * sizeof(data[0]));
+
+ run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
}
+static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
+ ctxt->fetch.end - ctxt->fetch.data);
+}
+
+void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
+ u8 ndata)
+{
+ prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
+
+void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+{
+ __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
+
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
struct kvm *kvm = vcpu->kvm;
@@ -7505,16 +7650,14 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
if (kvm->arch.exit_on_emulation_error ||
(emulation_type & EMULTYPE_SKIP)) {
- prepare_emulation_failure_exit(vcpu);
+ prepare_emulation_ctxt_failure_exit(vcpu);
return 0;
}
kvm_queue_exception(vcpu, UD_VECTOR);
if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ prepare_emulation_ctxt_failure_exit(vcpu);
return 0;
}
@@ -7658,6 +7801,13 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
/* Process a latched INIT or SMI, if any. */
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * Even if KVM_SET_SREGS2 loaded PDPTRs out of band,
+ * on SMM exit we still need to reload them from
+ * guest memory
+ */
+ vcpu->arch.pdptrs_from_userspace = false;
}
kvm_mmu_reset_context(vcpu);
@@ -8107,14 +8257,13 @@ static void tsc_khz_changed(void *data)
static void kvm_hyperv_tsc_notifier(void)
{
struct kvm *kvm;
- struct kvm_vcpu *vcpu;
int cpu;
- unsigned long flags;
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_make_mclock_inprogress_request(kvm);
+ /* no guest entries from this point */
hyperv_stop_tsc_emulation();
/* TSC frequency always matches when on Hyper-V */
@@ -8123,18 +8272,11 @@ static void kvm_hyperv_tsc_notifier(void)
kvm_max_guest_tsc_khz = tsc_khz;
list_for_each_entry(kvm, &vm_list, vm_list) {
- struct kvm_arch *ka = &kvm->arch;
-
- spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
+ __kvm_start_pvclock_update(kvm);
pvclock_update_vm_gtod_copy(kvm);
- spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
-
- kvm_for_each_vcpu(cpu, vcpu, kvm)
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-
- kvm_for_each_vcpu(cpu, vcpu, kvm)
- kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
+ kvm_end_pvclock_update(kvm);
}
+
mutex_unlock(&kvm_lock);
}
#endif
@@ -8375,18 +8517,20 @@ int kvm_arch_init(void *opaque)
int r;
if (kvm_x86_ops.hardware_enable) {
- printk(KERN_ERR "kvm: already loaded the other module\n");
+ pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
r = -EEXIST;
goto out;
}
if (!ops->cpu_has_kvm_support()) {
- pr_err_ratelimited("kvm: no hardware support\n");
+ pr_err_ratelimited("kvm: no hardware support for '%s'\n",
+ ops->runtime_ops->name);
r = -EOPNOTSUPP;
goto out;
}
if (ops->disabled_by_bios()) {
- pr_err_ratelimited("kvm: disabled by bios\n");
+ pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
+ ops->runtime_ops->name);
r = -EOPNOTSUPP;
goto out;
}
@@ -8403,18 +8547,11 @@ int kvm_arch_init(void *opaque)
}
r = -ENOMEM;
- x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
- __alignof__(struct fpu), SLAB_ACCOUNT,
- NULL);
- if (!x86_fpu_cache) {
- printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
- goto out;
- }
x86_emulator_cache = kvm_alloc_emulator_cache();
if (!x86_emulator_cache) {
pr_err("kvm: failed to allocate cache for x86 emulator\n");
- goto out_free_x86_fpu_cache;
+ goto out;
}
user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
@@ -8452,8 +8589,6 @@ out_free_percpu:
free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
kmem_cache_destroy(x86_emulator_cache);
-out_free_x86_fpu_cache:
- kmem_cache_destroy(x86_fpu_cache);
out:
return r;
}
@@ -8480,7 +8615,6 @@ void kvm_arch_exit(void)
kvm_mmu_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
- kmem_cache_destroy(x86_fpu_cache);
#ifdef CONFIG_KVM_XEN
static_key_deferred_flush(&kvm_xen_enabled);
WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
@@ -8581,7 +8715,7 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated);
static void kvm_apicv_init(struct kvm *kvm)
{
- mutex_init(&kvm->arch.apicv_update_lock);
+ init_rwsem(&kvm->arch.apicv_update_lock);
if (enable_apicv)
clear_bit(APICV_INHIBIT_REASON_DISABLE,
@@ -8769,9 +8903,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
+
+ /*
+ * The call to kvm_ready_for_interrupt_injection() may end up in
+ * kvm_xen_has_interrupt() which may require the srcu lock to be
+ * held, to protect against changes in the vcpu_info address.
+ */
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
kvm_run->ready_for_interrupt_injection =
pic_in_kernel(vcpu->kvm) ||
kvm_vcpu_ready_for_interrupt_injection(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (is_smm(vcpu))
kvm_run->flags |= KVM_RUN_X86_SMM;
@@ -9228,14 +9370,7 @@ static void process_smi(struct kvm_vcpu *vcpu)
void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
unsigned long *vcpu_bitmap)
{
- cpumask_var_t cpus;
-
- zalloc_cpumask_var(&cpus, GFP_ATOMIC);
-
- kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
- NULL, vcpu_bitmap, cpus);
-
- free_cpumask_var(cpus);
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap);
}
void kvm_make_scan_ioapic_request(struct kvm *kvm)
@@ -9250,7 +9385,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
if (!lapic_in_kernel(vcpu))
return;
- mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+ down_read(&vcpu->kvm->arch.apicv_update_lock);
activate = kvm_apicv_activated(vcpu->kvm);
if (vcpu->arch.apicv_active == activate)
@@ -9270,7 +9405,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
out:
- mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
+ up_read(&vcpu->kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
@@ -9278,6 +9413,8 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
{
unsigned long old, new;
+ lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
+
if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
!static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
return;
@@ -9291,6 +9428,18 @@ void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
if (!!old != !!new) {
trace_kvm_apicv_update_request(activate, bit);
+ /*
+ * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
+ * false positives in the sanity check WARN in svm_vcpu_run().
+ * This task will wait for all vCPUs to ack the kick IRQ before
+ * updating apicv_inhibit_reasons, and all other vCPUs will
+ * block on acquiring apicv_update_lock so that vCPUs can't
+ * redo svm_vcpu_run() without seeing the new inhibit state.
+ *
+ * Note, holding apicv_update_lock and taking it in the read
+ * side (handling the request) also prevents other vCPUs from
+ * servicing the request with a stale apicv_inhibit_reasons.
+ */
kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
kvm->arch.apicv_inhibit_reasons = new;
if (new) {
@@ -9304,9 +9453,9 @@ EXPORT_SYMBOL_GPL(__kvm_request_apicv_update);
void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
{
- mutex_lock(&kvm->arch.apicv_update_lock);
+ down_write(&kvm->arch.apicv_update_lock);
__kvm_request_apicv_update(kvm, activate, bit);
- mutex_unlock(&kvm->arch.apicv_update_lock);
+ up_write(&kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
@@ -9418,7 +9567,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
- kvm_gen_update_masterclock(vcpu->kvm);
+ kvm_update_masterclock(vcpu->kvm);
if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
kvm_gen_kvmclock_update(vcpu);
if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
@@ -9625,18 +9774,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
for (;;) {
+ /*
+ * Assert that vCPU vs. VM APICv state is consistent. An APICv
+ * update must kick and wait for all vCPUs before toggling the
+ * per-VM state, and responsing vCPUs must wait for the update
+ * to complete before servicing KVM_REQ_APICV_UPDATE.
+ */
+ WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+
exit_fastpath = static_call(kvm_x86_run)(vcpu);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
- if (unlikely(kvm_vcpu_exit_request(vcpu))) {
+ if (vcpu->arch.apicv_active)
+ static_call(kvm_x86_sync_pir_to_irr)(vcpu);
+
+ if (unlikely(kvm_vcpu_exit_request(vcpu))) {
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
break;
}
-
- if (vcpu->arch.apicv_active)
- static_call(kvm_x86_sync_pir_to_irr)(vcpu);
- }
+ }
/*
* Do this here before restoring debug registers on the host. And
@@ -9899,58 +10056,21 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
return 0;
}
-static void kvm_save_current_fpu(struct fpu *fpu)
-{
- /*
- * If the target FPU state is not resident in the CPU registers, just
- * memcpy() from current, else save CPU state directly to the target.
- */
- if (test_thread_flag(TIF_NEED_FPU_LOAD))
- memcpy(&fpu->state, &current->thread.fpu.state,
- fpu_kernel_xstate_size);
- else
- save_fpregs_to_fpstate(fpu);
-}
-
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
- fpregs_lock();
-
- kvm_save_current_fpu(vcpu->arch.user_fpu);
-
/*
- * Guests with protected state can't have it set by the hypervisor,
- * so skip trying to set it.
+ * Exclude PKRU from restore as restored separately in
+ * kvm_x86_ops.run().
*/
- if (vcpu->arch.guest_fpu)
- /* PKRU is separately restored in kvm_x86_ops.run. */
- __restore_fpregs_from_fpstate(&vcpu->arch.guest_fpu->state,
- ~XFEATURE_MASK_PKRU);
-
- fpregs_mark_activate();
- fpregs_unlock();
-
+ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
trace_kvm_fpu(1);
}
/* When vcpu_run ends, restore user space FPU context. */
static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
- fpregs_lock();
-
- /*
- * Guests with protected state can't have it read by the hypervisor,
- * so skip trying to save it.
- */
- if (vcpu->arch.guest_fpu)
- kvm_save_current_fpu(vcpu->arch.guest_fpu);
-
- restore_fpregs_from_fpstate(&vcpu->arch.user_fpu->state);
-
- fpregs_mark_activate();
- fpregs_unlock();
-
+ fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
++vcpu->stat.fpu_reload;
trace_kvm_fpu(0);
}
@@ -10531,12 +10651,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
struct fxregs_state *fxsave;
- if (!vcpu->arch.guest_fpu)
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return 0;
vcpu_load(vcpu);
- fxsave = &vcpu->arch.guest_fpu->state.fxsave;
+ fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
fpu->fsw = fxsave->swd;
@@ -10554,12 +10674,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
struct fxregs_state *fxsave;
- if (!vcpu->arch.guest_fpu)
+ if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
return 0;
vcpu_load(vcpu);
- fxsave = &vcpu->arch.guest_fpu->state.fxsave;
+ fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -10610,33 +10730,6 @@ static int sync_regs(struct kvm_vcpu *vcpu)
return 0;
}
-static void fx_init(struct kvm_vcpu *vcpu)
-{
- if (!vcpu->arch.guest_fpu)
- return;
-
- fpstate_init(&vcpu->arch.guest_fpu->state);
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
- host_xcr0 | XSTATE_COMPACTION_ENABLED;
-
- /*
- * Ensure guest xcr0 is valid for loading
- */
- vcpu->arch.xcr0 = XFEATURE_MASK_FP;
-
- vcpu->arch.cr0 |= X86_CR0_ET;
-}
-
-void kvm_free_guest_fpu(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.guest_fpu) {
- kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
- vcpu->arch.guest_fpu = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(kvm_free_guest_fpu);
-
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
@@ -10652,6 +10745,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
int r;
vcpu->arch.last_vmentry_cpu = -1;
+ vcpu->arch.regs_avail = ~0;
+ vcpu->arch.regs_dirty = ~0;
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -10691,20 +10786,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
if (!alloc_emulate_ctxt(vcpu))
goto free_wbinvd_dirty_mask;
- vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!vcpu->arch.user_fpu) {
- pr_err("kvm: failed to allocate userspace's fpu\n");
- goto free_emulate_ctxt;
- }
-
- vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!vcpu->arch.guest_fpu) {
+ if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
pr_err("kvm: failed to allocate vcpu's fpu\n");
- goto free_user_fpu;
+ goto free_emulate_ctxt;
}
- fx_init(vcpu);
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
@@ -10736,9 +10821,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
return 0;
free_guest_fpu:
- kvm_free_guest_fpu(vcpu);
-free_user_fpu:
- kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+ fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
free_emulate_ctxt:
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_wbinvd_dirty_mask:
@@ -10787,8 +10870,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
- kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
- kvm_free_guest_fpu(vcpu);
+ fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
kvm_hv_vcpu_uninit(vcpu);
kvm_pmu_destroy(vcpu);
@@ -10805,9 +10887,19 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
+ struct kvm_cpuid_entry2 *cpuid_0x1;
unsigned long old_cr0 = kvm_read_cr0(vcpu);
unsigned long new_cr0;
- u32 eax, dummy;
+
+ /*
+ * Several of the "set" flows, e.g. ->set_cr0(), read other registers
+ * to handle side effects. RESET emulation hits those flows and relies
+ * on emulated/virtualized registers, including those that are loaded
+ * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel
+ * to detect improper or missing initialization.
+ */
+ WARN_ON_ONCE(!init_event &&
+ (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
kvm_lapic_reset(vcpu, init_event);
@@ -10840,8 +10932,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
- if (vcpu->arch.guest_fpu && kvm_mpx_supported()) {
- void *mpx_state_buffer;
+ if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) {
+ struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
/*
* To avoid have the INIT path from kvm_apic_has_events() that be
@@ -10849,14 +10941,10 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
*/
if (init_event)
kvm_put_guest_fpu(vcpu);
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
- XFEATURE_BNDREGS);
- if (mpx_state_buffer)
- memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
- XFEATURE_BNDCSR);
- if (mpx_state_buffer)
- memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+
+ fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS);
+ fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR);
+
if (init_event)
kvm_load_guest_fpu(vcpu);
}
@@ -10870,21 +10958,19 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.xcr0 = XFEATURE_MASK_FP;
}
+ /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
- vcpu->arch.regs_avail = ~0;
- vcpu->arch.regs_dirty = ~0;
+ kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP);
/*
* Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
* if no CPUID match is found. Note, it's impossible to get a match at
* RESET since KVM emulates RESET before exposing the vCPU to userspace,
- * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET.
- * But, go through the motions in case that's ever remedied.
+ * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
+ * on RESET. But, go through the motions in case that's ever remedied.
*/
- eax = 1;
- if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true))
- eax = 0x600;
- kvm_rdx_write(vcpu, eax);
+ cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
+ kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
vcpu->arch.ia32_xss = 0;
@@ -10893,6 +10979,9 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0xfff0);
+ vcpu->arch.cr3 = 0;
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
+
/*
* CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions
* of Intel's SDM list CD/NW as being set on INIT, but they contradict
@@ -11133,15 +11222,22 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
void kvm_arch_free_vm(struct kvm *kvm)
{
kfree(to_kvm_hv(kvm)->hv_pa_pg);
- vfree(kvm);
+ __kvm_arch_free_vm(kvm);
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
+ int ret;
+ unsigned long flags;
+
if (type)
return -EINVAL;
+ ret = kvm_page_track_init(kvm);
+ if (ret)
+ return ret;
+
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
@@ -11157,10 +11253,12 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
- spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
-
+ seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
+
+ raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
pvclock_update_vm_gtod_copy(kvm);
+ raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
kvm->arch.guest_can_read_msr_platform_info = true;
@@ -11174,7 +11272,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_apicv_init(kvm);
kvm_hv_init_vm(kvm);
- kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
kvm_xen_init_vm(kvm);
@@ -11358,8 +11455,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
kvm_page_track_free_memslot(slot);
}
-static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
- unsigned long npages)
+int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
{
const int sz = sizeof(*slot->arch.rmap[0]);
int i;
@@ -11368,7 +11464,8 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
int level = i + 1;
int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
- WARN_ON(slot->arch.rmap[i]);
+ if (slot->arch.rmap[i])
+ continue;
slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
if (!slot->arch.rmap[i]) {
@@ -11380,50 +11477,6 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
return 0;
}
-int alloc_all_memslots_rmaps(struct kvm *kvm)
-{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
- int r, i;
-
- /*
- * Check if memslots alreday have rmaps early before acquiring
- * the slots_arch_lock below.
- */
- if (kvm_memslots_have_rmaps(kvm))
- return 0;
-
- mutex_lock(&kvm->slots_arch_lock);
-
- /*
- * Read memslots_have_rmaps again, under the slots arch lock,
- * before allocating the rmaps
- */
- if (kvm_memslots_have_rmaps(kvm)) {
- mutex_unlock(&kvm->slots_arch_lock);
- return 0;
- }
-
- for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
- slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(slot, slots) {
- r = memslot_rmap_alloc(slot, slot->npages);
- if (r) {
- mutex_unlock(&kvm->slots_arch_lock);
- return r;
- }
- }
- }
-
- /*
- * Ensure that memslots_have_rmaps becomes true strictly after
- * all the rmap pointers are set.
- */
- smp_store_release(&kvm->arch.memslots_have_rmaps, true);
- mutex_unlock(&kvm->slots_arch_lock);
- return 0;
-}
-
static int kvm_alloc_memslot_metadata(struct kvm *kvm,
struct kvm_memory_slot *slot,
unsigned long npages)
@@ -11474,7 +11527,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
}
}
- if (kvm_page_track_create_memslot(slot, npages))
+ if (kvm_page_track_create_memslot(kvm, slot, npages))
goto out_free;
return 0;
@@ -12072,6 +12125,15 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
return static_call(kvm_x86_update_pi_irte)(kvm, host_irq, guest_irq, set);
}
+bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
+ struct kvm_kernel_irq_routing_entry *new)
+{
+ if (new->type != KVM_IRQ_ROUTING_MSI)
+ return true;
+
+ return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
+}
+
bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
@@ -12153,9 +12215,7 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
* doesn't seem to be a real use-case behind such requests, just return
* KVM_EXIT_INTERNAL_ERROR for now.
*/
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -12343,44 +12403,81 @@ int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
}
EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
-static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port);
+
+static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
{
- memcpy(vcpu->arch.guest_ins_data, vcpu->arch.pio_data,
- vcpu->arch.pio.count * vcpu->arch.pio.size);
- vcpu->arch.pio.count = 0;
+ int size = vcpu->arch.pio.size;
+ int port = vcpu->arch.pio.port;
+ vcpu->arch.pio.count = 0;
+ if (vcpu->arch.sev_pio_count)
+ return kvm_sev_es_outs(vcpu, size, port);
return 1;
}
static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
- unsigned int port, void *data, unsigned int count)
+ unsigned int port)
{
- int ret;
-
- ret = emulator_pio_out_emulated(vcpu->arch.emulate_ctxt, size, port,
- data, count);
- if (ret)
- return ret;
+ for (;;) {
+ unsigned int count =
+ min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+ int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
+
+ /* memcpy done already by emulator_pio_out. */
+ vcpu->arch.sev_pio_count -= count;
+ vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+ if (!ret)
+ break;
- vcpu->arch.pio.count = 0;
+ /* Emulation done by the kernel. */
+ if (!vcpu->arch.sev_pio_count)
+ return 1;
+ }
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
return 0;
}
static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
- unsigned int port, void *data, unsigned int count)
+ unsigned int port);
+
+static void advance_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
{
- int ret;
+ unsigned count = vcpu->arch.pio.count;
+ complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
+ vcpu->arch.sev_pio_count -= count;
+ vcpu->arch.sev_pio_data += count * vcpu->arch.pio.size;
+}
- ret = emulator_pio_in_emulated(vcpu->arch.emulate_ctxt, size, port,
- data, count);
- if (ret) {
- vcpu->arch.pio.count = 0;
- } else {
- vcpu->arch.guest_ins_data = data;
- vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
+static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
+{
+ int size = vcpu->arch.pio.size;
+ int port = vcpu->arch.pio.port;
+
+ advance_sev_es_emulated_ins(vcpu);
+ if (vcpu->arch.sev_pio_count)
+ return kvm_sev_es_ins(vcpu, size, port);
+ return 1;
+}
+
+static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
+ unsigned int port)
+{
+ for (;;) {
+ unsigned int count =
+ min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
+ if (!__emulator_pio_in(vcpu, size, port, count))
+ break;
+
+ /* Emulation done by the kernel. */
+ advance_sev_es_emulated_ins(vcpu);
+ if (!vcpu->arch.sev_pio_count)
+ return 1;
}
+ vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
return 0;
}
@@ -12388,8 +12485,10 @@ int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
unsigned int port, void *data, unsigned int count,
int in)
{
- return in ? kvm_sev_es_ins(vcpu, size, port, data, count)
- : kvm_sev_es_outs(vcpu, size, port, data, count);
+ vcpu->arch.sev_pio_data = data;
+ vcpu->arch.sev_pio_count = count;
+ return in ? kvm_sev_es_ins(vcpu, size, port)
+ : kvm_sev_es_outs(vcpu, size, port);
}
EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 7d66d63dc55a..ea264c4502e4 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -343,8 +343,6 @@ extern bool enable_vmware_backdoor;
extern int pi_inject_timer;
-extern struct static_key kvm_no_apic_vcpu;
-
extern bool report_ignored_msrs;
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 9ea9c3dabe37..8f62baebd028 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -190,6 +190,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
{
+ int err;
u8 rc = 0;
/*
@@ -216,13 +217,29 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
if (likely(slots->generation == ghc->generation &&
!kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
/* Fast path */
- __get_user(rc, (u8 __user *)ghc->hva + offset);
- } else {
- /* Slow path */
- kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
- sizeof(rc));
+ pagefault_disable();
+ err = __get_user(rc, (u8 __user *)ghc->hva + offset);
+ pagefault_enable();
+ if (!err)
+ return rc;
}
+ /* Slow path */
+
+ /*
+ * This function gets called from kvm_vcpu_block() after setting the
+ * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+ * from a HLT. So we really mustn't sleep. If the page ended up absent
+ * at that point, just return 1 in order to trigger an immediate wake,
+ * and we'll end up getting called again from a context where we *can*
+ * fault in the page and wait for it.
+ */
+ if (in_atomic() || !task_is_running(current))
+ return 1;
+
+ kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
+ sizeof(rc));
+
return rc;
}