summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig13
-rw-r--r--arch/x86/kvm/Makefile9
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/mmu/mmu.c270
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h28
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h2
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h14
-rw-r--r--arch/x86/kvm/mmu/spte.c40
-rw-r--r--arch/x86/kvm/mmu/spte.h26
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c64
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h1
-rw-r--r--arch/x86/kvm/svm/sev.c343
-rw-r--r--arch/x86/kvm/svm/svm.c36
-rw-r--r--arch/x86/kvm/svm/svm.h56
-rw-r--r--arch/x86/kvm/vmx/main.c166
-rw-r--r--arch/x86/kvm/vmx/vmcs.h5
-rw-r--r--arch/x86/kvm/vmx/vmx.c438
-rw-r--r--arch/x86/kvm/vmx/vmx.h6
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h124
-rw-r--r--arch/x86/kvm/x86.c234
-rw-r--r--arch/x86/kvm/x86.h2
22 files changed, 1180 insertions, 704 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 0ebdd088f28b..d64fb2b3eb69 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -95,6 +95,19 @@ config KVM_INTEL
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config KVM_INTEL_PROVE_VE
+ bool "Check that guests do not receive #VE exceptions"
+ default KVM_PROVE_MMU || DEBUG_KERNEL
+ depends on KVM_INTEL
+ help
+
+ Checks that KVM's page table management code will not incorrectly
+ let guests receive a virtualization exception. Virtualization
+ exceptions will be trapped by the hypervisor rather than injected
+ in the guest.
+
+ If unsure, say N.
+
config X86_SGX_KVM
bool "Software Guard eXtensions (SGX) Virtualization"
depends on X86_SGX && KVM_INTEL
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index addc44fc7187..5494669a055a 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -16,14 +16,15 @@ kvm-$(CONFIG_KVM_XEN) += xen.o
kvm-$(CONFIG_KVM_SMM) += smm.o
kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \
- vmx/nested.o vmx/posted_intr.o
+ vmx/nested.o vmx/posted_intr.o vmx/main.o
kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o
kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o
-kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o \
- svm/sev.o
-kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o
+kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o
+
+kvm-amd-$(CONFIG_KVM_AMD_SEV) += svm/sev.o
+kvm-amd-$(CONFIG_KVM_HYPERV) += svm/hyperv.o
ifdef CONFIG_HYPERV
kvm-y += kvm_onhyperv.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 77352a4abd87..1851b3870a9c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -772,7 +772,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);
kvm_cpu_cap_mask(CPUID_8000_001F_EAX,
- 0 /* SME */ | F(SEV) | 0 /* VM_PAGE_FLUSH */ | F(SEV_ES) |
+ 0 /* SME */ | 0 /* SEV */ | 0 /* VM_PAGE_FLUSH */ | 0 /* SEV_ES */ |
F(SME_COHERENT));
kvm_cpu_cap_mask(CPUID_8000_0021_EAX,
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 60f21bb4c27b..2343c9f00e31 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -213,7 +213,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
*/
u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
- int index = (pfec + (not_smap << PFERR_RSVD_BIT)) >> 1;
+ int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1;
u32 errcode = PFERR_PRESENT_MASK;
bool fault;
@@ -234,8 +234,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
- offset = (pfec & ~1) +
- ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
+ offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0);
pkru_bits &= mmu->pkru_mask >> offset;
errcode |= -pkru_bits & PFERR_PK_MASK;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index db007a4dffa2..c39674d19260 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -432,8 +432,8 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
* The idea using the light way get the spte on x86_32 guest is from
* gup_get_pte (mm/gup.c).
*
- * An spte tlb flush may be pending, because kvm_set_pte_rmap
- * coalesces them and we are running out of the MMU lock. Therefore
+ * An spte tlb flush may be pending, because they are coalesced and
+ * we are running out of the MMU lock. Therefore
* we need to protect against in-progress updates of the spte.
*
* Reading the spte while an update is in progress may get the old value
@@ -567,9 +567,9 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
if (!is_shadow_present_pte(old_spte) ||
!spte_has_volatile_bits(old_spte))
- __update_clear_spte_fast(sptep, 0ull);
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
else
- old_spte = __update_clear_spte_slow(sptep, 0ull);
+ old_spte = __update_clear_spte_slow(sptep, SHADOW_NONPRESENT_VALUE);
if (!is_shadow_present_pte(old_spte))
return old_spte;
@@ -603,7 +603,7 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
*/
static void mmu_spte_clear_no_track(u64 *sptep)
{
- __update_clear_spte_fast(sptep, 0ull);
+ __update_clear_spte_fast(sptep, SHADOW_NONPRESENT_VALUE);
}
static u64 mmu_spte_get_lockless(u64 *sptep)
@@ -1448,49 +1448,11 @@ static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
}
static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t unused)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level)
{
return __kvm_zap_rmap(kvm, rmap_head, slot);
}
-static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t pte)
-{
- u64 *sptep;
- struct rmap_iterator iter;
- bool need_flush = false;
- u64 new_spte;
- kvm_pfn_t new_pfn;
-
- WARN_ON_ONCE(pte_huge(pte));
- new_pfn = pte_pfn(pte);
-
-restart:
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- need_flush = true;
-
- if (pte_write(pte)) {
- kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
- goto restart;
- } else {
- new_spte = kvm_mmu_changed_pte_notifier_make_spte(
- *sptep, new_pfn);
-
- mmu_spte_clear_track_bits(kvm, sptep);
- mmu_spte_set(sptep, new_spte);
- }
- }
-
- if (need_flush && kvm_available_flush_remote_tlbs_range()) {
- kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
- return false;
- }
-
- return need_flush;
-}
-
struct slot_rmap_walk_iterator {
/* input fields. */
const struct kvm_memory_slot *slot;
@@ -1562,7 +1524,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn,
- int level, pte_t pte);
+ int level);
static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
struct kvm_gfn_range *range,
@@ -1574,7 +1536,7 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
range->start, range->end - 1, &iterator)
ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
- iterator.level, range->arg.pte);
+ iterator.level);
return ret;
}
@@ -1596,22 +1558,8 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
return flush;
}
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- bool flush = false;
-
- if (kvm_memslots_have_rmaps(kvm))
- flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
-
- if (tdp_mmu_enabled)
- flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
-
- return flush;
-}
-
static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- pte_t unused)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1624,8 +1572,7 @@ static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
}
static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level, pte_t unused)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1950,7 +1897,8 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
static int kvm_sync_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i)
{
- if (!sp->spt[i])
+ /* sp->spt[i] has initial value of shadow page table allocation */
+ if (sp->spt[i] == SHADOW_NONPRESENT_VALUE)
return 0;
return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
@@ -2514,7 +2462,7 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
return kvm_mmu_prepare_zap_page(kvm, child,
invalid_list);
}
- } else if (is_mmio_spte(pte)) {
+ } else if (is_mmio_spte(kvm, pte)) {
mmu_spte_clear_no_track(spte);
}
return 0;
@@ -3314,9 +3262,19 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
{
gva_t gva = fault->is_tdp ? 0 : fault->addr;
+ if (fault->is_private) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
access & shadow_mmio_access_mask);
+ fault->slot = NULL;
+ fault->pfn = KVM_PFN_NOSLOT;
+ fault->map_writable = false;
+ fault->hva = KVM_HVA_ERR_BAD;
+
/*
* If MMIO caching is disabled, emulate immediately without
* touching the shadow page tables as attempting to install an
@@ -4196,7 +4154,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
if (WARN_ON_ONCE(reserved))
return -EINVAL;
- if (is_mmio_spte(spte)) {
+ if (is_mmio_spte(vcpu->kvm, spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned int access = get_mmio_spte_access(spte);
@@ -4259,24 +4217,28 @@ static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
return (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
}
-static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- gfn_t gfn)
+static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
{
struct kvm_arch_async_pf arch;
arch.token = alloc_apf_token(vcpu);
- arch.gfn = gfn;
+ arch.gfn = fault->gfn;
+ arch.error_code = fault->error_code;
arch.direct_map = vcpu->arch.mmu->root_role.direct;
arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu);
- return kvm_setup_async_pf(vcpu, cr2_or_gpa,
- kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+ return kvm_setup_async_pf(vcpu, fault->addr,
+ kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
}
void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
int r;
+ if (WARN_ON_ONCE(work->arch.error_code & PFERR_PRIVATE_ACCESS))
+ return;
+
if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
work->wakeup_all)
return;
@@ -4289,7 +4251,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu))
return;
- kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL);
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code, true, NULL);
}
static inline u8 kvm_max_level_for_order(int order)
@@ -4309,14 +4271,6 @@ static inline u8 kvm_max_level_for_order(int order)
return PG_LEVEL_4K;
}
-static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault)
-{
- kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
- PAGE_SIZE, fault->write, fault->exec,
- fault->is_private);
-}
-
static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
struct kvm_page_fault *fault)
{
@@ -4343,48 +4297,15 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- struct kvm_memory_slot *slot = fault->slot;
bool async;
- /*
- * Retry the page fault if the gfn hit a memslot that is being deleted
- * or moved. This ensures any existing SPTEs for the old memslot will
- * be zapped before KVM inserts a new MMIO SPTE for the gfn.
- */
- if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
- return RET_PF_RETRY;
-
- if (!kvm_is_visible_memslot(slot)) {
- /* Don't expose private memslots to L2. */
- if (is_guest_mode(vcpu)) {
- fault->slot = NULL;
- fault->pfn = KVM_PFN_NOSLOT;
- fault->map_writable = false;
- return RET_PF_CONTINUE;
- }
- /*
- * If the APIC access page exists but is disabled, go directly
- * to emulation without caching the MMIO access or creating a
- * MMIO SPTE. That way the cache doesn't need to be purged
- * when the AVIC is re-enabled.
- */
- if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
- !kvm_apicv_activated(vcpu->kvm))
- return RET_PF_EMULATE;
- }
-
- if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
- kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
- return -EFAULT;
- }
-
if (fault->is_private)
return kvm_faultin_pfn_private(vcpu, fault);
async = false;
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async,
- fault->write, &fault->map_writable,
- &fault->hva);
+ fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false,
+ &async, fault->write,
+ &fault->map_writable, &fault->hva);
if (!async)
return RET_PF_CONTINUE; /* *pfn has correct page already */
@@ -4394,7 +4315,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
trace_kvm_async_pf_repeated_fault(fault->addr, fault->gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
return RET_PF_RETRY;
- } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) {
+ } else if (kvm_arch_setup_async_pf(vcpu, fault)) {
return RET_PF_RETRY;
}
}
@@ -4404,17 +4325,72 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
* to wait for IO. Note, gup always bails if it is unable to quickly
* get a page and a fatal signal, i.e. SIGKILL, is pending.
*/
- fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, true, NULL,
- fault->write, &fault->map_writable,
- &fault->hva);
+ fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true,
+ NULL, fault->write,
+ &fault->map_writable, &fault->hva);
return RET_PF_CONTINUE;
}
static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
unsigned int access)
{
+ struct kvm_memory_slot *slot = fault->slot;
int ret;
+ /*
+ * Note that the mmu_invalidate_seq also serves to detect a concurrent
+ * change in attributes. is_page_fault_stale() will detect an
+ * invalidation relate to fault->fn and resume the guest without
+ * installing a mapping in the page tables.
+ */
+ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ /*
+ * Now that we have a snapshot of mmu_invalidate_seq we can check for a
+ * private vs. shared mismatch.
+ */
+ if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) {
+ kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+ return -EFAULT;
+ }
+
+ if (unlikely(!slot))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * Retry the page fault if the gfn hit a memslot that is being deleted
+ * or moved. This ensures any existing SPTEs for the old memslot will
+ * be zapped before KVM inserts a new MMIO SPTE for the gfn.
+ */
+ if (slot->flags & KVM_MEMSLOT_INVALID)
+ return RET_PF_RETRY;
+
+ if (slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) {
+ /*
+ * Don't map L1's APIC access page into L2, KVM doesn't support
+ * using APICv/AVIC to accelerate L2 accesses to L1's APIC,
+ * i.e. the access needs to be emulated. Emulating access to
+ * L1's APIC is also correct if L1 is accelerating L2's own
+ * virtual APIC, but for some reason L1 also maps _L1's_ APIC
+ * into L2. Note, vcpu_is_mmio_gpa() always treats access to
+ * the APIC as MMIO. Allow an MMIO SPTE to be created, as KVM
+ * uses different roots for L1 vs. L2, i.e. there is no danger
+ * of breaking APICv/AVIC for L1.
+ */
+ if (is_guest_mode(vcpu))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ /*
+ * If the APIC access page exists but is disabled, go directly
+ * to emulation without caching the MMIO access or creating a
+ * MMIO SPTE. That way the cache doesn't need to be purged
+ * when the AVIC is re-enabled.
+ */
+ if (!kvm_apicv_activated(vcpu->kvm))
+ return RET_PF_EMULATE;
+ }
+
fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
smp_rmb();
@@ -4439,8 +4415,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
* *guaranteed* to need to retry, i.e. waiting until mmu_lock is held
* to detect retry guarantees the worst case latency for the vCPU.
*/
- if (fault->slot &&
- mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
+ if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn))
return RET_PF_RETRY;
ret = __kvm_faultin_pfn(vcpu, fault);
@@ -4450,7 +4425,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (unlikely(is_error_pfn(fault->pfn)))
return kvm_handle_error_pfn(vcpu, fault);
- if (unlikely(!fault->slot))
+ if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn)))
return kvm_handle_noslot_fault(vcpu, fault, access);
/*
@@ -4561,6 +4536,16 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
if (WARN_ON_ONCE(fault_address >> 32))
return -EFAULT;
#endif
+ /*
+ * Legacy #PF exception only have a 32-bit error code. Simply drop the
+ * upper bits as KVM doesn't use them for #PF (because they are never
+ * set), and to ensure there are no collisions with KVM-defined bits.
+ */
+ if (WARN_ON_ONCE(error_code >> 32))
+ error_code = lower_32_bits(error_code);
+
+ /* Ensure the above sanity check also covers KVM-defined flags. */
+ BUILD_BUG_ON(lower_32_bits(PFERR_SYNTHETIC_MASK));
vcpu->arch.l1tf_flush_l1d = true;
if (!flags) {
@@ -4812,7 +4797,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
unsigned int access)
{
- if (unlikely(is_mmio_spte(*sptep))) {
+ if (unlikely(is_mmio_spte(vcpu->kvm, *sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
mmu_spte_clear_no_track(sptep);
return true;
@@ -5846,30 +5831,35 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
int r, emulation_type = EMULTYPE_PF;
bool direct = vcpu->arch.mmu->root_role.direct;
- /*
- * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
- * checks when emulating instructions that triggers implicit access.
- * WARN if hardware generates a fault with an error code that collides
- * with the KVM-defined value. Clear the flag and continue on, i.e.
- * don't terminate the VM, as KVM can't possibly be relying on a flag
- * that KVM doesn't know about.
- */
- if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
- error_code &= ~PFERR_IMPLICIT_ACCESS;
-
if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
+ /*
+ * Except for reserved faults (emulated MMIO is shared-only), set the
+ * PFERR_PRIVATE_ACCESS flag for software-protected VMs based on the gfn's
+ * current attributes, which are the source of truth for such VMs. Note,
+ * this wrong for nested MMUs as the GPA is an L2 GPA, but KVM doesn't
+ * currently supported nested virtualization (among many other things)
+ * for software-protected VMs.
+ */
+ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
+ !(error_code & PFERR_RSVD_MASK) &&
+ vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM &&
+ kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)))
+ error_code |= PFERR_PRIVATE_ACCESS;
+
r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ if (WARN_ON_ONCE(error_code & PFERR_PRIVATE_ACCESS))
+ return -EFAULT;
+
r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
if (r == RET_PF_EMULATE)
goto emulate;
}
if (r == RET_PF_INVALID) {
- r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
- lower_32_bits(error_code), false,
+ r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, error_code, false,
&emulation_type);
if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
@@ -6173,7 +6163,10 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
- vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
+ vcpu->arch.mmu_shadow_page_cache.init_value =
+ SHADOW_NONPRESENT_VALUE;
+ if (!vcpu->arch.mmu_shadow_page_cache.init_value)
+ vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
vcpu->arch.mmu = &vcpu->arch.root_mmu;
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
@@ -6316,6 +6309,7 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
void kvm_mmu_init_vm(struct kvm *kvm)
{
+ kvm->arch.shadow_mmio_value = shadow_mmio_value;
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 5390a591a571..ce2fcd19ba6b 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -190,7 +190,7 @@ static inline bool is_nx_huge_page_enabled(struct kvm *kvm)
struct kvm_page_fault {
/* arguments to kvm_mmu_do_page_fault. */
const gpa_t addr;
- const u32 error_code;
+ const u64 error_code;
const bool prefetch;
/* Derived from error_code. */
@@ -279,8 +279,16 @@ enum {
RET_PF_SPURIOUS,
};
+static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
+ PAGE_SIZE, fault->write, fault->exec,
+ fault->is_private);
+}
+
static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 err, bool prefetch, int *emulation_type)
+ u64 err, bool prefetch, int *emulation_type)
{
struct kvm_page_fault fault = {
.addr = cr2_or_gpa,
@@ -298,7 +306,10 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
.max_level = KVM_MAX_HUGEPAGE_LEVEL,
.req_level = PG_LEVEL_4K,
.goal_level = PG_LEVEL_4K,
- .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT),
+ .is_private = err & PFERR_PRIVATE_ACCESS,
+
+ .pfn = KVM_PFN_ERR_FAULT,
+ .hva = KVM_HVA_ERR_BAD,
};
int r;
@@ -320,6 +331,17 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
else
r = vcpu->arch.mmu->page_fault(vcpu, &fault);
+ /*
+ * Not sure what's happening, but punt to userspace and hope that
+ * they can fix it by changing memory to shared, or they can
+ * provide a better error.
+ */
+ if (r == RET_PF_EMULATE && fault.is_private) {
+ pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
+ kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
+ return -EFAULT;
+ }
+
if (fault.write_fault_to_shadow_pgtable && emulation_type)
*emulation_type |= EMULTYPE_WRITE_PF_TO_SP;
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index ae86820cef69..195d98bc8de8 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -260,7 +260,7 @@ TRACE_EVENT(
TP_STRUCT__entry(
__field(int, vcpu_id)
__field(gpa_t, cr2_or_gpa)
- __field(u32, error_code)
+ __field(u64, error_code)
__field(u64 *, sptep)
__field(u64, old_spte)
__field(u64, new_spte)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 4d4e98fe4f35..9aac3aa93d88 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -911,7 +911,7 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
gpa_t pte_gpa;
gfn_t gfn;
- if (WARN_ON_ONCE(!sp->spt[i]))
+ if (WARN_ON_ONCE(sp->spt[i] == SHADOW_NONPRESENT_VALUE))
return 0;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
@@ -933,13 +933,13 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
return 0;
/*
- * Drop the SPTE if the new protections would result in a RWX=0
- * SPTE or if the gfn is changing. The RWX=0 case only affects
- * EPT with execute-only support, i.e. EPT without an effective
- * "present" bit, as all other paging modes will create a
- * read-only SPTE if pte_access is zero.
+ * Drop the SPTE if the new protections result in no effective
+ * "present" bit or if the gfn is changing. The former case
+ * only affects EPT with execute-only support with pte_access==0;
+ * all other paging modes will create a read-only SPTE if
+ * pte_access is zero.
*/
- if ((!pte_access && !shadow_present_mask) ||
+ if ((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE ||
gfn != kvm_mmu_page_get_gfn(sp, i)) {
drop_spte(vcpu->kvm, &sp->spt[i]);
return 1;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 4a599130e9c9..a5e014d7bc62 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -74,10 +74,10 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
u64 spte = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
- WARN_ON_ONCE(!shadow_mmio_value);
+ WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value);
access &= shadow_mmio_access_mask;
- spte |= shadow_mmio_value | access;
+ spte |= vcpu->kvm->arch.shadow_mmio_value | access;
spte |= gpa | shadow_nonpresent_or_rsvd_mask;
spte |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< SHADOW_NONPRESENT_OR_RSVD_MASK_LEN;
@@ -144,19 +144,19 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 spte = SPTE_MMU_PRESENT_MASK;
bool wrprot = false;
- WARN_ON_ONCE(!pte_access && !shadow_present_mask);
+ /*
+ * For the EPT case, shadow_present_mask has no RWX bits set if
+ * exec-only page table entries are supported. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE);
if (sp->role.ad_disabled)
spte |= SPTE_TDP_AD_DISABLED;
else if (kvm_mmu_page_ad_need_write_protect(sp))
spte |= SPTE_TDP_AD_WRPROT_ONLY;
- /*
- * For the EPT case, shadow_present_mask is 0 if hardware
- * supports exec-only page table entries. In that case,
- * ACC_USER_MASK and shadow_user_mask are used to represent
- * read access. See FNAME(gpte_access) in paging_tmpl.h.
- */
spte |= shadow_present_mask;
if (!prefetch)
spte |= spte_shadow_accessed_mask(spte);
@@ -322,22 +322,6 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
return spte;
}
-u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
-{
- u64 new_spte;
-
- new_spte = old_spte & ~SPTE_BASE_ADDR_MASK;
- new_spte |= (u64)new_pfn << PAGE_SHIFT;
-
- new_spte &= ~PT_WRITABLE_MASK;
- new_spte &= ~shadow_host_writable_mask;
- new_spte &= ~shadow_mmu_writable_mask;
-
- new_spte = mark_spte_for_access_track(new_spte);
-
- return new_spte;
-}
-
u64 mark_spte_for_access_track(u64 spte)
{
if (spte_ad_enabled(spte))
@@ -429,7 +413,9 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull;
shadow_nx_mask = 0ull;
shadow_x_mask = VMX_EPT_EXECUTABLE_MASK;
- shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK;
+ /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
+ shadow_present_mask =
+ (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
/*
* EPT overrides the host MTRRs, and so KVM must program the desired
* memtype directly into the SPTEs. Note, this mask is just the mask
@@ -446,7 +432,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
* of an EPT paging-structure entry is 110b (write/execute).
*/
kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
- VMX_EPT_RWX_MASK, 0);
+ VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0);
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks);
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index a129951c9a88..5dd5405fa07a 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -149,6 +149,22 @@ static_assert(MMIO_SPTE_GEN_LOW_BITS == 8 && MMIO_SPTE_GEN_HIGH_BITS == 11);
#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
+/*
+ * Non-present SPTE value needs to set bit 63 for TDX, in order to suppress
+ * #VE and get EPT violations on non-present PTEs. We can use the
+ * same value also without TDX for both VMX and SVM:
+ *
+ * For SVM NPT, for non-present spte (bit 0 = 0), other bits are ignored.
+ * For VMX EPT, bit 63 is ignored if #VE is disabled. (EPT_VIOLATION_VE=0)
+ * bit 63 is #VE suppress if #VE is enabled. (EPT_VIOLATION_VE=1)
+ */
+#ifdef CONFIG_X86_64
+#define SHADOW_NONPRESENT_VALUE BIT_ULL(63)
+static_assert(!(SHADOW_NONPRESENT_VALUE & SPTE_MMU_PRESENT_MASK));
+#else
+#define SHADOW_NONPRESENT_VALUE 0ULL
+#endif
+
extern u64 __read_mostly shadow_host_writable_mask;
extern u64 __read_mostly shadow_mmu_writable_mask;
extern u64 __read_mostly shadow_nx_mask;
@@ -190,11 +206,11 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*
* Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
* both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
- * vulnerability. Use only low bits to avoid 64-bit immediates.
+ * vulnerability.
*
* Only used by the TDP MMU.
*/
-#define REMOVED_SPTE 0x5a0ULL
+#define REMOVED_SPTE (SHADOW_NONPRESENT_VALUE | 0x5a0ULL)
/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
@@ -249,9 +265,9 @@ static inline struct kvm_mmu_page *root_to_sp(hpa_t root)
return spte_to_child_sp(root);
}
-static inline bool is_mmio_spte(u64 spte)
+static inline bool is_mmio_spte(struct kvm *kvm, u64 spte)
{
- return (spte & shadow_mmio_mask) == shadow_mmio_value &&
+ return (spte & shadow_mmio_mask) == kvm->arch.shadow_mmio_value &&
likely(enable_mmio_caching);
}
@@ -496,8 +512,6 @@ static inline u64 restore_acc_track_spte(u64 spte)
return spte;
}
-u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn);
-
void __init kvm_mmu_spte_module_init(void);
void kvm_mmu_reset_all_pte_masks(void);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 04c1f0957fea..b5be7e949bd8 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -495,8 +495,8 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* impact the guest since both the former and current SPTEs
* are nonpresent.
*/
- if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
- !is_mmio_spte(new_spte) &&
+ if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
+ !is_mmio_spte(kvm, new_spte) &&
!is_removed_spte(new_spte)))
pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
"should not be replaced with another,\n"
@@ -603,7 +603,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* here since the SPTE is going from non-present to non-present. Use
* the raw write helper to avoid an unnecessary check on volatile bits.
*/
- __kvm_tdp_mmu_write_spte(iter->sptep, 0);
+ __kvm_tdp_mmu_write_spte(iter->sptep, SHADOW_NONPRESENT_VALUE);
return 0;
}
@@ -740,8 +740,8 @@ retry:
continue;
if (!shared)
- tdp_mmu_iter_set_spte(kvm, &iter, 0);
- else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
+ else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
goto retry;
}
}
@@ -808,8 +808,8 @@ bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
return false;
- tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
- sp->gfn, sp->role.level + 1);
+ tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
+ SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
return true;
}
@@ -843,7 +843,7 @@ static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- tdp_mmu_iter_set_spte(kvm, &iter, 0);
+ tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
/*
* Zappings SPTEs in invalid roots doesn't require a TLB flush,
@@ -1028,7 +1028,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
}
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
- if (unlikely(is_mmio_spte(new_spte))) {
+ if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
vcpu->stat.pf_mmio_spte_created++;
trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
new_spte);
@@ -1258,52 +1258,6 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}
-static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
- struct kvm_gfn_range *range)
-{
- u64 new_spte;
-
- /* Huge pages aren't expected to be modified without first being zapped. */
- WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
-
- if (iter->level != PG_LEVEL_4K ||
- !is_shadow_present_pte(iter->old_spte))
- return false;
-
- /*
- * Note, when changing a read-only SPTE, it's not strictly necessary to
- * zero the SPTE before setting the new PFN, but doing so preserves the
- * invariant that the PFN of a present * leaf SPTE can never change.
- * See handle_changed_spte().
- */
- tdp_mmu_iter_set_spte(kvm, iter, 0);
-
- if (!pte_write(range->arg.pte)) {
- new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
- pte_pfn(range->arg.pte));
-
- tdp_mmu_iter_set_spte(kvm, iter, new_spte);
- }
-
- return true;
-}
-
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
- /*
- * No need to handle the remote TLB flush under RCU protection, the
- * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
- * shadow page. See the WARN on pfn_changed in handle_changed_spte().
- */
- return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
-}
-
/*
* Remove write access from all SPTEs at or above min_level that map GFNs
* [start, end). Returns true if an SPTE has been changed and the TLBs need to
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 6e1ea04ca885..58b55e61bd33 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -31,7 +31,6 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
bool flush);
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
-bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
const struct kvm_memory_slot *slot, int min_level);
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 759581bb2128..0623cfaa7bb0 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -23,6 +23,7 @@
#include <asm/pkru.h>
#include <asm/trapnr.h>
#include <asm/fpu/xcr.h>
+#include <asm/fpu/xstate.h>
#include <asm/debugreg.h>
#include "mmu.h"
@@ -32,22 +33,12 @@
#include "cpuid.h"
#include "trace.h"
-#ifndef CONFIG_KVM_AMD_SEV
-/*
- * When this config is not defined, SEV feature is not supported and APIs in
- * this file are not used but this file still gets compiled into the KVM AMD
- * module.
- *
- * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
- * misc_res_type {} defined in linux/misc_cgroup.h.
- *
- * Below macros allow compilation to succeed.
- */
-#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
-#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
-#endif
+#define GHCB_VERSION_MAX 2ULL
+#define GHCB_VERSION_DEFAULT 2ULL
+#define GHCB_VERSION_MIN 1ULL
+
+#define GHCB_HV_FT_SUPPORTED GHCB_HV_FT_SNP
-#ifdef CONFIG_KVM_AMD_SEV
/* enable/disable SEV support */
static bool sev_enabled = true;
module_param_named(sev, sev_enabled, bool, 0444);
@@ -57,13 +48,13 @@ static bool sev_es_enabled = true;
module_param_named(sev_es, sev_es_enabled, bool, 0444);
/* enable/disable SEV-ES DebugSwap support */
-static bool sev_es_debug_swap_enabled = false;
+static bool sev_es_debug_swap_enabled = true;
module_param_named(debug_swap, sev_es_debug_swap_enabled, bool, 0444);
-#else
-#define sev_enabled false
-#define sev_es_enabled false
-#define sev_es_debug_swap_enabled false
-#endif /* CONFIG_KVM_AMD_SEV */
+static u64 sev_supported_vmsa_features;
+
+#define AP_RESET_HOLD_NONE 0
+#define AP_RESET_HOLD_NAE_EVENT 1
+#define AP_RESET_HOLD_MSR_PROTO 2
static u8 sev_enc_bit;
static DECLARE_RWSEM(sev_deactivate_lock);
@@ -113,7 +104,15 @@ static int sev_flush_asids(unsigned int min_asid, unsigned int max_asid)
static inline bool is_mirroring_enc_context(struct kvm *kvm)
{
- return !!to_kvm_svm(kvm)->sev_info.enc_context_owner;
+ return !!to_kvm_sev_info(kvm)->enc_context_owner;
+}
+
+static bool sev_vcpu_has_debug_swap(struct vcpu_svm *svm)
+{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+
+ return sev->vmsa_features & SVM_SEV_FEAT_DEBUG_SWAP;
}
/* Must be called with the sev_bitmap_lock held */
@@ -251,20 +250,44 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
sev_decommission(handle);
}
-static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+static int __sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp,
+ struct kvm_sev_init *data,
+ unsigned long vm_type)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_platform_init_args init_args = {0};
+ bool es_active = vm_type != KVM_X86_SEV_VM;
+ u64 valid_vmsa_features = es_active ? sev_supported_vmsa_features : 0;
int ret;
if (kvm->created_vcpus)
return -EINVAL;
+ if (data->flags)
+ return -EINVAL;
+
+ if (data->vmsa_features & ~valid_vmsa_features)
+ return -EINVAL;
+
+ if (data->ghcb_version > GHCB_VERSION_MAX || (!es_active && data->ghcb_version))
+ return -EINVAL;
+
if (unlikely(sev->active))
return -EINVAL;
sev->active = true;
- sev->es_active = argp->id == KVM_SEV_ES_INIT;
+ sev->es_active = es_active;
+ sev->vmsa_features = data->vmsa_features;
+ sev->ghcb_version = data->ghcb_version;
+
+ /*
+ * Currently KVM supports the full range of mandatory features defined
+ * by version 2 of the GHCB protocol, so default to that for SEV-ES
+ * guests created via KVM_SEV_INIT2.
+ */
+ if (sev->es_active && !sev->ghcb_version)
+ sev->ghcb_version = GHCB_VERSION_DEFAULT;
+
ret = sev_asid_new(sev);
if (ret)
goto e_no_asid;
@@ -276,6 +299,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
+ sev->need_init = false;
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_SEV);
@@ -286,11 +310,53 @@ e_free:
sev_asid_free(sev);
sev->asid = 0;
e_no_asid:
+ sev->vmsa_features = 0;
sev->es_active = false;
sev->active = false;
return ret;
}
+static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_init data = {
+ .vmsa_features = 0,
+ .ghcb_version = 0,
+ };
+ unsigned long vm_type;
+
+ if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM)
+ return -EINVAL;
+
+ vm_type = (argp->id == KVM_SEV_INIT ? KVM_X86_SEV_VM : KVM_X86_SEV_ES_VM);
+
+ /*
+ * KVM_SEV_ES_INIT has been deprecated by KVM_SEV_INIT2, so it will
+ * continue to only ever support the minimal GHCB protocol version.
+ */
+ if (vm_type == KVM_X86_SEV_ES_VM)
+ data.ghcb_version = GHCB_VERSION_MIN;
+
+ return __sev_guest_init(kvm, argp, &data, vm_type);
+}
+
+static int sev_guest_init2(struct kvm *kvm, struct kvm_sev_cmd *argp)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct kvm_sev_init data;
+
+ if (!sev->need_init)
+ return -EINVAL;
+
+ if (kvm->arch.vm_type != KVM_X86_SEV_VM &&
+ kvm->arch.vm_type != KVM_X86_SEV_ES_VM)
+ return -EINVAL;
+
+ if (copy_from_user(&data, u64_to_user_ptr(argp->data), sizeof(data)))
+ return -EFAULT;
+
+ return __sev_guest_init(kvm, argp, &data, kvm->arch.vm_type);
+}
+
static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
{
unsigned int asid = sev_get_asid(kvm);
@@ -339,7 +405,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
memset(&start, 0, sizeof(start));
@@ -383,7 +449,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* return handle to userspace */
params.handle = start.handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params))) {
sev_unbind_asid(kvm, start.handle);
ret = -EFAULT;
goto e_free_session;
@@ -522,7 +588,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
vaddr = params.uaddr;
@@ -580,7 +646,13 @@ e_unpin:
static int sev_es_sync_vmsa(struct vcpu_svm *svm)
{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
struct sev_es_save_area *save = svm->sev_es.vmsa;
+ struct xregs_state *xsave;
+ const u8 *s;
+ u8 *d;
+ int i;
/* Check some debug related fields before encrypting the VMSA */
if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1))
@@ -621,10 +693,44 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xss = svm->vcpu.arch.ia32_xss;
save->dr6 = svm->vcpu.arch.dr6;
- if (sev_es_debug_swap_enabled) {
- save->sev_features |= SVM_SEV_FEAT_DEBUG_SWAP;
- pr_warn_once("Enabling DebugSwap with KVM_SEV_ES_INIT. "
- "This will not work starting with Linux 6.10\n");
+ save->sev_features = sev->vmsa_features;
+
+ /*
+ * Skip FPU and AVX setup with KVM_SEV_ES_INIT to avoid
+ * breaking older measurements.
+ */
+ if (vcpu->kvm->arch.vm_type != KVM_X86_DEFAULT_VM) {
+ xsave = &vcpu->arch.guest_fpu.fpstate->regs.xsave;
+ save->x87_dp = xsave->i387.rdp;
+ save->mxcsr = xsave->i387.mxcsr;
+ save->x87_ftw = xsave->i387.twd;
+ save->x87_fsw = xsave->i387.swd;
+ save->x87_fcw = xsave->i387.cwd;
+ save->x87_fop = xsave->i387.fop;
+ save->x87_ds = 0;
+ save->x87_cs = 0;
+ save->x87_rip = xsave->i387.rip;
+
+ for (i = 0; i < 8; i++) {
+ /*
+ * The format of the x87 save area is undocumented and
+ * definitely not what you would expect. It consists of
+ * an 8*8 bytes area with bytes 0-7, and an 8*2 bytes
+ * area with bytes 8-9 of each register.
+ */
+ d = save->fpreg_x87 + i * 8;
+ s = ((u8 *)xsave->i387.st_space) + i * 16;
+ memcpy(d, s, 8);
+ save->fpreg_x87[64 + i * 2] = s[8];
+ save->fpreg_x87[64 + i * 2 + 1] = s[9];
+ }
+ memcpy(save->fpreg_xmm, xsave->i387.xmm_space, 256);
+
+ s = get_xsave_addr(xsave, XFEATURE_YMM);
+ if (s)
+ memcpy(save->fpreg_ymm, s, 256);
+ else
+ memset(save->fpreg_ymm, 0, 256);
}
pr_debug("Virtual Machine Save Area (VMSA):\n");
@@ -658,13 +764,20 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu,
clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE);
vmsa.reserved = 0;
- vmsa.handle = to_kvm_svm(kvm)->sev_info.handle;
+ vmsa.handle = to_kvm_sev_info(kvm)->handle;
vmsa.address = __sme_pa(svm->sev_es.vmsa);
vmsa.len = PAGE_SIZE;
ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error);
if (ret)
return ret;
+ /*
+ * SEV-ES guests maintain an encrypted version of their FPU
+ * state which is restored and saved on VMRUN and VMEXIT.
+ * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
+ * do xsave/xrstor on it.
+ */
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
vcpu->arch.guest_state_protected = true;
return 0;
}
@@ -695,7 +808,7 @@ static int sev_launch_update_vmsa(struct kvm *kvm, struct kvm_sev_cmd *argp)
static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- void __user *measure = (void __user *)(uintptr_t)argp->data;
+ void __user *measure = u64_to_user_ptr(argp->data);
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_launch_measure data;
struct kvm_sev_launch_measure params;
@@ -715,7 +828,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!params.len)
goto cmd;
- p = (void __user *)(uintptr_t)params.uaddr;
+ p = u64_to_user_ptr(params.uaddr);
if (p) {
if (params.len > SEV_FW_BLOB_MAX_SIZE)
return -EINVAL;
@@ -788,7 +901,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
params.state = data.state;
params.handle = data.handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params, sizeof(params)))
ret = -EFAULT;
return ret;
@@ -953,7 +1066,7 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
+ if (copy_from_user(&debug, u64_to_user_ptr(argp->data), sizeof(debug)))
return -EFAULT;
if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
@@ -1037,7 +1150,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
@@ -1101,7 +1214,7 @@ e_unpin_memory:
static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- void __user *report = (void __user *)(uintptr_t)argp->data;
+ void __user *report = u64_to_user_ptr(argp->data);
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct sev_data_attestation_report data;
struct kvm_sev_attestation_report params;
@@ -1112,7 +1225,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data), sizeof(params)))
return -EFAULT;
memset(&data, 0, sizeof(data));
@@ -1121,7 +1234,7 @@ static int sev_get_attestation_report(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!params.len)
goto cmd;
- p = (void __user *)(uintptr_t)params.uaddr;
+ p = u64_to_user_ptr(params.uaddr);
if (p) {
if (params.len > SEV_FW_BLOB_MAX_SIZE)
return -EINVAL;
@@ -1174,7 +1287,7 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
params->session_len = data.session_len;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ if (copy_to_user(u64_to_user_ptr(argp->data), params,
sizeof(struct kvm_sev_send_start)))
ret = -EFAULT;
@@ -1193,7 +1306,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_send_start)))
return -EFAULT;
@@ -1248,7 +1361,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
- if (!ret && copy_to_user((void __user *)(uintptr_t)params.session_uaddr,
+ if (!ret && copy_to_user(u64_to_user_ptr(params.session_uaddr),
session_data, params.session_len)) {
ret = -EFAULT;
goto e_free_amd_cert;
@@ -1256,7 +1369,7 @@ static int sev_send_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
params.policy = data.policy;
params.session_len = data.session_len;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, &params,
+ if (copy_to_user(u64_to_user_ptr(argp->data), &params,
sizeof(struct kvm_sev_send_start)))
ret = -EFAULT;
@@ -1287,7 +1400,7 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
params->hdr_len = data.hdr_len;
params->trans_len = data.trans_len;
- if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
+ if (copy_to_user(u64_to_user_ptr(argp->data), params,
sizeof(struct kvm_sev_send_update_data)))
ret = -EFAULT;
@@ -1307,7 +1420,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_send_update_data)))
return -EFAULT;
@@ -1358,14 +1471,14 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
goto e_free_trans_data;
/* copy transport buffer to user space */
- if (copy_to_user((void __user *)(uintptr_t)params.trans_uaddr,
+ if (copy_to_user(u64_to_user_ptr(params.trans_uaddr),
trans_data, params.trans_len)) {
ret = -EFAULT;
goto e_free_trans_data;
}
/* Copy packet header to userspace. */
- if (copy_to_user((void __user *)(uintptr_t)params.hdr_uaddr, hdr,
+ if (copy_to_user(u64_to_user_ptr(params.hdr_uaddr), hdr,
params.hdr_len))
ret = -EFAULT;
@@ -1417,7 +1530,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
return -ENOTTY;
/* Get parameter from the userspace */
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_receive_start)))
return -EFAULT;
@@ -1459,7 +1572,7 @@ static int sev_receive_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
}
params.handle = start.handle;
- if (copy_to_user((void __user *)(uintptr_t)argp->data,
+ if (copy_to_user(u64_to_user_ptr(argp->data),
&params, sizeof(struct kvm_sev_receive_start))) {
ret = -EFAULT;
sev_unbind_asid(kvm, start.handle);
@@ -1490,7 +1603,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -EINVAL;
- if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data,
+ if (copy_from_user(&params, u64_to_user_ptr(argp->data),
sizeof(struct kvm_sev_receive_update_data)))
return -EFAULT;
@@ -1705,6 +1818,7 @@ static void sev_migrate_from(struct kvm *dst_kvm, struct kvm *src_kvm)
dst->pages_locked = src->pages_locked;
dst->enc_context_owner = src->enc_context_owner;
dst->es_active = src->es_active;
+ dst->vmsa_features = src->vmsa_features;
src->asid = 0;
src->active = false;
@@ -1812,7 +1926,8 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)
if (ret)
goto out_fput;
- if (sev_guest(kvm) || !sev_guest(source_kvm)) {
+ if (kvm->arch.vm_type != source_kvm->arch.vm_type ||
+ sev_guest(kvm) || !sev_guest(source_kvm)) {
ret = -EINVAL;
goto out_unlock;
}
@@ -1861,6 +1976,21 @@ out_fput:
return ret;
}
+int sev_dev_get_attr(u32 group, u64 attr, u64 *val)
+{
+ if (group != KVM_X86_GRP_SEV)
+ return -ENXIO;
+
+ switch (attr) {
+ case KVM_X86_SEV_VMSA_FEATURES:
+ *val = sev_supported_vmsa_features;
+ return 0;
+
+ default:
+ return -ENXIO;
+ }
+}
+
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
struct kvm_sev_cmd sev_cmd;
@@ -1894,6 +2024,9 @@ int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
case KVM_SEV_INIT:
r = sev_guest_init(kvm, &sev_cmd);
break;
+ case KVM_SEV_INIT2:
+ r = sev_guest_init2(kvm, &sev_cmd);
+ break;
case KVM_SEV_LAUNCH_START:
r = sev_launch_start(kvm, &sev_cmd);
break;
@@ -2121,6 +2254,7 @@ int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd)
mirror_sev->asid = source_sev->asid;
mirror_sev->fd = source_sev->fd;
mirror_sev->es_active = source_sev->es_active;
+ mirror_sev->need_init = false;
mirror_sev->handle = source_sev->handle;
INIT_LIST_HEAD(&mirror_sev->regions_list);
INIT_LIST_HEAD(&mirror_sev->mirror_vms);
@@ -2186,15 +2320,18 @@ void sev_vm_destroy(struct kvm *kvm)
void __init sev_set_cpu_caps(void)
{
- if (!sev_enabled)
- kvm_cpu_cap_clear(X86_FEATURE_SEV);
- if (!sev_es_enabled)
- kvm_cpu_cap_clear(X86_FEATURE_SEV_ES);
+ if (sev_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_VM);
+ }
+ if (sev_es_enabled) {
+ kvm_cpu_cap_set(X86_FEATURE_SEV_ES);
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SEV_ES_VM);
+ }
}
void __init sev_hardware_setup(void)
{
-#ifdef CONFIG_KVM_AMD_SEV
unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
bool sev_es_supported = false;
bool sev_supported = false;
@@ -2294,7 +2431,10 @@ out:
if (!sev_es_enabled || !cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP) ||
!cpu_feature_enabled(X86_FEATURE_NO_NESTED_DATA_BP))
sev_es_debug_swap_enabled = false;
-#endif
+
+ sev_supported_vmsa_features = 0;
+ if (sev_es_debug_swap_enabled)
+ sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
}
void sev_hardware_unsetup(void)
@@ -2585,6 +2725,8 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
case SVM_VMGEXIT_AP_HLT_LOOP:
case SVM_VMGEXIT_AP_JUMP_TABLE:
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
+ case SVM_VMGEXIT_HV_FEATURES:
+ case SVM_VMGEXIT_TERM_REQUEST:
break;
default:
reason = GHCB_ERR_INVALID_EVENT;
@@ -2615,6 +2757,9 @@ vmgexit_err:
void sev_es_unmap_ghcb(struct vcpu_svm *svm)
{
+ /* Clear any indication that the vCPU is in a type of AP Reset Hold */
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NONE;
+
if (!svm->sev_es.ghcb)
return;
@@ -2774,6 +2919,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
u64 ghcb_info;
int ret = 1;
@@ -2784,7 +2930,7 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
switch (ghcb_info) {
case GHCB_MSR_SEV_INFO_REQ:
- set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
break;
@@ -2826,6 +2972,28 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
GHCB_MSR_INFO_POS);
break;
}
+ case GHCB_MSR_AP_RESET_HOLD_REQ:
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_MSR_PROTO;
+ ret = kvm_emulate_ap_reset_hold(&svm->vcpu);
+
+ /*
+ * Preset the result to a non-SIPI return and then only set
+ * the result to non-zero when delivering a SIPI.
+ */
+ set_ghcb_msr_bits(svm, 0,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
+
+ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ case GHCB_MSR_HV_FT_REQ:
+ set_ghcb_msr_bits(svm, GHCB_HV_FT_SUPPORTED,
+ GHCB_MSR_HV_FT_MASK, GHCB_MSR_HV_FT_POS);
+ set_ghcb_msr_bits(svm, GHCB_MSR_HV_FT_RESP,
+ GHCB_MSR_INFO_MASK, GHCB_MSR_INFO_POS);
+ break;
case GHCB_MSR_TERM_REQ: {
u64 reason_set, reason_code;
@@ -2925,6 +3093,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ret = 1;
break;
case SVM_VMGEXIT_AP_HLT_LOOP:
+ svm->sev_es.ap_reset_hold_type = AP_RESET_HOLD_NAE_EVENT;
ret = kvm_emulate_ap_reset_hold(vcpu);
break;
case SVM_VMGEXIT_AP_JUMP_TABLE: {
@@ -2949,6 +3118,19 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
ret = 1;
break;
}
+ case SVM_VMGEXIT_HV_FEATURES:
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_HV_FT_SUPPORTED);
+
+ ret = 1;
+ break;
+ case SVM_VMGEXIT_TERM_REQUEST:
+ pr_info("SEV-ES guest requested termination: reason %#llx info %#llx\n",
+ control->exit_info_1, control->exit_info_2);
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+ break;
case SVM_VMGEXIT_UNSUPPORTED_EVENT:
vcpu_unimpl(vcpu,
"vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
@@ -3063,7 +3245,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm)
svm_set_intercept(svm, TRAP_CR8_WRITE);
vmcb->control.intercepts[INTERCEPT_DR] = 0;
- if (!sev_es_debug_swap_enabled) {
+ if (!sev_vcpu_has_debug_swap(svm)) {
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);
recalc_intercepts(svm);
@@ -3109,16 +3291,19 @@ void sev_init_vmcb(struct vcpu_svm *svm)
void sev_es_vcpu_reset(struct vcpu_svm *svm)
{
+ struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
+
/*
* Set the GHCB MSR value as per the GHCB specification when emulating
* vCPU RESET for an SEV-ES guest.
*/
- set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
+ set_ghcb_msr(svm, GHCB_MSR_SEV_INFO((__u64)sev->ghcb_version,
GHCB_VERSION_MIN,
sev_enc_bit));
}
-void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
+void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
{
/*
* All host state for SEV-ES guests is categorized into three swap types
@@ -3146,7 +3331,7 @@ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
* the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
* saves and loads debug registers (Type-A).
*/
- if (sev_es_debug_swap_enabled) {
+ if (sev_vcpu_has_debug_swap(svm)) {
hostsa->dr0 = native_get_debugreg(0);
hostsa->dr1 = native_get_debugreg(1);
hostsa->dr2 = native_get_debugreg(2);
@@ -3168,15 +3353,31 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
return;
}
- /*
- * Subsequent SIPI: Return from an AP Reset Hold VMGEXIT, where
- * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a
- * non-zero value.
- */
- if (!svm->sev_es.ghcb)
- return;
+ /* Subsequent SIPI */
+ switch (svm->sev_es.ap_reset_hold_type) {
+ case AP_RESET_HOLD_NAE_EVENT:
+ /*
+ * Return from an AP Reset Hold VMGEXIT, where the guest will
+ * set the CS and RIP. Set SW_EXIT_INFO_2 to a non-zero value.
+ */
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
+ break;
+ case AP_RESET_HOLD_MSR_PROTO:
+ /*
+ * Return from an AP Reset Hold VMGEXIT, where the guest will
+ * set the CS and RIP. Set GHCB data field to a non-zero value.
+ */
+ set_ghcb_msr_bits(svm, 1,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_MASK,
+ GHCB_MSR_AP_RESET_HOLD_RESULT_POS);
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
+ set_ghcb_msr_bits(svm, GHCB_MSR_AP_RESET_HOLD_RESP,
+ GHCB_MSR_INFO_MASK,
+ GHCB_MSR_INFO_POS);
+ break;
+ default:
+ break;
+ }
}
struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9aaf83c8d57d..c8dc25886c16 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1433,14 +1433,6 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
vmsa_page = snp_safe_alloc_page(vcpu);
if (!vmsa_page)
goto error_free_vmcb_page;
-
- /*
- * SEV-ES guests maintain an encrypted version of their FPU
- * state which is restored and saved on VMRUN and VMEXIT.
- * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
- * do xsave/xrstor on it.
- */
- fpstate_set_confidential(&vcpu->arch.guest_fpu);
}
err = avic_init_vcpu(svm);
@@ -1525,7 +1517,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
*/
vmsave(sd->save_area_pa);
if (sev_es_guest(vcpu->kvm))
- sev_es_prepare_switch_to_guest(sev_es_host_save_area(sd));
+ sev_es_prepare_switch_to_guest(svm, sev_es_host_save_area(sd));
if (tsc_scaling)
__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
@@ -2056,6 +2048,15 @@ static int npf_interception(struct kvm_vcpu *vcpu)
u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;
+ /*
+ * WARN if hardware generates a fault with an error code that collides
+ * with KVM-defined sythentic flags. Clear the flags and continue on,
+ * i.e. don't terminate the VM, as KVM can't possibly be relying on a
+ * flag that KVM doesn't know about.
+ */
+ if (WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK))
+ error_code &= ~PFERR_SYNTHETIC_MASK;
+
trace_kvm_page_fault(vcpu, fault_address, error_code);
return kvm_mmu_page_fault(vcpu, fault_address, error_code,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
@@ -3304,7 +3305,9 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
+#ifdef CONFIG_KVM_AMD_SEV
[SVM_EXIT_VMGEXIT] = sev_handle_vmgexit,
+#endif
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
@@ -4085,6 +4088,9 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
+ if (to_kvm_sev_info(vcpu->kvm)->need_init)
+ return -EINVAL;
+
return 1;
}
@@ -4892,6 +4898,14 @@ static void svm_vm_destroy(struct kvm *kvm)
static int svm_vm_init(struct kvm *kvm)
{
+ int type = kvm->arch.vm_type;
+
+ if (type != KVM_X86_DEFAULT_VM &&
+ type != KVM_X86_SW_PROTECTED_VM) {
+ kvm->arch.has_protected_state = (type == KVM_X86_SEV_ES_VM);
+ to_kvm_sev_info(kvm)->need_init = true;
+ }
+
if (!pause_filter_count || !pause_filter_thresh)
kvm->arch.pause_in_guest = true;
@@ -5026,6 +5040,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_smi_window = svm_enable_smi_window,
#endif
+#ifdef CONFIG_KVM_AMD_SEV
+ .dev_get_attr = sev_dev_get_attr,
.mem_enc_ioctl = sev_mem_enc_ioctl,
.mem_enc_register_region = sev_mem_enc_register_region,
.mem_enc_unregister_region = sev_mem_enc_unregister_region,
@@ -5033,7 +5049,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vm_copy_enc_context_from = sev_vm_copy_enc_context_from,
.vm_move_enc_context_from = sev_vm_move_enc_context_from,
-
+#endif
.check_emulate_instruction = svm_check_emulate_instruction,
.apic_init_signal_blocked = svm_apic_init_signal_blocked,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 33878efdebc8..be57213cd295 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -79,12 +79,15 @@ enum {
struct kvm_sev_info {
bool active; /* SEV enabled guest */
bool es_active; /* SEV-ES enabled guest */
+ bool need_init; /* waiting for SEV_INIT2 */
unsigned int asid; /* ASID used for this guest */
unsigned int handle; /* SEV firmware handle */
int fd; /* SEV device fd */
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
+ u64 vmsa_features;
+ u16 ghcb_version; /* Highest guest GHCB protocol version allowed */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
struct list_head mirror_vms; /* List of VMs mirroring */
struct list_head mirror_entry; /* Use as a list entry of mirrors */
@@ -197,6 +200,7 @@ struct vcpu_sev_es_state {
u8 valid_bitmap[16];
struct kvm_host_map ghcb_map;
bool received_first_sipi;
+ unsigned int ap_reset_hold_type;
/* SEV-ES scratch area support */
u64 sw_scratch;
@@ -318,6 +322,11 @@ static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
return container_of(kvm, struct kvm_svm, kvm);
}
+static __always_inline struct kvm_sev_info *to_kvm_sev_info(struct kvm *kvm)
+{
+ return &to_kvm_svm(kvm)->sev_info;
+}
+
static __always_inline bool sev_guest(struct kvm *kvm)
{
#ifdef CONFIG_KVM_AMD_SEV
@@ -664,13 +673,16 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
/* sev.c */
-#define GHCB_VERSION_MAX 1ULL
-#define GHCB_VERSION_MIN 1ULL
-
-
-extern unsigned int max_sev_asid;
+void pre_sev_run(struct vcpu_svm *svm, int cpu);
+void sev_init_vmcb(struct vcpu_svm *svm);
+void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
+int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
+void sev_es_vcpu_reset(struct vcpu_svm *svm);
+void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
+void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa);
+void sev_es_unmap_ghcb(struct vcpu_svm *svm);
-void sev_vm_destroy(struct kvm *kvm);
+#ifdef CONFIG_KVM_AMD_SEV
int sev_mem_enc_ioctl(struct kvm *kvm, void __user *argp);
int sev_mem_enc_register_region(struct kvm *kvm,
struct kvm_enc_region *range);
@@ -679,22 +691,32 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
int sev_vm_copy_enc_context_from(struct kvm *kvm, unsigned int source_fd);
int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd);
void sev_guest_memory_reclaimed(struct kvm *kvm);
+int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
-void pre_sev_run(struct vcpu_svm *svm, int cpu);
+/* These symbols are used in common code and are stubbed below. */
+struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
+void sev_free_vcpu(struct kvm_vcpu *vcpu);
+void sev_vm_destroy(struct kvm *kvm);
void __init sev_set_cpu_caps(void);
void __init sev_hardware_setup(void);
void sev_hardware_unsetup(void);
int sev_cpu_init(struct svm_cpu_data *sd);
-void sev_init_vmcb(struct vcpu_svm *svm);
-void sev_vcpu_after_set_cpuid(struct vcpu_svm *svm);
-void sev_free_vcpu(struct kvm_vcpu *vcpu);
-int sev_handle_vmgexit(struct kvm_vcpu *vcpu);
-int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in);
-void sev_es_vcpu_reset(struct vcpu_svm *svm);
-void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
-void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
-void sev_es_unmap_ghcb(struct vcpu_svm *svm);
-struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
+int sev_dev_get_attr(u32 group, u64 attr, u64 *val);
+extern unsigned int max_sev_asid;
+#else
+static inline struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu) {
+ return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+}
+
+static inline void sev_free_vcpu(struct kvm_vcpu *vcpu) {}
+static inline void sev_vm_destroy(struct kvm *kvm) {}
+static inline void __init sev_set_cpu_caps(void) {}
+static inline void __init sev_hardware_setup(void) {}
+static inline void sev_hardware_unsetup(void) {}
+static inline int sev_cpu_init(struct svm_cpu_data *sd) { return 0; }
+static inline int sev_dev_get_attr(u32 group, u64 attr, u64 *val) { return -ENXIO; }
+#define max_sev_asid 0
+#endif
/* vmenter.S */
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
new file mode 100644
index 000000000000..7c546ad3e4c9
--- /dev/null
+++ b/arch/x86/kvm/vmx/main.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/moduleparam.h>
+
+#include "x86_ops.h"
+#include "vmx.h"
+#include "nested.h"
+#include "pmu.h"
+
+#define VMX_REQUIRED_APICV_INHIBITS \
+ (BIT(APICV_INHIBIT_REASON_DISABLE)| \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED))
+
+struct kvm_x86_ops vt_x86_ops __initdata = {
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = vmx_check_processor_compat,
+
+ .hardware_unsetup = vmx_hardware_unsetup,
+
+ .hardware_enable = vmx_hardware_enable,
+ .hardware_disable = vmx_hardware_disable,
+ .has_emulated_msr = vmx_has_emulated_msr,
+
+ .vm_size = sizeof(struct kvm_vmx),
+ .vm_init = vmx_vm_init,
+ .vm_destroy = vmx_vm_destroy,
+
+ .vcpu_precreate = vmx_vcpu_precreate,
+ .vcpu_create = vmx_vcpu_create,
+ .vcpu_free = vmx_vcpu_free,
+ .vcpu_reset = vmx_vcpu_reset,
+
+ .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
+ .update_exception_bitmap = vmx_update_exception_bitmap,
+ .get_msr_feature = vmx_get_msr_feature,
+ .get_msr = vmx_get_msr,
+ .set_msr = vmx_set_msr,
+ .get_segment_base = vmx_get_segment_base,
+ .get_segment = vmx_get_segment,
+ .set_segment = vmx_set_segment,
+ .get_cpl = vmx_get_cpl,
+ .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .is_valid_cr0 = vmx_is_valid_cr0,
+ .set_cr0 = vmx_set_cr0,
+ .is_valid_cr4 = vmx_is_valid_cr4,
+ .set_cr4 = vmx_set_cr4,
+ .set_efer = vmx_set_efer,
+ .get_idt = vmx_get_idt,
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+ .set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+ .cache_reg = vmx_cache_reg,
+ .get_rflags = vmx_get_rflags,
+ .set_rflags = vmx_set_rflags,
+ .get_if_flag = vmx_get_if_flag,
+
+ .flush_tlb_all = vmx_flush_tlb_all,
+ .flush_tlb_current = vmx_flush_tlb_current,
+ .flush_tlb_gva = vmx_flush_tlb_gva,
+ .flush_tlb_guest = vmx_flush_tlb_guest,
+
+ .vcpu_pre_run = vmx_vcpu_pre_run,
+ .vcpu_run = vmx_vcpu_run,
+ .handle_exit = vmx_handle_exit,
+ .skip_emulated_instruction = vmx_skip_emulated_instruction,
+ .update_emulated_instruction = vmx_update_emulated_instruction,
+ .set_interrupt_shadow = vmx_set_interrupt_shadow,
+ .get_interrupt_shadow = vmx_get_interrupt_shadow,
+ .patch_hypercall = vmx_patch_hypercall,
+ .inject_irq = vmx_inject_irq,
+ .inject_nmi = vmx_inject_nmi,
+ .inject_exception = vmx_inject_exception,
+ .cancel_injection = vmx_cancel_injection,
+ .interrupt_allowed = vmx_interrupt_allowed,
+ .nmi_allowed = vmx_nmi_allowed,
+ .get_nmi_mask = vmx_get_nmi_mask,
+ .set_nmi_mask = vmx_set_nmi_mask,
+ .enable_nmi_window = vmx_enable_nmi_window,
+ .enable_irq_window = vmx_enable_irq_window,
+ .update_cr8_intercept = vmx_update_cr8_intercept,
+ .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
+ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
+ .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
+ .hwapic_irr_update = vmx_hwapic_irr_update,
+ .hwapic_isr_update = vmx_hwapic_isr_update,
+ .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
+ .sync_pir_to_irr = vmx_sync_pir_to_irr,
+ .deliver_interrupt = vmx_deliver_interrupt,
+ .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
+
+ .set_tss_addr = vmx_set_tss_addr,
+ .set_identity_map_addr = vmx_set_identity_map_addr,
+ .get_mt_mask = vmx_get_mt_mask,
+
+ .get_exit_info = vmx_get_exit_info,
+
+ .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
+ .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
+ .write_tsc_offset = vmx_write_tsc_offset,
+ .write_tsc_multiplier = vmx_write_tsc_multiplier,
+
+ .load_mmu_pgd = vmx_load_mmu_pgd,
+
+ .check_intercept = vmx_check_intercept,
+ .handle_exit_irqoff = vmx_handle_exit_irqoff,
+
+ .sched_in = vmx_sched_in,
+
+ .cpu_dirty_log_size = PML_ENTITY_NUM,
+ .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
+
+ .nested_ops = &vmx_nested_ops,
+
+ .pi_update_irte = vmx_pi_update_irte,
+ .pi_start_assignment = vmx_pi_start_assignment,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+ .setup_mce = vmx_setup_mce,
+
+#ifdef CONFIG_KVM_SMM
+ .smi_allowed = vmx_smi_allowed,
+ .enter_smm = vmx_enter_smm,
+ .leave_smm = vmx_leave_smm,
+ .enable_smi_window = vmx_enable_smi_window,
+#endif
+
+ .check_emulate_instruction = vmx_check_emulate_instruction,
+ .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+ .migrate_timers = vmx_migrate_timers,
+
+ .msr_filter_changed = vmx_msr_filter_changed,
+ .complete_emulated_msr = kvm_complete_insn_gp,
+
+ .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
+
+ .get_untagged_addr = vmx_get_untagged_addr,
+};
+
+struct kvm_x86_init_ops vt_init_ops __initdata = {
+ .hardware_setup = vmx_hardware_setup,
+ .handle_intel_pt_intr = NULL,
+
+ .runtime_ops = &vt_x86_ops,
+ .pmu_ops = &intel_pmu_ops,
+};
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 7c1996b433e2..b25625314658 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -140,6 +140,11 @@ static inline bool is_nm_fault(u32 intr_info)
return is_exception_n(intr_info, NM_VECTOR);
}
+static inline bool is_ve_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, VE_VECTOR);
+}
+
/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 22411f4aff53..ad36e5eb6667 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -68,6 +68,7 @@
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
+#include "x86_ops.h"
#include "smm.h"
#include "vmx_onhyperv.h"
@@ -530,8 +531,6 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base;
#if IS_ENABLED(CONFIG_HYPERV)
-static struct kvm_x86_ops vmx_x86_ops __initdata;
-
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
@@ -581,9 +580,8 @@ static __init void hv_init_evmcs(void)
}
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_l2_tlb_flush
+ vt_x86_ops.enable_l2_tlb_flush
= hv_enable_l2_tlb_flush;
-
} else {
enlightened_vmcs = false;
}
@@ -874,6 +872,12 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
/*
+ * #VE isn't used for VMX. To test against unexpected changes
+ * related to #VE for VMX, intercept unexpected #VE and warn on it.
+ */
+ if (IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ eb |= 1u << VE_VECTOR;
+ /*
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
@@ -1477,7 +1481,7 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
*/
-static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1488,7 +1492,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vmx->host_debugctlmsr = get_debugctlmsr();
}
-static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
+void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);
@@ -1547,7 +1551,7 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmx->emulation_required = vmx_emulation_required(vcpu);
}
-static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}
@@ -1653,8 +1657,8 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
- void *insn, int insn_len)
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
/*
* Emulation of instructions in SGX enclaves is impossible as RIP does
@@ -1738,7 +1742,7 @@ rip_updated:
* Recognizes a pending MTF VM-exit and records the nested state for later
* delivery.
*/
-static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1769,7 +1773,7 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
}
}
-static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
vmx_update_emulated_instruction(vcpu);
return skip_emulated_instruction(vcpu);
@@ -1788,7 +1792,7 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
-static void vmx_inject_exception(struct kvm_vcpu *vcpu)
+void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
struct kvm_queued_exception *ex = &vcpu->arch.exception;
u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
@@ -1909,12 +1913,12 @@ u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
return kvm_caps.default_tsc_scaling_ratio;
}
-static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
{
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
}
-static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
}
@@ -1957,7 +1961,7 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
return !(msr->data & ~valid_bits);
}
-static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
switch (msr->index) {
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
@@ -1974,7 +1978,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
@@ -2155,7 +2159,7 @@ static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmx_uret_msr *msr;
@@ -2458,7 +2462,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return ret;
}
-static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
unsigned long guest_owned_bits;
@@ -2606,6 +2610,9 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_cpu_based_2nd_exec_control))
return -EIO;
}
+ if (!IS_ENABLED(CONFIG_KVM_INTEL_PROVE_VE))
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
+
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
@@ -2630,6 +2637,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmx_cap->ept = 0;
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
vmx_cap->vpid) {
@@ -2759,7 +2767,7 @@ static bool kvm_is_vmx_supported(void)
return supported;
}
-static int vmx_check_processor_compat(void)
+int vmx_check_processor_compat(void)
{
int cpu = raw_smp_processor_id();
struct vmcs_config vmcs_conf;
@@ -2801,7 +2809,7 @@ fault:
return -EFAULT;
}
-static int vmx_hardware_enable(void)
+int vmx_hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2841,7 +2849,7 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-static void vmx_hardware_disable(void)
+void vmx_hardware_disable(void)
{
vmclear_local_loaded_vmcss();
@@ -3155,7 +3163,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
#endif
-static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3185,7 +3193,7 @@ static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
return to_vmx(vcpu)->vpid;
}
-static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
u64 root_hpa = mmu->root.hpa;
@@ -3201,7 +3209,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
* vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
@@ -3210,7 +3218,7 @@ static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
-static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
* vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
@@ -3255,7 +3263,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
CPU_BASED_CR3_STORE_EXITING)
-static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
if (is_guest_mode(vcpu))
return nested_guest_cr0_valid(vcpu, cr0);
@@ -3376,8 +3384,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
return eptp;
}
-static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
- int root_level)
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
@@ -3406,8 +3413,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
vmcs_writel(GUEST_CR3, guest_cr3);
}
-
-static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
/*
* We operate under the default treatment of SMM, so VMX cannot be
@@ -3523,7 +3529,7 @@ void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
var->g = (ar >> 15) & 1;
}
-static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_segment s;
@@ -3600,14 +3606,14 @@ void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
}
-static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
__vmx_set_segment(vcpu, var, seg);
to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
}
-static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
@@ -3615,25 +3621,25 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
*l = (ar >> 13) & 1;
}
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
dt->address = vmcs_readl(GUEST_IDTR_BASE);
}
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
vmcs_writel(GUEST_IDTR_BASE, dt->address);
}
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
dt->address = vmcs_readl(GUEST_GDTR_BASE);
}
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
vmcs_writel(GUEST_GDTR_BASE, dt->address);
@@ -4101,7 +4107,7 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
}
}
-static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
+bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
void *vapic_page;
@@ -4121,7 +4127,7 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
return ((rvi & 0xf0) > (vppr & 0xf0));
}
-static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 i;
@@ -4265,8 +4271,8 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
return 0;
}
-static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
- int trig_mode, int vector)
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector)
{
struct kvm_vcpu *vcpu = apic->vcpu;
@@ -4428,7 +4434,7 @@ static u32 vmx_vmexit_ctrl(void)
~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
}
-static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4594,6 +4600,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+ exec_control &= ~SECONDARY_EXEC_EPT_VIOLATION_VE;
enable_unrestricted_guest = 0;
}
if (!enable_unrestricted_guest)
@@ -4692,7 +4699,7 @@ static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
return 0;
}
-static int vmx_vcpu_precreate(struct kvm *kvm)
+int vmx_vcpu_precreate(struct kvm *kvm)
{
return vmx_alloc_ipiv_pid_table(kvm);
}
@@ -4717,8 +4724,12 @@ static void init_vmcs(struct vcpu_vmx *vmx)
exec_controls_set(vmx, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls())
+ if (cpu_has_secondary_exec_ctrls()) {
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
+ if (vmx->ve_info)
+ vmcs_write64(VE_INFORMATION_ADDRESS,
+ __pa(vmx->ve_info));
+ }
if (cpu_has_tertiary_exec_ctrls())
tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx));
@@ -4847,7 +4858,7 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->pi_desc.sn = 1;
}
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4906,12 +4917,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_update_fb_clear_dis(vcpu, vmx);
}
-static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
{
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
-static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
{
if (!enable_vnmi ||
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
@@ -4922,7 +4933,7 @@ static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
uint32_t intr;
@@ -4950,7 +4961,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
vmx_clear_hlt(vcpu);
}
-static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
+void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5028,7 +5039,7 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
GUEST_INTR_STATE_NMI));
}
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
@@ -5050,7 +5061,7 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
if (to_vmx(vcpu)->nested.nested_run_pending)
return -EBUSY;
@@ -5065,7 +5076,7 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !vmx_interrupt_blocked(vcpu);
}
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
void __user *ret;
@@ -5085,7 +5096,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
return init_rmode_tss(kvm, ret);
}
-static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
return 0;
@@ -5206,6 +5217,9 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
if (is_invalid_opcode(intr_info))
return handle_ud(vcpu);
+ if (KVM_BUG_ON(is_ve_fault(intr_info), vcpu->kvm))
+ return -EIO;
+
error_code = 0;
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
@@ -5371,8 +5385,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
return kvm_fast_pio(vcpu, size, port, in);
}
-static void
-vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
/*
* Patch in the VMCALL instruction:
@@ -5578,7 +5591,7 @@ out:
return kvm_complete_insn_gp(vcpu, err);
}
-static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
@@ -5597,7 +5610,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
set_debugreg(DR6_RESERVED, 6);
}
-static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
{
vmcs_writel(GUEST_DR7, val);
}
@@ -5868,7 +5881,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1;
}
-static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
if (vmx_emulation_required_with_pending_exception(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
@@ -6156,9 +6169,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
- u64 *info1, u64 *info2,
- u32 *intr_info, u32 *error_code)
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6416,6 +6428,24 @@ void dump_vmcs(struct kvm_vcpu *vcpu)
if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
pr_err("Virtual processor ID = 0x%04x\n",
vmcs_read16(VIRTUAL_PROCESSOR_ID));
+ if (secondary_exec_control & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct vmx_ve_information *ve_info = vmx->ve_info;
+ u64 ve_info_pa = vmcs_read64(VE_INFORMATION_ADDRESS);
+
+ /*
+ * If KVM is dumping the VMCS, then something has gone wrong
+ * already. Derefencing an address from the VMCS, which could
+ * very well be corrupted, is a terrible idea. The virtual
+ * address is known so use it.
+ */
+ pr_err("VE info address = 0x%016llx%s\n", ve_info_pa,
+ ve_info_pa == __pa(ve_info) ? "" : "(corrupted!)");
+ pr_err("ve_info: 0x%08x 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%04x\n",
+ ve_info->exit_reason, ve_info->delivery,
+ ve_info->exit_qualification,
+ ve_info->guest_linear_address,
+ ve_info->guest_physical_address, ve_info->eptp_index);
+ }
}
/*
@@ -6601,7 +6631,7 @@ unexpected_vmexit:
return 0;
}
-static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
int ret = __vmx_handle_exit(vcpu, exit_fastpath);
@@ -6689,7 +6719,7 @@ static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
: "eax", "ebx", "ecx", "edx");
}
-static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
int tpr_threshold;
@@ -6759,7 +6789,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
vmx_update_msr_bitmap_x2apic(vcpu);
}
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
@@ -6828,7 +6858,7 @@ out:
kvm_release_pfn_clean(pfn);
}
-static void vmx_hwapic_isr_update(int max_isr)
+void vmx_hwapic_isr_update(int max_isr)
{
u16 status;
u8 old;
@@ -6862,7 +6892,7 @@ static void vmx_set_rvi(int vector)
}
}
-static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
/*
* When running L2, updating RVI is only relevant when
@@ -6876,7 +6906,7 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
vmx_set_rvi(max_irr);
}
-static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
@@ -6922,7 +6952,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
return max_irr;
}
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
return;
@@ -6933,7 +6963,7 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
-static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
+void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6964,24 +6994,22 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
-static void handle_exception_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_irqoff(struct kvm_vcpu *vcpu, u32 intr_info)
{
- u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
-
/* if exit due to PF check for async PF */
if (is_page_fault(intr_info))
- vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ vcpu->arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
/* if exit due to NM, handle before interrupts are enabled */
else if (is_nm_fault(intr_info))
- handle_nm_fault_irqoff(&vmx->vcpu);
+ handle_nm_fault_irqoff(vcpu);
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
}
-static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
+static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu,
+ u32 intr_info)
{
- u32 intr_info = vmx_get_intr_info(vcpu);
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
@@ -6998,7 +7026,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
vcpu->arch.at_instruction_boundary = true;
}
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7006,16 +7034,16 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
return;
if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
- handle_external_interrupt_irqoff(vcpu);
+ handle_external_interrupt_irqoff(vcpu, vmx_get_intr_info(vcpu));
else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
- handle_exception_irqoff(vmx);
+ handle_exception_irqoff(vcpu, vmx_get_intr_info(vcpu));
}
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
*/
-static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
@@ -7138,7 +7166,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
IDT_VECTORING_ERROR_CODE);
}
-static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
+void vmx_cancel_injection(struct kvm_vcpu *vcpu)
{
__vmx_complete_interrupts(vcpu,
vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
@@ -7308,7 +7336,7 @@ out:
guest_state_exit_irqoff();
}
-static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
@@ -7463,7 +7491,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
}
-static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
+void vmx_vcpu_free(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -7472,9 +7500,10 @@ static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
free_vpid(vmx->vpid);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
+ free_page((unsigned long)vmx->ve_info);
}
-static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
+int vmx_vcpu_create(struct kvm_vcpu *vcpu)
{
struct vmx_uret_msr *tsx_ctrl;
struct vcpu_vmx *vmx;
@@ -7565,6 +7594,20 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
goto free_vmcs;
}
+ err = -ENOMEM;
+ if (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_EPT_VIOLATION_VE) {
+ struct page *page;
+
+ BUILD_BUG_ON(sizeof(*vmx->ve_info) > PAGE_SIZE);
+
+ /* ve_info must be page aligned. */
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!page)
+ goto free_vmcs;
+
+ vmx->ve_info = page_to_virt(page);
+ }
+
if (vmx_can_use_ipiv(vcpu))
WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
__pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID);
@@ -7583,7 +7626,7 @@ free_vpid:
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
-static int vmx_vm_init(struct kvm *kvm)
+int vmx_vm_init(struct kvm *kvm)
{
if (!ple_gap)
kvm->arch.pause_in_guest = true;
@@ -7614,7 +7657,7 @@ static int vmx_vm_init(struct kvm *kvm)
return 0;
}
-static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
* memory aliases with conflicting memory types and sometimes MCEs.
@@ -7786,7 +7829,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
-static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8001,10 +8044,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
}
-static int vmx_check_intercept(struct kvm_vcpu *vcpu,
- struct x86_instruction_info *info,
- enum x86_intercept_stage stage,
- struct x86_exception *exception)
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -8084,8 +8127,8 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift,
return 0;
}
-static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
- bool *expired)
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired)
{
struct vcpu_vmx *vmx;
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
@@ -8124,13 +8167,13 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
return 0;
}
-static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif
-static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
+void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
if (!kvm_pause_in_guest(vcpu->kvm))
shrink_ple_window(vcpu);
@@ -8159,7 +8202,7 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
-static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
@@ -8170,7 +8213,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
}
#ifdef CONFIG_KVM_SMM
-static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
/* we need a nested vmexit to enter SMM, postpone if run is pending */
if (to_vmx(vcpu)->nested.nested_run_pending)
@@ -8178,7 +8221,7 @@ static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
return !is_smm(vcpu);
}
-static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8199,7 +8242,7 @@ static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram)
return 0;
}
-static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int ret;
@@ -8220,18 +8263,18 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram)
return 0;
}
-static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
/* RSM will cause a vmexit anyway. */
}
#endif
-static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
}
-static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
+void vmx_migrate_timers(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
@@ -8241,7 +8284,7 @@ static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
}
}
-static void vmx_hardware_unsetup(void)
+void vmx_hardware_unsetup(void)
{
kvm_set_posted_intr_wakeup_handler(NULL);
@@ -8251,18 +8294,7 @@ static void vmx_hardware_unsetup(void)
free_kvm_area();
}
-#define VMX_REQUIRED_APICV_INHIBITS \
-( \
- BIT(APICV_INHIBIT_REASON_DISABLE)| \
- BIT(APICV_INHIBIT_REASON_ABSENT) | \
- BIT(APICV_INHIBIT_REASON_HYPERV) | \
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
- BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
-)
-
-static void vmx_vm_destroy(struct kvm *kvm)
+void vmx_vm_destroy(struct kvm *kvm)
{
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
@@ -8313,148 +8345,6 @@ gva_t vmx_get_untagged_addr(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags
return (sign_extend64(gva, lam_bit) & ~BIT_ULL(63)) | (gva & BIT_ULL(63));
}
-static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .name = KBUILD_MODNAME,
-
- .check_processor_compatibility = vmx_check_processor_compat,
-
- .hardware_unsetup = vmx_hardware_unsetup,
-
- .hardware_enable = vmx_hardware_enable,
- .hardware_disable = vmx_hardware_disable,
- .has_emulated_msr = vmx_has_emulated_msr,
-
- .vm_size = sizeof(struct kvm_vmx),
- .vm_init = vmx_vm_init,
- .vm_destroy = vmx_vm_destroy,
-
- .vcpu_precreate = vmx_vcpu_precreate,
- .vcpu_create = vmx_vcpu_create,
- .vcpu_free = vmx_vcpu_free,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_switch_to_guest = vmx_prepare_switch_to_guest,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .update_exception_bitmap = vmx_update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .is_valid_cr0 = vmx_is_valid_cr0,
- .set_cr0 = vmx_set_cr0,
- .is_valid_cr4 = vmx_is_valid_cr4,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .set_dr7 = vmx_set_dr7,
- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
- .get_if_flag = vmx_get_if_flag,
-
- .flush_tlb_all = vmx_flush_tlb_all,
- .flush_tlb_current = vmx_flush_tlb_current,
- .flush_tlb_gva = vmx_flush_tlb_gva,
- .flush_tlb_guest = vmx_flush_tlb_guest,
-
- .vcpu_pre_run = vmx_vcpu_pre_run,
- .vcpu_run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = vmx_skip_emulated_instruction,
- .update_emulated_instruction = vmx_update_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .inject_irq = vmx_inject_irq,
- .inject_nmi = vmx_inject_nmi,
- .inject_exception = vmx_inject_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = vmx_enable_nmi_window,
- .enable_irq_window = vmx_enable_irq_window,
- .update_cr8_intercept = vmx_update_cr8_intercept,
- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vmx_load_eoi_exitmap,
- .apicv_pre_state_restore = vmx_apicv_pre_state_restore,
- .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
- .hwapic_irr_update = vmx_hwapic_irr_update,
- .hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
- .sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_interrupt = vmx_deliver_interrupt,
- .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
-
- .set_tss_addr = vmx_set_tss_addr,
- .set_identity_map_addr = vmx_set_identity_map_addr,
- .get_mt_mask = vmx_get_mt_mask,
-
- .get_exit_info = vmx_get_exit_info,
-
- .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .get_l2_tsc_offset = vmx_get_l2_tsc_offset,
- .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
- .write_tsc_offset = vmx_write_tsc_offset,
- .write_tsc_multiplier = vmx_write_tsc_multiplier,
-
- .load_mmu_pgd = vmx_load_mmu_pgd,
-
- .check_intercept = vmx_check_intercept,
- .handle_exit_irqoff = vmx_handle_exit_irqoff,
-
- .sched_in = vmx_sched_in,
-
- .cpu_dirty_log_size = PML_ENTITY_NUM,
- .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
-
- .nested_ops = &vmx_nested_ops,
-
- .pi_update_irte = vmx_pi_update_irte,
- .pi_start_assignment = vmx_pi_start_assignment,
-
-#ifdef CONFIG_X86_64
- .set_hv_timer = vmx_set_hv_timer,
- .cancel_hv_timer = vmx_cancel_hv_timer,
-#endif
-
- .setup_mce = vmx_setup_mce,
-
-#ifdef CONFIG_KVM_SMM
- .smi_allowed = vmx_smi_allowed,
- .enter_smm = vmx_enter_smm,
- .leave_smm = vmx_leave_smm,
- .enable_smi_window = vmx_enable_smi_window,
-#endif
-
- .check_emulate_instruction = vmx_check_emulate_instruction,
- .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
- .migrate_timers = vmx_migrate_timers,
-
- .msr_filter_changed = vmx_msr_filter_changed,
- .complete_emulated_msr = kvm_complete_insn_gp,
-
- .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
-
- .get_untagged_addr = vmx_get_untagged_addr,
-};
-
static unsigned int vmx_handle_intel_pt_intr(void)
{
struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
@@ -8520,9 +8410,7 @@ static void __init vmx_setup_me_spte_mask(void)
kvm_mmu_set_me_spte_mask(0, me_mask);
}
-static struct kvm_x86_init_ops vmx_init_ops __initdata;
-
-static __init int hardware_setup(void)
+__init int vmx_hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
@@ -8591,16 +8479,16 @@ static __init int hardware_setup(void)
* using the APIC_ACCESS_ADDR VMCS field.
*/
if (!flexpriority_enabled)
- vmx_x86_ops.set_apic_access_page_addr = NULL;
+ vt_x86_ops.set_apic_access_page_addr = NULL;
if (!cpu_has_vmx_tpr_shadow())
- vmx_x86_ops.update_cr8_intercept = NULL;
+ vt_x86_ops.update_cr8_intercept = NULL;
#if IS_ENABLED(CONFIG_HYPERV)
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
&& enable_ept) {
- vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
- vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
+ vt_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
+ vt_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
}
#endif
@@ -8615,7 +8503,7 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_apicv())
enable_apicv = 0;
if (!enable_apicv)
- vmx_x86_ops.sync_pir_to_irr = NULL;
+ vt_x86_ops.sync_pir_to_irr = NULL;
if (!enable_apicv || !cpu_has_vmx_ipiv())
enable_ipiv = false;
@@ -8651,7 +8539,7 @@ static __init int hardware_setup(void)
enable_pml = 0;
if (!enable_pml)
- vmx_x86_ops.cpu_dirty_log_size = 0;
+ vt_x86_ops.cpu_dirty_log_size = 0;
if (!cpu_has_vmx_preemption_timer())
enable_preemption_timer = false;
@@ -8676,8 +8564,8 @@ static __init int hardware_setup(void)
}
if (!enable_preemption_timer) {
- vmx_x86_ops.set_hv_timer = NULL;
- vmx_x86_ops.cancel_hv_timer = NULL;
+ vt_x86_ops.set_hv_timer = NULL;
+ vt_x86_ops.cancel_hv_timer = NULL;
}
kvm_caps.supported_mce_cap |= MCG_LMCE_P;
@@ -8688,9 +8576,9 @@ static __init int hardware_setup(void)
if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
if (pt_mode == PT_MODE_HOST_GUEST)
- vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ vt_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
else
- vmx_init_ops.handle_intel_pt_intr = NULL;
+ vt_init_ops.handle_intel_pt_intr = NULL;
setup_default_sgx_lepubkeyhash();
@@ -8713,14 +8601,6 @@ static __init int hardware_setup(void)
return r;
}
-static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- .hardware_setup = hardware_setup,
- .handle_intel_pt_intr = NULL,
-
- .runtime_ops = &vmx_x86_ops,
- .pmu_ops = &intel_pmu_ops,
-};
-
static void vmx_cleanup_l1d_flush(void)
{
if (vmx_l1d_flush_pages) {
@@ -8762,7 +8642,7 @@ static int __init vmx_init(void)
*/
hv_init_evmcs();
- r = kvm_x86_vendor_init(&vmx_init_ops);
+ r = kvm_x86_vendor_init(&vt_init_ops);
if (r)
return r;
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 90f9e4434646..e4a605c5808e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -365,6 +365,9 @@ struct vcpu_vmx {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
} shadow_msr_intercept;
+
+ /* ve_info must be page aligned. */
+ struct vmx_ve_information *ve_info;
};
struct kvm_vmx {
@@ -577,7 +580,8 @@ static inline u8 vmx_get_rvi(void)
SECONDARY_EXEC_ENABLE_VMFUNC | \
SECONDARY_EXEC_BUS_LOCK_DETECTION | \
SECONDARY_EXEC_NOTIFY_VM_EXITING | \
- SECONDARY_EXEC_ENCLS_EXITING)
+ SECONDARY_EXEC_ENCLS_EXITING | \
+ SECONDARY_EXEC_EPT_VIOLATION_VE)
#define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
new file mode 100644
index 000000000000..502704596c83
--- /dev/null
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_X86_OPS_H
+#define __KVM_X86_VMX_X86_OPS_H
+
+#include <linux/kvm_host.h>
+
+#include "x86.h"
+
+__init int vmx_hardware_setup(void);
+
+extern struct kvm_x86_ops vt_x86_ops __initdata;
+extern struct kvm_x86_init_ops vt_init_ops __initdata;
+
+void vmx_hardware_unsetup(void);
+int vmx_check_processor_compat(void);
+int vmx_hardware_enable(void);
+void vmx_hardware_disable(void);
+int vmx_vm_init(struct kvm *kvm);
+void vmx_vm_destroy(struct kvm *kvm);
+int vmx_vcpu_precreate(struct kvm *kvm);
+int vmx_vcpu_create(struct kvm_vcpu *vcpu);
+int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu);
+fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit);
+void vmx_vcpu_free(struct kvm_vcpu *vcpu);
+void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_put(struct kvm_vcpu *vcpu);
+int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath);
+void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu);
+int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu);
+int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+#ifdef CONFIG_KVM_SMM
+int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram);
+int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
+void vmx_enable_smi_window(struct kvm_vcpu *vcpu);
+#endif
+int vmx_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
+int vmx_check_intercept(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info,
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception);
+bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu);
+void vmx_migrate_timers(struct kvm_vcpu *vcpu);
+void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu);
+bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
+void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
+void vmx_hwapic_isr_update(int max_isr);
+bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu);
+void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+ int trig_mode, int vector);
+void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
+bool vmx_has_emulated_msr(struct kvm *kvm, u32 index);
+void vmx_msr_filter_changed(struct kvm_vcpu *vcpu);
+void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
+void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
+int vmx_get_msr_feature(struct kvm_msr_entry *msr);
+int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
+void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+int vmx_get_cpl(struct kvm_vcpu *vcpu);
+void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val);
+void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu);
+void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg);
+unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
+void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+bool vmx_get_if_flag(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_all(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_current(struct kvm_vcpu *vcpu);
+void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr);
+void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu);
+void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
+u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
+void vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall);
+void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected);
+void vmx_inject_nmi(struct kvm_vcpu *vcpu);
+void vmx_inject_exception(struct kvm_vcpu *vcpu);
+void vmx_cancel_injection(struct kvm_vcpu *vcpu);
+int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection);
+bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+void vmx_enable_nmi_window(struct kvm_vcpu *vcpu);
+void vmx_enable_irq_window(struct kvm_vcpu *vcpu);
+void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr);
+void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu);
+void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
+void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
+int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr);
+u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+ u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code);
+u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu);
+u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu);
+void vmx_write_tsc_offset(struct kvm_vcpu *vcpu);
+void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu);
+void vmx_request_immediate_exit(struct kvm_vcpu *vcpu);
+void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu);
+void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu);
+#ifdef CONFIG_X86_64
+int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
+ bool *expired);
+void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
+#endif
+void vmx_setup_mce(struct kvm_vcpu *vcpu);
+
+#endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 91478b769af0..fda22b3800a1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -92,9 +92,12 @@
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
-struct kvm_caps kvm_caps __read_mostly = {
- .supported_mce_cap = MCG_CTL_P | MCG_SER_P,
-};
+/*
+ * Note, kvm_caps fields should *never* have default values, all fields must be
+ * recomputed from scratch during vendor module load, e.g. to account for a
+ * vendor module being reloaded with different module parameters.
+ */
+struct kvm_caps kvm_caps __read_mostly;
EXPORT_SYMBOL_GPL(kvm_caps);
#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))
@@ -4629,9 +4632,7 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
static bool kvm_is_vm_type_supported(unsigned long type)
{
- return type == KVM_X86_DEFAULT_VM ||
- (type == KVM_X86_SW_PROTECTED_VM &&
- IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled);
+ return type < 32 && (kvm_caps.supported_vm_types & BIT(type));
}
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
@@ -4832,9 +4833,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = kvm_caps.has_notify_vmexit;
break;
case KVM_CAP_VM_TYPES:
- r = BIT(KVM_X86_DEFAULT_VM);
- if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM))
- r |= BIT(KVM_X86_SW_PROTECTED_VM);
+ r = kvm_caps.supported_vm_types;
break;
default:
break;
@@ -4842,46 +4841,44 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
return r;
}
-static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
-{
- void __user *uaddr = (void __user*)(unsigned long)attr->addr;
-
- if ((u64)(unsigned long)uaddr != attr->addr)
- return ERR_PTR_USR(-EFAULT);
- return uaddr;
-}
-
-static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+static int __kvm_x86_dev_get_attr(struct kvm_device_attr *attr, u64 *val)
{
- u64 __user *uaddr = kvm_get_attr_addr(attr);
-
- if (attr->group)
+ if (attr->group) {
+ if (kvm_x86_ops.dev_get_attr)
+ return static_call(kvm_x86_dev_get_attr)(attr->group, attr->attr, val);
return -ENXIO;
-
- if (IS_ERR(uaddr))
- return PTR_ERR(uaddr);
+ }
switch (attr->attr) {
case KVM_X86_XCOMP_GUEST_SUPP:
- if (put_user(kvm_caps.supported_xcr0, uaddr))
- return -EFAULT;
+ *val = kvm_caps.supported_xcr0;
return 0;
default:
return -ENXIO;
}
}
+static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = u64_to_user_ptr(attr->addr);
+ int r;
+ u64 val;
+
+ r = __kvm_x86_dev_get_attr(attr, &val);
+ if (r < 0)
+ return r;
+
+ if (put_user(val, uaddr))
+ return -EFAULT;
+
+ return 0;
+}
+
static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
{
- if (attr->group)
- return -ENXIO;
+ u64 val;
- switch (attr->attr) {
- case KVM_X86_XCOMP_GUEST_SUPP:
- return 0;
- default:
- return -ENXIO;
- }
+ return __kvm_x86_dev_get_attr(attr, &val);
}
long kvm_arch_dev_ioctl(struct file *filp,
@@ -5557,11 +5554,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
return 0;
}
-static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
- struct kvm_debugregs *dbgregs)
+static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+ struct kvm_debugregs *dbgregs)
{
unsigned int i;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
memset(dbgregs, 0, sizeof(*dbgregs));
BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db));
@@ -5570,6 +5571,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
dbgregs->dr6 = vcpu->arch.dr6;
dbgregs->dr7 = vcpu->arch.dr7;
+ return 0;
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -5577,6 +5579,10 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
{
unsigned int i;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
if (dbgregs->flags)
return -EINVAL;
@@ -5597,8 +5603,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
}
-static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
- u8 *state, unsigned int size)
+static int kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
+ u8 *state, unsigned int size)
{
/*
* Only copy state for features that are enabled for the guest. The
@@ -5616,24 +5622,25 @@ static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
XFEATURE_MASK_FPSSE;
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
supported_xcr0, vcpu->arch.pkru);
+ return 0;
}
-static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
- struct kvm_xsave *guest_xsave)
+static int kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
{
- kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
- sizeof(guest_xsave->region));
+ return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
+ sizeof(guest_xsave->region));
}
static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
guest_xsave->region,
@@ -5641,18 +5648,23 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
&vcpu->arch.pkru);
}
-static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
- struct kvm_xcrs *guest_xcrs)
+static int kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
guest_xcrs->nr_xcrs = 0;
- return;
+ return 0;
}
guest_xcrs->nr_xcrs = 1;
guest_xcrs->flags = 0;
guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+ return 0;
}
static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
@@ -5660,6 +5672,10 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
{
int i, r = 0;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return -EINVAL;
@@ -5712,12 +5728,9 @@ static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
- u64 __user *uaddr = kvm_get_attr_addr(attr);
+ u64 __user *uaddr = u64_to_user_ptr(attr->addr);
int r;
- if (IS_ERR(uaddr))
- return PTR_ERR(uaddr);
-
switch (attr->attr) {
case KVM_VCPU_TSC_OFFSET:
r = -EFAULT;
@@ -5735,13 +5748,10 @@ static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
- u64 __user *uaddr = kvm_get_attr_addr(attr);
+ u64 __user *uaddr = u64_to_user_ptr(attr->addr);
struct kvm *kvm = vcpu->kvm;
int r;
- if (IS_ERR(uaddr))
- return PTR_ERR(uaddr);
-
switch (attr->attr) {
case KVM_VCPU_TSC_OFFSET: {
u64 offset, tsc, ns;
@@ -6048,7 +6058,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_GET_DEBUGREGS: {
struct kvm_debugregs dbgregs;
- kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+ r = kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+ if (r < 0)
+ break;
r = -EFAULT;
if (copy_to_user(argp, &dbgregs,
@@ -6078,7 +6090,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!u.xsave)
break;
- kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+ r = kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+ if (r < 0)
+ break;
r = -EFAULT;
if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
@@ -6107,7 +6121,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!u.xsave)
break;
- kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+ r = kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
+ if (r < 0)
+ break;
r = -EFAULT;
if (copy_to_user(argp, u.xsave, size))
@@ -6123,7 +6139,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!u.xcrs)
break;
- kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+ r = kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+ if (r < 0)
+ break;
r = -EFAULT;
if (copy_to_user(argp, u.xcrs,
@@ -6267,6 +6285,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
#endif
case KVM_GET_SREGS2: {
+ r = -EINVAL;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ goto out;
+
u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
r = -ENOMEM;
if (!u.sregs2)
@@ -6279,6 +6302,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_SREGS2: {
+ r = -EINVAL;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ goto out;
+
u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
if (IS_ERR(u.sregs2)) {
r = PTR_ERR(u.sregs2);
@@ -9732,6 +9760,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
return -EIO;
}
+ memset(&kvm_caps, 0, sizeof(kvm_caps));
+
x86_emulator_cache = kvm_alloc_emulator_cache();
if (!x86_emulator_cache) {
pr_err("failed to allocate cache for x86 emulator\n");
@@ -9750,6 +9780,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
if (r)
goto out_free_percpu;
+ kvm_caps.supported_vm_types = BIT(KVM_X86_DEFAULT_VM);
+ kvm_caps.supported_mce_cap = MCG_CTL_P | MCG_SER_P;
+
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
@@ -9795,6 +9828,9 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+ if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled)
+ kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM);
+
if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
kvm_caps.supported_xss = 0;
@@ -10051,26 +10087,15 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
-int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+ unsigned long a0, unsigned long a1,
+ unsigned long a2, unsigned long a3,
+ int op_64_bit, int cpl)
{
- unsigned long nr, a0, a1, a2, a3, ret;
- int op_64_bit;
-
- if (kvm_xen_hypercall_enabled(vcpu->kvm))
- return kvm_xen_hypercall(vcpu);
-
- if (kvm_hv_hypercall_enabled(vcpu))
- return kvm_hv_hypercall(vcpu);
-
- nr = kvm_rax_read(vcpu);
- a0 = kvm_rbx_read(vcpu);
- a1 = kvm_rcx_read(vcpu);
- a2 = kvm_rdx_read(vcpu);
- a3 = kvm_rsi_read(vcpu);
+ unsigned long ret;
trace_kvm_hypercall(nr, a0, a1, a2, a3);
- op_64_bit = is_64_bit_hypercall(vcpu);
if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
@@ -10079,7 +10104,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a3 &= 0xFFFFFFFF;
}
- if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
+ if (cpl) {
ret = -KVM_EPERM;
goto out;
}
@@ -10140,18 +10165,49 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+ /* stat is incremented on completion. */
return 0;
}
default:
ret = -KVM_ENOSYS;
break;
}
+
out:
+ ++vcpu->stat.hypercalls;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
+
+int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
+{
+ unsigned long nr, a0, a1, a2, a3, ret;
+ int op_64_bit;
+ int cpl;
+
+ if (kvm_xen_hypercall_enabled(vcpu->kvm))
+ return kvm_xen_hypercall(vcpu);
+
+ if (kvm_hv_hypercall_enabled(vcpu))
+ return kvm_hv_hypercall(vcpu);
+
+ nr = kvm_rax_read(vcpu);
+ a0 = kvm_rbx_read(vcpu);
+ a1 = kvm_rcx_read(vcpu);
+ a2 = kvm_rdx_read(vcpu);
+ a3 = kvm_rsi_read(vcpu);
+ op_64_bit = is_64_bit_hypercall(vcpu);
+ cpl = static_call(kvm_x86_get_cpl)(vcpu);
+
+ ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
+ if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
+ /* MAP_GPA tosses the request to the user space. */
+ return 0;
+
if (!op_64_bit)
ret = (u32)ret;
kvm_rax_write(vcpu, ret);
- ++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
@@ -11486,6 +11542,10 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
__get_regs(vcpu, regs);
vcpu_put(vcpu);
@@ -11527,6 +11587,10 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
__set_regs(vcpu, regs);
vcpu_put(vcpu);
@@ -11599,6 +11663,10 @@ static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
__get_sregs(vcpu, sregs);
vcpu_put(vcpu);
@@ -11866,6 +11934,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
{
int ret;
+ if (vcpu->kvm->arch.has_protected_state &&
+ vcpu->arch.guest_state_protected)
+ return -EINVAL;
+
vcpu_load(vcpu);
ret = __set_sregs(vcpu, sregs);
vcpu_put(vcpu);
@@ -11983,7 +12055,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
struct fxregs_state *fxsave;
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
vcpu_load(vcpu);
@@ -12006,7 +12078,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
struct fxregs_state *fxsave;
if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
- return 0;
+ return vcpu->kvm->arch.has_protected_state ? -EINVAL : 0;
vcpu_load(vcpu);
@@ -12532,6 +12604,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return -EINVAL;
kvm->arch.vm_type = type;
+ kvm->arch.has_private_mem =
+ (type == KVM_X86_SW_PROTECTED_VM);
ret = kvm_page_track_init(kvm);
if (ret)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a8b71803777b..d80a4c6b5a38 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -24,6 +24,8 @@ struct kvm_caps {
bool has_bus_lock_exit;
/* notify VM exit supported? */
bool has_notify_vmexit;
+ /* bit mask of VM types */
+ u32 supported_vm_types;
u64 supported_mce_cap;
u64 supported_xcr0;